Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 90 additions & 0 deletions bench/persistent_runtime.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
"""Benchmark the persistent runtime + warm scoring against the per-rollout default.

Isolates the runtime/scoring overhead (no model generation, which would mask it): runs gsm8k's
`verify.py` N times at a fixed concurrency, both ways, through the real framework API.

ephemeral: make_runtime + start() + run_uv_script(cold) + stop() per call
— exactly today's per-rollout scoring path.
persistent: RuntimePool.acquire() + run_uv_script(warm=True) + release() per call
— runtimes (and their warm workers) reused; the math_verify import is paid once
per pooled runtime, not per call.

Usage: uv run python bench/persistent_runtime.py [N=1000] [CONCURRENCY=128]
"""

import asyncio
import sys
import time
from pathlib import Path

from verifiers.v1.runtimes import RuntimePool, SubprocessConfig, make_runtime

# Raw bytes of gsm8k's verify.py, read once at import time; this is the script body
# handed to run_uv_script by both benchmark modes.
VERIFY = (
    Path(__file__).resolve().parents[1] / "environments/gsm8k_v1/gsm8k_v1/verify.py"
).read_bytes()
# Fixed gold answer / prediction pair passed as script args on every call; both modes
# assert that scoring this pair yields "1.0".
GOLD, PRED = "42", "#### 42"


async def ephemeral(n: int, concurrency: int) -> float:
    """Time ``n`` cold scoring calls, each on its own freshly started runtime.

    Every call builds a runtime with ``make_runtime``, starts it, runs ``VERIFY``
    through ``run_uv_script`` and stops the runtime again. A semaphore keeps at most
    ``concurrency`` calls in flight at once.

    Args:
        n: Number of scoring calls to issue.
        concurrency: Upper bound on simultaneously running calls.

    Returns:
        Elapsed seconds for the whole batch.

    Raises:
        AssertionError: If a call's stripped stdout does not end with ``"1.0"``.
    """
    sem = asyncio.Semaphore(concurrency)

    async def one() -> None:
        async with sem:
            rt = make_runtime(SubprocessConfig())
            await rt.start()
            try:
                r = await rt.run_uv_script(VERIFY, args=[GOLD, PRED])
                assert r.stdout.strip().endswith("1.0"), r
            finally:
                # Always tear the runtime down, even when the script or the check fails.
                await rt.stop()

    # perf_counter() is monotonic and high-resolution; time.time() follows the wall
    # clock and can jump (NTP, manual adjustment), which would corrupt the measurement.
    t0 = time.perf_counter()
    await asyncio.gather(*(one() for _ in range(n)))
    return time.perf_counter() - t0


async def persistent(n: int, concurrency: int) -> float:
    """Time ``n`` warm scoring calls served by runtimes taken from a ``RuntimePool``.

    Each call acquires a runtime from the pool with a ``persistent=True`` subprocess
    config, runs ``VERIFY`` through ``run_uv_script(..., warm=True)`` and releases the
    runtime back to the pool. A semaphore keeps at most ``concurrency`` calls in flight.

    Args:
        n: Number of scoring calls to issue.
        concurrency: Upper bound on simultaneously running calls.

    Returns:
        Elapsed seconds for the batch. The clock starts after the pool context has been
        entered and is read before it exits, so entering and leaving the pool are not
        part of the measurement.

    Raises:
        AssertionError: If a call's stripped stdout is not exactly ``"1.0"``.
    """
    sem = asyncio.Semaphore(concurrency)
    cfg = SubprocessConfig(persistent=True)

    async def one(pool: RuntimePool) -> None:
        async with sem:
            rt = await pool.acquire(cfg)
            try:
                r = await rt.run_uv_script(VERIFY, args=[GOLD, PRED], warm=True)
                assert r.stdout.strip() == "1.0", r
            finally:
                # Hand the runtime back even when the script or the check fails.
                await pool.release(rt)

    async with RuntimePool() as pool:
        # perf_counter() is monotonic and high-resolution; time.time() follows the wall
        # clock and can jump (NTP, manual adjustment), which would corrupt the result.
        t0 = time.perf_counter()
        await asyncio.gather(*(one(pool) for _ in range(n)))
        return time.perf_counter() - t0


async def main(n: int, concurrency: int) -> None:
    """Run the ephemeral and persistent benchmarks back to back and print a comparison.

    Args:
        n: Number of scoring calls per mode; must be at least 1.
        concurrency: Maximum number of calls in flight per mode; must be at least 1.

    Raises:
        ValueError: If ``n`` or ``concurrency`` is less than 1.
    """
    # n == 0 would divide by zero in the ms/call figures below; concurrency == 0 would
    # build a semaphore that can never be acquired, so the run would hang forever.
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")
    if concurrency < 1:
        raise ValueError(f"concurrency must be >= 1, got {concurrency}")

    # warm the class-level interpreter cache once so neither mode pays uv-resolve
    rt = make_runtime(SubprocessConfig())
    await rt.start()
    try:
        await rt.run_uv_script(VERIFY, args=[GOLD, PRED])
    finally:
        # Stop the warm-up runtime even if the warm-up script itself fails.
        await rt.stop()

    ephemeral_dt = await ephemeral(n, concurrency)
    persistent_dt = await persistent(n, concurrency)
    # Guard the ratio against a zero-length persistent run on a coarse clock.
    speedup = ephemeral_dt / persistent_dt if persistent_dt > 0 else float("inf")

    print(f"gsm8k verify scoring — n={n} concurrency={concurrency}")
    print(
        f" ephemeral (runtime + import per call): {ephemeral_dt:7.2f}s ({1000 * ephemeral_dt / n:6.2f} ms/call)"
    )
    print(
        f" persistent (pool + warm worker): {persistent_dt:7.2f}s ({1000 * persistent_dt / n:6.2f} ms/call)"
    )
    print(
        f" speedup: {speedup:6.1f}x"
    )


if __name__ == "__main__":
    # Optional positional overrides: [N] [CONCURRENCY]; anything omitted keeps its default.
    n, concurrency = 1000, 128
    if len(sys.argv) > 1:
        n = int(sys.argv[1])
    if len(sys.argv) > 2:
        concurrency = int(sys.argv[2])
    asyncio.run(main(n, concurrency))
3 changes: 3 additions & 0 deletions environments/aime24_v1/aime24_v1/taskset.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,12 @@ async def correct(
prediction = (
trace.assistant_messages[-1].content if trace.assistant_messages else ""
)
# warm=True: on a persistent runtime, reuse a warm worker that imports math_verify once
# instead of execing a fresh interpreter per rollout (no-op on an ephemeral runtime).
result = await runtime.run_uv_script(
VERIFY,
args=[task.answer, prediction or "", str(self.config.math_verify_timeout)],
warm=True,
)
if result.exit_code != 0:
raise RuntimeError(f"verify.py failed: {result.stderr.strip()[-500:]}")
Expand Down
62 changes: 34 additions & 28 deletions environments/aime24_v1/aime24_v1/verify.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,22 @@
# /// script
# dependencies = ["math-verify"]
# ///
"""Score one math answer by math-verify equivalence of the model's boxed answer vs the
gold, run inside the rollout's runtime via `uv run`. uv installs `math-verify` into its
own cache here — the dependency never touches the eval process. Takes the gold answer
(argv[1]), the model's prediction (argv[2]), and a timeout in seconds (argv[3]); prints
1.0 if they're equivalent, else 0.0.
"""Score one math answer by math-verify equivalence of the model's boxed answer vs the gold,
run inside the rollout's runtime via `uv run` (or a warm worker). uv installs `math-verify` into
its own cache here — the dependency never touches the eval process. `main(argv)` takes the gold
answer (argv[0]), the model's prediction (argv[1]), and a timeout in seconds (argv[2]); returns
"1.0" if they're equivalent, else "0.0".

Exposing `main(argv) -> str` (plus the `__main__` footer) lets the runtime keep this as a warm
worker — `import math_verify` paid once, not per call (see `Runtime.run_uv_script(warm=True)`) —
while staying `uv run verify.py <gold> <pred> <timeout>`-able cold. `main` must `return` (never
`sys.exit`, which would kill a reused worker).
"""

import sys

from math_verify import parse, verify

gold, pred, timeout = sys.argv[1], sys.argv[2], int(sys.argv[3])

if "<think>" in pred and "</think>" not in pred:
print(0.0)
sys.exit(0)
pred = pred.split("</think>")[-1]


def extract_boxed(text: str) -> str:
"""Content of the last ``\\boxed{...}`` in ``text``, or "" if there is none."""
Expand All @@ -32,20 +30,28 @@ def extract_boxed(text: str) -> str:
return text[start + len("\\boxed{") : i - 1] if depth == 0 else ""


answer = extract_boxed(pred)
if not answer:
print(0.0)
sys.exit(0)
try:
score = (
1.0
if verify(
parse("\\boxed{" + gold + "}", parsing_timeout=timeout),
parse("\\boxed{" + answer + "}", parsing_timeout=timeout),
timeout_seconds=timeout,
def main(argv: list[str]) -> str:
gold, pred, timeout = argv[0], argv[1], int(argv[2])
if "<think>" in pred and "</think>" not in pred:
return "0.0"
pred = pred.split("</think>")[-1]
answer = extract_boxed(pred)
if not answer:
return "0.0"
try:
score = (
1.0
if verify(
parse("\\boxed{" + gold + "}", parsing_timeout=timeout),
parse("\\boxed{" + answer + "}", parsing_timeout=timeout),
timeout_seconds=timeout,
)
else 0.0
)
else 0.0
)
except Exception:
score = 0.0
print(score)
except Exception:
score = 0.0
return str(score)


if __name__ == "__main__":
print(main(sys.argv[1:]))
4 changes: 3 additions & 1 deletion environments/gsm8k_v1/gsm8k_v1/taskset.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,10 @@ async def correct(
prediction = (
trace.assistant_messages[-1].content if trace.assistant_messages else ""
)
# warm=True: on a persistent runtime, reuse a warm worker that imports math_verify once
# instead of execing a fresh interpreter per rollout (no-op on an ephemeral runtime).
result = await runtime.run_uv_script(
VERIFY, args=[task.answer, prediction or ""]
VERIFY, args=[task.answer, prediction or ""], warm=True
)
if result.exit_code != 0:
raise RuntimeError(f"verify.py failed: {result.stderr.strip()[-500:]}")
Expand Down
49 changes: 30 additions & 19 deletions environments/gsm8k_v1/gsm8k_v1/verify.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,42 @@
# /// script
# dependencies = ["math-verify"]
# ///
"""Score one GSM8K answer, run inside the rollout's runtime via `uv run`.
"""Score one GSM8K answer, run inside the rollout's runtime via `uv run` (or a warm worker).

uv installs `math-verify` into its own cache here — the dependency never touches
the eval process. Takes the gold answer (`argv[1]`) and the model's prediction
(`argv[2]`) and prints 1.0 if they match the same number, else 0.0.
uv installs `math-verify` into its own cache here — the dependency never touches the eval
process. `main(argv)` takes the gold answer (`argv[0]`) and the model's prediction (`argv[1]`)
and returns "1.0" if they match the same number, else "0.0".

The model is asked for its answer after '####'. Both the gold and the prediction
are wrapped in \\boxed{} before math-verify (matching the math-env scorer) so
parsing is robust; a malformed prediction fails to verify rather than crashing.
Exposing `main(argv) -> str` (plus the `__main__` footer) lets the runtime keep this as a warm
worker — `import math_verify` is paid once, not per call (see `Runtime.run_uv_script(warm=True)`)
— while staying `uv run verify.py <gold> <pred>`-able cold. The model is asked for its answer
after '####'. Both the gold and the prediction are wrapped in \\boxed{} before math-verify
(matching the math-env scorer) so parsing is robust; a malformed prediction fails to verify
rather than crashing.
"""

import re
import sys

from math_verify import parse, verify

gold, pred = sys.argv[1], sys.argv[2]
matches = re.findall(r"####\s*(.+)", pred)
prediction = matches[-1].strip() if matches else pred
try:
score = (
1.0
if verify(parse("\\boxed{" + gold + "}"), parse("\\boxed{" + prediction + "}"))
else 0.0
)
except Exception:
score = 0.0
print(score)

def main(argv: list[str]) -> str:
    """Score one prediction against the gold answer.

    ``argv[0]`` is the gold answer and ``argv[1]`` the model's prediction. The text after
    the last ``####`` marker in the prediction is used as the answer; without a marker the
    whole prediction is used. Both sides are wrapped in ``\\boxed{}`` before being parsed
    and compared.

    Returns:
        ``"1.0"`` when the comparison succeeds, otherwise ``"0.0"`` — including when
        parsing or comparing raises.
    """
    gold, pred = argv[0], argv[1]
    tagged = re.findall(r"####\s*(.+)", pred)
    if tagged:
        candidate = tagged[-1].strip()
    else:
        candidate = pred
    try:
        gold_expr = parse("\\boxed{" + gold + "}")
        candidate_expr = parse("\\boxed{" + candidate + "}")
        equivalent = verify(gold_expr, candidate_expr)
    except Exception:
        # Best effort: a prediction that cannot be parsed or compared scores zero.
        return "0.0"
    return "1.0" if equivalent else "0.0"


if __name__ == "__main__":
    # Cold entry point: drop the script name from argv and print the score main() returns.
    print(main(sys.argv[1:]))
3 changes: 3 additions & 0 deletions environments/math_env_v1/math_env_v1/taskset.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,12 @@ async def correct(
prediction = (
trace.assistant_messages[-1].content if trace.assistant_messages else ""
)
# warm=True: on a persistent runtime, reuse a warm worker that imports math_verify once
# instead of execing a fresh interpreter per rollout (no-op on an ephemeral runtime).
result = await runtime.run_uv_script(
VERIFY,
args=[task.answer, prediction or "", str(self.config.math_verify_timeout)],
warm=True,
)
if result.exit_code != 0:
raise RuntimeError(f"verify.py failed: {result.stderr.strip()[-500:]}")
Expand Down
62 changes: 34 additions & 28 deletions environments/math_env_v1/math_env_v1/verify.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,22 @@
# /// script
# dependencies = ["math-verify"]
# ///
"""Score one math answer by math-verify equivalence of the model's boxed answer vs the
gold, run inside the rollout's runtime via `uv run`. uv installs `math-verify` into its
own cache here — the dependency never touches the eval process. Takes the gold answer
(argv[1]), the model's prediction (argv[2]), and a timeout in seconds (argv[3]); prints
1.0 if they're equivalent, else 0.0.
"""Score one math answer by math-verify equivalence of the model's boxed answer vs the gold,
run inside the rollout's runtime via `uv run` (or a warm worker). uv installs `math-verify` into
its own cache here — the dependency never touches the eval process. `main(argv)` takes the gold
answer (argv[0]), the model's prediction (argv[1]), and a timeout in seconds (argv[2]); returns
"1.0" if they're equivalent, else "0.0".

Exposing `main(argv) -> str` (plus the `__main__` footer) lets the runtime keep this as a warm
worker — `import math_verify` paid once, not per call (see `Runtime.run_uv_script(warm=True)`) —
while staying `uv run verify.py <gold> <pred> <timeout>`-able cold. `main` must `return` (never
`sys.exit`, which would kill a reused worker).
"""

import sys

from math_verify import parse, verify

gold, pred, timeout = sys.argv[1], sys.argv[2], int(sys.argv[3])

if "<think>" in pred and "</think>" not in pred:
print(0.0)
sys.exit(0)
pred = pred.split("</think>")[-1]


def extract_boxed(text: str) -> str:
"""Content of the last ``\\boxed{...}`` in ``text``, or "" if there is none."""
Expand All @@ -32,20 +30,28 @@ def extract_boxed(text: str) -> str:
return text[start + len("\\boxed{") : i - 1] if depth == 0 else ""


answer = extract_boxed(pred)
if not answer:
print(0.0)
sys.exit(0)
try:
score = (
1.0
if verify(
parse("\\boxed{" + gold + "}", parsing_timeout=timeout),
parse("\\boxed{" + answer + "}", parsing_timeout=timeout),
timeout_seconds=timeout,
def main(argv: list[str]) -> str:
gold, pred, timeout = argv[0], argv[1], int(argv[2])
if "<think>" in pred and "</think>" not in pred:
return "0.0"
pred = pred.split("</think>")[-1]
answer = extract_boxed(pred)
if not answer:
return "0.0"
try:
score = (
1.0
if verify(
parse("\\boxed{" + gold + "}", parsing_timeout=timeout),
parse("\\boxed{" + answer + "}", parsing_timeout=timeout),
timeout_seconds=timeout,
)
else 0.0
)
else 0.0
)
except Exception:
score = 0.0
print(score)
except Exception:
score = 0.0
return str(score)


if __name__ == "__main__":
print(main(sys.argv[1:]))
21 changes: 20 additions & 1 deletion verifiers/v1/GUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,25 @@ async def verified(self, task, trace, runtime) -> float:
return float(r.stdout.strip() == "1.0")
```

By default each `run_uv_script` execs a fresh interpreter, so a heavy import (`math-verify` pulls
in sympy, ~0.2s) is paid every rollout. Pass `warm=True` to instead route to a long-lived worker
that imports once and answers many calls — provided the script exposes `main(argv) -> str` (its
result is the call's stdout) and stays `uv run`-able cold via an
`if __name__ == "__main__": print(main(sys.argv[1:]))` footer:

```python
r = await runtime.run_uv_script(VERIFY, args=[...], warm=True)
```

The worker lives as long as the runtime, so warm only pays off on a **persistent** runtime
(`--harness.runtime.persistent true`): runtimes are then taken from an eval/train-level pool and
reused across rollouts (workspace `reset` between uses, torn down only at the end), so the import
is paid once per pooled runtime instead of per rollout — ~12x faster gsm8k scoring at 1000
rollouts / 128 concurrency, more as the run gets longer (`bench/persistent_runtime.py`). On an
ephemeral runtime `warm=True` is a no-op (runs cold). Warm workers are currently available on the
subprocess runtime; persistence itself works on every runtime (and on remote runtimes it also skips
the per-rollout sandbox+tunnel provisioning).

## Stop conditions

A rollout ends when the harness finishes, a framework budget trips (`--max-turns`, token caps), or
Expand Down Expand Up @@ -265,7 +284,7 @@ isolated environment. On a `runtime` you can call:
| method | what |
| --- | --- |
| `run(argv, env)` | exec a command to completion → `ProgramResult(exit_code, stdout, stderr)` |
| `run_uv_script(src, args, env)` | run a PEP 723 script (inline deps resolve in-runtime); `args` are shell-`"$@"`-safe |
| `run_uv_script(src, args, env, warm=False)` | run a PEP 723 script (inline deps resolve in-runtime); `args` are shell-`"$@"`-safe. `warm=True` reuses a long-lived worker (import once) on a persistent runtime — see [In-runtime scoring](#in-runtime-scoring) |
| `run_background(argv, env, log)` | start a long-lived process (e.g. a colocated server) |
| `read(path)` / `write(path, data)` | workspace files (bytes), across the container/sandbox boundary |
| `expose(port)` | publish a port *inside* the runtime to a host-reachable URL (`None` when local) |
Expand Down
Loading
Loading