From 0631af56b77f6a2e4a715b78e8224a6ba8b2ee66 Mon Sep 17 00:00:00 2001 From: Mika Senghaas Date: Fri, 19 Jun 2026 04:36:36 +0000 Subject: [PATCH 1/7] feat(v1): add mini-browse-apps-platform-v1 environment Sandboxed local-app browser-agent environment ported to the current v1 API. Tasks are pulled dynamically from the Prime hub and cached locally. The browser agent is proprietary and fetched at run time from a private repo (not vendored); the harness stages it into the sandbox. Co-packages the taskset and harness. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../mini_browse_apps_platform_v1/README.md | 54 ++ .../mini_browse_apps_platform_v1/__init__.py | 16 + .../harness/.gitignore | 2 + .../harness/__init__.py | 262 ++++++++ .../harness/contract.py | 28 + .../harness/diagnostics.py | 51 ++ .../harness/program.py | 560 ++++++++++++++++++ .../mini_browse_apps_platform_v1/judge.py | 150 +++++ .../mini_browse_apps_platform_v1/taskset.py | 533 +++++++++++++++++ .../pyproject.toml | 13 + pyproject.toml | 2 + uv.lock | 13 + 12 files changed, 1684 insertions(+) create mode 100644 environments/mini_browse_apps_platform_v1/README.md create mode 100644 environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/__init__.py create mode 100644 environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/.gitignore create mode 100644 environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py create mode 100644 environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/contract.py create mode 100644 environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/diagnostics.py create mode 100644 environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/program.py create mode 100644 environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py create mode 100644 environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/taskset.py 
create mode 100644 environments/mini_browse_apps_platform_v1/pyproject.toml diff --git a/environments/mini_browse_apps_platform_v1/README.md b/environments/mini_browse_apps_platform_v1/README.md new file mode 100644 index 000000000..b769f268f --- /dev/null +++ b/environments/mini_browse_apps_platform_v1/README.md @@ -0,0 +1,54 @@ +# mini-browse-apps-platform-v1 + +Sandboxed local-app **browser-agent** environment. Each task boots a local single-page web app +(SPA server + headless-Chromium CDP service) inside a per-task Docker image; a browser agent drives +it by **screenshots → vision model → click/type actions**, then submits a structured JSON result. An +LLM judge scores the submission against a deterministic answer key. + +The model must be **multimodal** (the agent's only input is screenshots). + +## Proprietary agent (fetched at run time) + +The browser agent is **proprietary and not vendored in this repo**. The harness fetches it at run +time from a **private GitHub repo** (pinned to a commit), caches it under +`~/.cache/verifiers/browse-agent/<agent_ref>/`, then stages it into the sandbox. Configure via +`--harness.*`: + +| Field | Default | Meaning | +| --- | --- | --- | +| `agent_repo` | `PrimeIntellect-ai/plex-mini-browse` | Private `owner/name` to fetch the agent from. | +| `agent_ref` | _(unset)_ | **Pinned commit sha to fetch (required unless `agent_path` is set).** | +| `agent_token_env` | `MINI_BROWSE_GITHUB_TOKEN` | Env var holding a GitHub token with read access to `agent_repo`. | +| `agent_path` | _(unset)_ | Local dir containing `<agent_package>/` — skips the fetch (development). | + +So set `export MINI_BROWSE_GITHUB_TOKEN=<token>` and `--harness.agent-ref <commit-sha>`, or point +`--harness.agent-path` at a local checkout. + +## Tasks (pulled dynamically) + +Tasks are **pulled from the Prime hub and cached locally** — nothing is bundled in this package. 
+`load_tasks` pulls the dataset from `prime/mini-browse-apps-platform-v1` (private; via `prime env +pull`) into `~/.cache/verifiers/mini-browse-apps/<version>/`. Override with `--taskset.dataset_path +<path>` or repoint `--taskset.hub_env_id` / `--taskset.hub_version`. + +## Run + +The taskset and harness are co-packaged (resolved via `__all__`), so `--harness.id` matches the +taskset id. The task image is a Prime-registry image, so use the `prime` runtime: + +```bash +export MINI_BROWSE_GITHUB_TOKEN=<token> +uv run eval mini-browse-apps-platform-v1 \ + --harness.id mini-browse-apps-platform-v1 \ + --harness.runtime.type prime \ + --harness.agent-ref <commit-sha> \ + -m <model> \ + -n 1 -r 1 -c 1 +``` + +## Reward & metrics + +`answer_key` (weight 1.0) judges the submitted result against the gold answer key (`judge_model`, +default `google/gemini-3.1-pro-preview` via pinference); reward 1.0 == judge verdict "yes". Metrics: +`result_present`, `submitted_result_present`, `agent_error`, `transcript_image_count`, +`message_count`. diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/__init__.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/__init__.py new file mode 100644 index 000000000..9a4e10f35 --- /dev/null +++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/__init__.py @@ -0,0 +1,16 @@ +"""mini-browse-apps-platform-v1 — sandboxed local-app Mini Browse browser tasks (v1). + +Co-packages the taskset and its browser harness; both are resolved by id from this module's +`__all__` (`--taskset.id` / `--harness.id mini-browse-apps-platform-v1`). 
+""" + +from .harness import MiniBrowseHarness, MiniBrowseHarnessConfig +from .taskset import MiniBrowseAppsConfig, MiniBrowseAppsTaskset, MiniBrowseAppTask + +__all__ = [ + "MiniBrowseAppsTaskset", + "MiniBrowseAppsConfig", + "MiniBrowseAppTask", + "MiniBrowseHarness", + "MiniBrowseHarnessConfig", +] diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/.gitignore b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/.gitignore new file mode 100644 index 000000000..887009b99 --- /dev/null +++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/.gitignore @@ -0,0 +1,2 @@ +# The browser agent is proprietary and fetched at run time from a private repo — never commit it. +vendor/ diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py new file mode 100644 index 000000000..8ded780df --- /dev/null +++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py @@ -0,0 +1,262 @@ +"""Browser-app harness: stages a privately-distributed browser agent into the sandbox. + +The browser agent is proprietary and is NOT vendored in this repo. It is fetched at run time +from a private, auth-gated GitHub repo (pinned to a commit), cached locally, then tarred and +staged into the sandbox, where `program.py` (a uv script) imports and runs it. For local +development, point `agent_path` at a checkout instead of fetching. 
+""" + +from __future__ import annotations + +import io +import os +import shlex +import shutil +import tarfile +import tempfile +from pathlib import Path +from typing import Literal + +import httpx +from verifiers.v1.clients import RolloutContext +from verifiers.v1.errors import HarnessError +from verifiers.v1.harness import Harness, HarnessConfig +from verifiers.v1.runtimes import DockerConfig, ProgramResult, Runtime, RuntimeConfig +from verifiers.v1.trace import Trace + +from .contract import ( + METRICS_PATH, + MiniBrowseTaskPayload, + PROGRESS_PATH, + RESULT_PATH, + TASK_PAYLOAD_PATH, + TRANSCRIPT_PATH, + WORKSPACE_ROOT, +) +from .diagnostics import read_jsonl_tail + +PROGRAM_SOURCE = (Path(__file__).resolve().parent / "program.py").read_text() + +AGENT_RUNTIME = "/opt/browse-agent-runtime" +AGENT_TARBALL = "/tmp/vf-browse-agent-runtime.tgz" + +CoordinateMode = Literal["relative_1000", "absolute", "auto"] + + +class MiniBrowseHarnessConfig(HarnessConfig): + """Reusable browser harness; fetches its proprietary agent from a private repo.""" + + id: str = "mini-browse-apps-platform-v1" + runtime: RuntimeConfig = DockerConfig(image="python:3.12-slim-bookworm") + # --- proprietary agent source (not vendored; fetched at run time) --- + agent_repo: str = "PrimeIntellect-ai/plex-mini-browse" + """Private GitHub repo (owner/name) the agent is fetched from.""" + agent_ref: str = "" + """Pinned commit sha to fetch (required unless `agent_path` is set).""" + agent_package: str = "mini_browse" + """Importable package dir within the repo (and on the sandbox PYTHONPATH) to stage.""" + agent_token_env: str = "MINI_BROWSE_GITHUB_TOKEN" + """Env var holding a GitHub token with read access to `agent_repo`.""" + agent_path: str | None = None + """Local dir containing `/` — when set, skips the GitHub fetch (dev).""" + agent_cache_dir: str | None = None + """Where fetched agent revisions are cached (default: ~/.cache/verifiers/browse-agent).""" + # --- agent behavior --- + 
max_steps: int = 75 + coordinate_mode: CoordinateMode = "relative_1000" + keep_last_images: int = 3 + image_compaction_at_tokens: int = 45_000 + include_builtin_tools: bool = False + browser_start_min_interval_seconds: float = 0.0 + browser_start_jitter_seconds: float = 0.0 + browser_start_max_in_flight: int = 0 + record_frames: bool = False + task_payload_path: str = TASK_PAYLOAD_PATH + result_path: str = RESULT_PATH + transcript_path: str = TRANSCRIPT_PATH + metrics_path: str = METRICS_PATH + progress_path: str = PROGRESS_PATH + workspace_root: str = WORKSPACE_ROOT + + +class MiniBrowseHarness(Harness[MiniBrowseHarnessConfig]): + """Stages the privately-fetched browser agent and executes its agent loop.""" + + SUPPORTS_TASK_TOOLS = False + SUPPORTS_MESSAGE_PROMPT = False + + async def launch( + self, + ctx: RolloutContext, + trace: Trace, + runtime: Runtime, + endpoint: str, + secret: str, + mcp_urls: dict[str, str], + ) -> ProgramResult: + if mcp_urls: + names = ", ".join(sorted(mcp_urls)) + raise ValueError( + f"Browser harness does not expose v1 MCP task tools: {names}" + ) + if trace.task.system_prompt: + raise ValueError( + "Browser harness owns the system prompt; put task-specific instructions " + "in task.prompt or the task payload." 
+ ) + if not isinstance(trace.task.prompt, str): + raise ValueError("Browser harness requires a string task prompt") + + await self._stage_agent(runtime) + env = { + **self.config.env, + "OPENAI_BASE_URL": endpoint, + "OPENAI_API_KEY": secret, + "OPENAI_MODEL": ctx.model, + "PYTHONPATH": self._pythonpath(), + "MINI_BROWSE_COORDINATE_MODE": self.config.coordinate_mode, + "MINI_BROWSE_KEEP_LAST_IMAGES": str(self.config.keep_last_images), + "MINI_BROWSE_IMAGE_COMPACTION_AT_TOKENS": str( + self.config.image_compaction_at_tokens + ), + "MINI_BROWSE_INCLUDE_BUILTIN_TOOLS": ( + "1" if self.config.include_builtin_tools else "0" + ), + "MINI_BROWSE_BROWSER_START_MIN_INTERVAL_SECONDS": str( + self.config.browser_start_min_interval_seconds + ), + "MINI_BROWSE_BROWSER_START_JITTER_SECONDS": str( + self.config.browser_start_jitter_seconds + ), + "MINI_BROWSE_BROWSER_START_MAX_IN_FLIGHT": str( + self.config.browser_start_max_in_flight + ), + "MINI_BROWSE_PROGRESS_PATH": self.config.progress_path, + } + if self.config.record_frames: + env["MINI_BROWSE_RECORD_FRAMES_DIR"] = "/logs/mini_browse/frames" + + args = [ + "--task", + self.config.task_payload_path, + "--result", + self.config.result_path, + "--transcript", + self.config.transcript_path, + "--metrics", + self.config.metrics_path, + "--progress", + self.config.progress_path, + "--max-steps", + str(self.config.max_steps), + "--workspace-root", + self.config.workspace_root, + ] + return await runtime.run_uv_script(PROGRAM_SOURCE, args=args, env=env) + + async def _stage_agent(self, runtime: Runtime) -> None: + await runtime.write(AGENT_TARBALL, self._agent_tarball()) + command = ( + f"rm -rf {shlex.quote(AGENT_RUNTIME)} && " + f"mkdir -p {shlex.quote(AGENT_RUNTIME)} && " + f"tar -xzf {shlex.quote(AGENT_TARBALL)} -C {shlex.quote(AGENT_RUNTIME)}" + ) + result = await runtime.run(["sh", "-c", command], {}) + if result.exit_code != 0: + raise HarnessError( + f"agent staging failed: {result.stderr.strip()[-500:]}" + ) + + def 
_agent_tarball(self) -> bytes: + package = self._ensure_agent() / self.config.agent_package + if not package.is_dir(): + raise HarnessError( + f"agent package {self.config.agent_package!r} not found under {package.parent}" + ) + buffer = io.BytesIO() + with tarfile.open(fileobj=buffer, mode="w:gz") as archive: + archive.add(package, arcname=self.config.agent_package) + return buffer.getvalue() + + def _ensure_agent(self) -> Path: + """Return a dir that contains `/` — a local checkout or the fetch cache.""" + if self.config.agent_path: + return Path(self.config.agent_path).expanduser() + if not self.config.agent_ref: + raise HarnessError( + "set --harness.agent-ref to a pinned commit sha " + "(or --harness.agent-path to a local checkout for development)" + ) + cache_root = ( + Path(self.config.agent_cache_dir).expanduser() + if self.config.agent_cache_dir + else Path.home() / ".cache" / "verifiers" / "browse-agent" + ) + dest = cache_root / self.config.agent_ref + if not (dest / self.config.agent_package).exists(): + self._download_agent(dest) + return dest + + def _download_agent(self, dest: Path) -> None: + token = os.environ.get(self.config.agent_token_env) + if not token: + raise HarnessError( + f"missing ${self.config.agent_token_env} to fetch the private agent repo " + f"{self.config.agent_repo!r}" + ) + url = f"https://api.github.com/repos/{self.config.agent_repo}/tarball/{self.config.agent_ref}" + headers = { + "Authorization": f"Bearer {token}", + "Accept": "application/vnd.github+json", + "X-GitHub-Api-Version": "2022-11-28", + } + with tempfile.TemporaryDirectory(prefix="browse-agent-") as tmp: + archive = Path(tmp) / "agent.tar.gz" + # httpx drops the Authorization header on the cross-host redirect to codeload. 
+ with httpx.stream( + "GET", url, headers=headers, follow_redirects=True, timeout=120 + ) as resp: + if resp.status_code != 200: + resp.read() + raise HarnessError( + f"fetching {self.config.agent_repo}@{self.config.agent_ref} failed: " + f"HTTP {resp.status_code}" + ) + with open(archive, "wb") as handle: + for chunk in resp.iter_bytes(): + handle.write(chunk) + extract = Path(tmp) / "extract" + extract.mkdir() + with tarfile.open(archive) as tar: + tar.extractall(extract, filter="data") + matches = sorted(extract.glob(f"*/{self.config.agent_package}")) + if not matches: + raise HarnessError( + f"{self.config.agent_package!r} not found in " + f"{self.config.agent_repo}@{self.config.agent_ref}" + ) + dest.mkdir(parents=True, exist_ok=True) + staging = dest / (self.config.agent_package + ".tmp") + if staging.exists(): + shutil.rmtree(staging) + shutil.copytree(matches[0], staging) + os.replace(staging, dest / self.config.agent_package) + + def _pythonpath(self) -> str: + existing = self.config.env.get("PYTHONPATH", "") + entries = [AGENT_RUNTIME] + if existing: + entries.append(existing) + return ":".join(entries) + + +def load_harness(config: MiniBrowseHarnessConfig) -> MiniBrowseHarness: + return MiniBrowseHarness(config) + + +__all__ = [ + "MiniBrowseHarness", + "MiniBrowseHarnessConfig", + "MiniBrowseTaskPayload", + "read_jsonl_tail", +] diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/contract.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/contract.py new file mode 100644 index 000000000..5c514ce07 --- /dev/null +++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/contract.py @@ -0,0 +1,28 @@ +"""Public payload contract consumed by the Mini Browse harness.""" + +from __future__ import annotations + +from typing import Any + +from pydantic import BaseModel, ConfigDict + +TASK_PAYLOAD_PATH = "/task/mini_browse/task.json" +RESULT_PATH = 
"/task/mini_browse/result.json" +TRANSCRIPT_PATH = "/logs/mini_browse/transcript.json" +METRICS_PATH = "/logs/mini_browse/metrics.json" +PROGRESS_PATH = "/logs/mini_browse/progress.jsonl" +WORKSPACE_ROOT = "/workspace/mini-browse" + + +class MiniBrowseTaskPayload(BaseModel): + """Sandbox-visible task payload for the Mini Browse harness.""" + + model_config = ConfigDict(extra="forbid") + + instruction: str + output_schema: dict[str, Any] + browser_api_url: str + start_url: str = "about:blank" + http_proxy: str | None = None + source: str = "verifiers-mini-browse" + task_preamble: str | None = None diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/diagnostics.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/diagnostics.py new file mode 100644 index 000000000..c4b7535fd --- /dev/null +++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/diagnostics.py @@ -0,0 +1,51 @@ +"""Small helpers for surfacing Mini Browse sandbox diagnostics.""" + +from __future__ import annotations + +import json +from collections import deque +from typing import Any + + +async def read_jsonl_tail( + runtime: Any, + path: str, + *, + max_lines: int = 80, + max_chars: int = 20_000, +) -> dict[str, Any]: + """Read a bounded JSONL tail from a sandbox artifact.""" + + try: + raw = await runtime.read(path) + except Exception as exc: + return {"path": path, "is_error": True, "error": str(exc)} + + text = raw.decode("utf-8", errors="replace") + original_chars = len(text) + if max_chars > 0 and original_chars > max_chars: + text = text[-max_chars:] + first_newline = text.find("\n") + if first_newline >= 0: + text = text[first_newline + 1 :] + + events: deque[Any] = deque(maxlen=max(0, max_lines)) + parse_errors = 0 + for line in text.splitlines(): + line = line.strip() + if not line: + continue + try: + events.append(json.loads(line)) + except json.JSONDecodeError: + parse_errors += 1 + 
events.append(line[:1000]) + + return { + "path": path, + "is_error": False, + "events": list(events), + "event_count": len(events), + "parse_errors": parse_errors, + "truncated": original_chars > len(text), + } diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/program.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/program.py new file mode 100644 index 000000000..c0d6ef1c2 --- /dev/null +++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/program.py @@ -0,0 +1,560 @@ +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "aiohttp>=3.11", +# "openai>=2.0", +# "orjson>=3.10", +# "pillow>=11.0", +# "pydantic>=2.0", +# "pypdf>=5.4", +# "pypdfium2>=4.30", +# "python-pptx>=1.0", +# ] +# /// +from __future__ import annotations + +import argparse +import asyncio +import json +import os +import re +import time +from pathlib import Path +from typing import Any + +ERROR_CATEGORY_CODES = { + "none": 0, + "harness_disconnect": 1, + "request_too_large_bytes": 2, + "request_too_large_tokens": 3, + "model_rate_limit": 4, + "model_auth": 5, + "model_bad_request": 6, + "model_internal_error": 7, + "max_steps_exceeded": 8, + "browser_or_sandbox": 9, + "agent_logic_error": 10, + "unknown": 11, + "model_endpoint_gone": 12, + "model_connection_failure": 13, +} + +TOOL_ERROR_PREFIXES = ( + "ValidationError", + "KeyError", + "ValueError", + "RuntimeError", + "AttributeError", + "TypeError", + "Unknown tool", +) + +TOOL_ERROR_BREAKDOWN_NAMES = ("computer", "read_page", "find", "get_page_text") +HTTP_STATUS_RE = re.compile( + r"(?:Error code:|status(?: code)?[=:]?)\s*(\d{3})", re.IGNORECASE +) +DEFAULT_PROGRESS_PATH = "/logs/mini_browse/progress.jsonl" + + +def _read_json(path: Path) -> Any: + return json.loads(path.read_text()) + + +def _read_optional_json(path: Path) -> Any: + if not path.exists(): + return None + return _read_json(path) + + +def _json_safe(value: 
Any) -> Any: + try: + json.dumps(value) + return value + except TypeError: + return repr(value) + + +def _write_progress(progress_path: Path, event: str, **fields: Any) -> None: + try: + progress_path.parent.mkdir(parents=True, exist_ok=True) + payload = { + "event": event, + "timestamp": time.time(), + **{key: _json_safe(value) for key, value in fields.items()}, + } + with progress_path.open("a", encoding="utf-8") as handle: + handle.write(json.dumps(payload, ensure_ascii=False, sort_keys=True) + "\n") + handle.flush() + except Exception: + return + + +def _env_float(name: str, default: float = 0.0) -> float: + raw = os.environ.get(name, "").strip() + if not raw: + return default + try: + return max(0.0, float(raw)) + except ValueError: + return default + + +def _env_int(name: str, default: int = 0) -> int: + raw = os.environ.get(name, "").strip() + if not raw: + return default + try: + return max(0, int(raw)) + except ValueError: + return default + + +def _env_bool(name: str, default: bool = False) -> bool: + raw = os.environ.get(name) + if raw is None: + return default + return raw.strip().lower() not in {"", "0", "false", "no", "off"} + + +def _http_status_from_exception(exc: BaseException | None) -> int | None: + if exc is None: + return None + status = getattr(exc, "status_code", None) + try: + return int(status) if status is not None else None + except (TypeError, ValueError): + return None + + +def _http_status_from_text(text: str | None) -> int | None: + if not text: + return None + match = HTTP_STATUS_RE.search(text) + if not match: + return None + try: + return int(match.group(1)) + except ValueError: + return None + + +def _classify_exception(exc: BaseException) -> str: + if isinstance(exc, asyncio.CancelledError): + return "harness_disconnect" + + try: + import openai + + if isinstance(exc, openai.RateLimitError): + return "model_rate_limit" + if isinstance(exc, (openai.AuthenticationError, openai.PermissionDeniedError)): + return "model_auth" + if 
isinstance(exc, openai.BadRequestError): + text = str(exc).lower() + bytes_markers = ( + "request entity too large", + "payload too large", + "413 request", + "413 payload", + ) + if any(marker in text for marker in bytes_markers): + return "request_too_large_bytes" + token_markers = ( + "context length", + "maximum context", + "too many tokens", + "context_length_exceeded", + "context window", + "input is too long", + ) + if any(marker in text for marker in token_markers): + return "request_too_large_tokens" + return "model_bad_request" + status = _http_status_from_exception(exc) + if isinstance(exc, openai.NotFoundError) or status == 404: + return "model_endpoint_gone" + if isinstance(exc, openai.InternalServerError) or ( + status is not None and 500 <= status < 600 + ): + return "model_internal_error" + if isinstance(exc, (openai.APIConnectionError, openai.APITimeoutError)): + return "model_connection_failure" + if isinstance(exc, openai.APIError): + if status == 404: + return "model_endpoint_gone" + if status is not None and 500 <= status < 600: + return "model_internal_error" + return "model_connection_failure" + except ImportError: + pass + + try: + import aiohttp + + if isinstance(exc, aiohttp.ClientError): + return "model_connection_failure" + except ImportError: + pass + + if isinstance(exc, TimeoutError): + return "model_connection_failure" + if isinstance(exc, ConnectionError): + return "model_connection_failure" + if isinstance(exc, OSError): + return "browser_or_sandbox" + if isinstance( + exc, (KeyError, TypeError, AttributeError, ValueError, RuntimeError, IndexError) + ): + return "agent_logic_error" + return "unknown" + + +def _diagnose(exc: BaseException | None, error_text: str | None) -> dict[str, Any]: + if exc is not None: + category = _classify_exception(exc) + error_type = type(exc).__name__ + excerpt = str(exc)[:1200] + http_status = _http_status_from_exception(exc) or _http_status_from_text( + excerpt + ) + elif error_text: + text = 
str(error_text) + error_type = text.split(":", 1)[0][:120] if ":" in text else text[:120] + excerpt = text[:1200] + http_status = _http_status_from_text(text) + if http_status == 404: + category = "model_endpoint_gone" + elif http_status is not None and 500 <= http_status < 600: + category = "model_internal_error" + elif "maximum steps exceeded" in text.lower(): + category = "max_steps_exceeded" + else: + category = "unknown" + else: + return { + "error_type": None, + "error_category": "none", + "error_category_code": ERROR_CATEGORY_CODES["none"], + "error_excerpt": None, + "error_http_status": None, + } + return { + "error_type": error_type, + "error_category": category, + "error_category_code": ERROR_CATEGORY_CODES[category], + "error_excerpt": excerpt, + "error_http_status": http_status, + } + + +def _count_image_parts(messages: list[dict[str, Any]]) -> int: + count = 0 + for message in messages: + content = message.get("content") if isinstance(message, dict) else None + if isinstance(content, list): + for part in content: + if isinstance(part, dict) and part.get("type") == "image_url": + count += 1 + return count + + +def _json_size_bytes(value: Any) -> int: + try: + return len(json.dumps(value, ensure_ascii=False).encode("utf-8")) + except Exception: + return 0 + + +def _summarize_tool_errors(messages: list[dict[str, Any]]) -> dict[str, Any]: + total = 0 + validation = 0 + streak = 0 + max_streak = 0 + by_tool: dict[str, int] = {} + unique_kinds: set[str] = set() + id_to_tool: dict[str, str] = {} + + for message in messages: + if not isinstance(message, dict): + continue + role = message.get("role") + if role == "assistant": + tool_calls = message.get("tool_calls") or [] + if not isinstance(tool_calls, list): + continue + for tool_call in tool_calls: + if isinstance(tool_call, str): + try: + tool_call = json.loads(tool_call) + except json.JSONDecodeError: + continue + if not isinstance(tool_call, dict): + continue + tool_call_id = tool_call.get("id") + 
function = tool_call.get("function") + if isinstance(function, dict): + name = function.get("name") + else: + name = tool_call.get("name") + if isinstance(tool_call_id, str) and isinstance(name, str): + id_to_tool[tool_call_id] = name + elif role == "tool": + content = message.get("content") + if not isinstance(content, str): + streak = 0 + continue + stripped = content.lstrip() + if not any(stripped.startswith(prefix) for prefix in TOOL_ERROR_PREFIXES): + streak = 0 + continue + + total += 1 + if stripped.startswith("ValidationError"): + validation += 1 + tool_name = id_to_tool.get(message.get("tool_call_id") or "", "unknown") + by_tool[tool_name] = by_tool.get(tool_name, 0) + 1 + unique_kinds.add(stripped.split("\n", 1)[0][:200]) + streak += 1 + max_streak = max(max_streak, streak) + + return { + "tool_error_count": total, + "tool_error_validation": validation, + "tool_error_max_streak": max_streak, + "tool_error_unique_kinds": len(unique_kinds), + "tool_error_by_tool": by_tool, + } + + +def _load_task_payload(path: Path) -> dict[str, Any]: + payload = _read_json(path) + if not isinstance(payload, dict): + raise ValueError(f"Mini Browse task payload must be an object: {path}") + instruction = payload.get("instruction") + if not isinstance(instruction, str) or not instruction.strip(): + raise ValueError("Mini Browse task payload requires non-empty instruction") + output_schema = payload.get("output_schema") + if not isinstance(output_schema, dict): + raise ValueError("Mini Browse task payload requires object output_schema") + return payload + + +async def _run(args: argparse.Namespace) -> int: + from mini_browse import run_bcu_task + + task_path = Path(args.task) + result_path = Path(args.result) + transcript_path = Path(args.transcript) + metrics_path = Path(args.metrics) + progress_path = Path(args.progress) + workspace_root = Path(args.workspace_root) + + result_path.parent.mkdir(parents=True, exist_ok=True) + transcript_path.parent.mkdir(parents=True, 
exist_ok=True) + metrics_path.parent.mkdir(parents=True, exist_ok=True) + progress_path.parent.mkdir(parents=True, exist_ok=True) + workspace_root.mkdir(parents=True, exist_ok=True) + os.environ["MINI_BROWSE_PROGRESS_PATH"] = str(progress_path) + _write_progress( + progress_path, + "harness_program_start", + task_path=str(task_path), + result_path=str(result_path), + transcript_path=str(transcript_path), + metrics_path=str(metrics_path), + workspace_root=str(workspace_root), + ) + + task_payload = _load_task_payload(task_path) + _write_progress( + progress_path, + "task_payload_loaded", + source=task_payload.get("source"), + start_url=task_payload.get("start_url"), + instruction_chars=len(task_payload.get("instruction") or ""), + output_schema_keys=sorted((task_payload.get("output_schema") or {}).keys()), + has_browser_api_url=bool(task_payload.get("browser_api_url")), + has_http_proxy=bool(task_payload.get("http_proxy")), + ) + instruction = task_payload["instruction"].strip() + output_schema = task_payload["output_schema"] + start_url = str(task_payload.get("start_url") or "about:blank") + browser_api_url = str(task_payload.get("browser_api_url") or "").strip() + if browser_api_url: + os.environ["MINI_BROWSE_BROWSER_API_URL"] = browser_api_url + http_proxy = str(task_payload.get("http_proxy") or "").strip() + if http_proxy: + os.environ["PERPLEXITY_TAILSCALE_HTTP_PROXY"] = http_proxy + source = str(task_payload.get("source") or "verifiers-mini-browse") + task_preamble = str( + task_payload.get("task_preamble") + or os.environ.get("MINI_BROWSE_TASK_PREAMBLE") + or "" + ) + conversation = ( + _read_optional_json(Path(args.conversation)) if args.conversation else None + ) + if conversation is not None and not isinstance(conversation, list): + raise ValueError("Mini Browse conversation payload must be a list") + model = os.environ.get("OPENAI_MODEL", "intercepted/model") + coordinate_mode = os.environ.get("MINI_BROWSE_COORDINATE_MODE", "relative_1000") + + payload: 
dict[str, Any] + messages: list[dict[str, Any]] = [] + exc_caught: BaseException | None = None + try: + _write_progress( + progress_path, + "run_bcu_task_start", + model=model, + coordinate_mode=coordinate_mode, + max_steps=int(args.max_steps), + ) + run_result = await run_bcu_task( + task=instruction, + url=start_url, + output_schema=output_schema, + model=model, + max_steps=int(args.max_steps), + workspace_root=workspace_root, + include_builtin_tools=_env_bool("MINI_BROWSE_INCLUDE_BUILTIN_TOOLS"), + source=source, + task_preamble=task_preamble, + coordinate_mode=coordinate_mode, + conversation=conversation, + browser_start_min_interval_seconds=_env_float( + "MINI_BROWSE_BROWSER_START_MIN_INTERVAL_SECONDS" + ), + browser_start_jitter_seconds=_env_float( + "MINI_BROWSE_BROWSER_START_JITTER_SECONDS" + ), + browser_start_max_in_flight=_env_int( + "MINI_BROWSE_BROWSER_START_MAX_IN_FLIGHT" + ), + ) + _write_progress( + progress_path, + "run_bcu_task_done", + is_error=run_result.is_error, + submitted_result_present=bool(run_result.submitted_result), + message_count=len(run_result.messages), + browser_session_id=run_result.browser_session_id, + ) + messages = run_result.messages + payload = { + "response": run_result.response, + "is_error": run_result.is_error, + "error": run_result.error, + "is_cancelled": run_result.is_cancelled, + "browser_session_id": run_result.browser_session_id, + "tab_group_id": run_result.tab_group_id, + "submitted_result": _json_safe(run_result.submitted_result), + "workspace_root": run_result.workspace_root, + "message_count": len(messages), + "coordinate_mode": coordinate_mode, + } + except BaseException as exc: + exc_caught = exc + _write_progress( + progress_path, + "run_bcu_task_exception", + error_type=type(exc).__name__, + error_excerpt=str(exc)[:500], + is_cancelled=isinstance(exc, asyncio.CancelledError), + ) + payload = { + "response": "", + "is_error": True, + "error": f"{type(exc).__name__}: {exc}", + "is_cancelled": isinstance(exc, 
asyncio.CancelledError), + "browser_session_id": None, + "tab_group_id": None, + "submitted_result": None, + "workspace_root": str(workspace_root), + "message_count": len(messages), + "coordinate_mode": coordinate_mode, + } + + diagnostics = _diagnose(exc_caught, payload.get("error")) + payload.update(diagnostics) + payload["transcript_image_count"] = _count_image_parts(messages) + payload["transcript_json_bytes"] = _json_size_bytes(messages) + payload.update(_summarize_tool_errors(messages)) + + submitted = payload.get("submitted_result") + response = payload.get("response") + answered = bool(submitted) or bool(isinstance(response, str) and response.strip()) + category = payload.get("error_category") + metrics = { + "answered": float(answered and not payload.get("is_error")), + "is_error": float(bool(payload.get("is_error"))), + "message_count": float(payload.get("message_count") or 0), + "submitted_result_present": float(bool(submitted)), + "has_browser_session": float(bool(payload.get("browser_session_id"))), + "error_category_code": float(payload.get("error_category_code") or 0), + "error_http_status": float(payload.get("error_http_status") or 0), + "transcript_image_count": float(payload.get("transcript_image_count") or 0), + "transcript_json_bytes": float(payload.get("transcript_json_bytes") or 0), + "tool_error_count": float(payload.get("tool_error_count") or 0), + "tool_error_validation": float(payload.get("tool_error_validation") or 0), + "tool_error_max_streak": float(payload.get("tool_error_max_streak") or 0), + "tool_error_unique_kinds": float(payload.get("tool_error_unique_kinds") or 0), + } + for category_name in ERROR_CATEGORY_CODES: + if category_name == "none": + continue + metrics[f"error_{category_name}"] = float(category == category_name) + by_tool = payload.get("tool_error_by_tool") or {} + for tool_name in TOOL_ERROR_BREAKDOWN_NAMES: + metrics[f"tool_error_{tool_name}"] = float(by_tool.get(tool_name, 0)) + + 
result_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2)) + transcript_path.write_text(json.dumps(messages, ensure_ascii=False, indent=2)) + metrics_path.write_text(json.dumps(metrics, ensure_ascii=False, indent=2)) + _write_progress( + progress_path, + "harness_program_artifacts_written", + error_category=category, + is_error=payload.get("is_error"), + result_path=str(result_path), + transcript_path=str(transcript_path), + metrics_path=str(metrics_path), + ) + + print( + json.dumps( + { + "result_path": str(result_path), + "metrics_path": str(metrics_path), + "transcript_path": str(transcript_path), + "progress_path": str(progress_path), + "error_category": category, + "error_type": payload.get("error_type"), + "error_excerpt": payload.get("error_excerpt"), + } + ) + ) + if exc_caught is not None and not isinstance(exc_caught, Exception): + raise exc_caught + return 0 + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Run the Mini Browse harness.") + parser.add_argument("--task", required=True) + parser.add_argument("--result", required=True) + parser.add_argument("--transcript", required=True) + parser.add_argument("--metrics", required=True) + parser.add_argument("--progress", default=DEFAULT_PROGRESS_PATH) + parser.add_argument("--conversation") + parser.add_argument("--max-steps", type=int, default=75) + parser.add_argument("--workspace-root", default="/workspace/mini-browse") + return parser.parse_args() + + +def main() -> int: + return asyncio.run(_run(_parse_args())) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py new file mode 100644 index 000000000..b1228336e --- /dev/null +++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py @@ -0,0 +1,150 @@ +"""LLM judge support for Mini Browse 
local-app tasks.""" + +from __future__ import annotations + +import json +import os +from pathlib import Path +from typing import Any + +from openai import AsyncOpenAI + +JUDGE_TEMPERATURE = 0 + +JUDGE_PROMPT = """You evaluate a browser automation agent's submitted result for a deterministic local flight-search task. + +Use the evaluation contract and gold answer as the source of truth. Score only +expected_fields where score is true or absent. Ignore verifier metadata, ids, +internal keys, hidden seed fields, and non-scoreable diagnostics. + +Treat equivalent formatting as correct when the same fact is clearly attached to +the same flight leg, date, row, provider, fare, comparison slot, or outcome. +Examples: "Nonstop" equals "0 stops"; prices match after removing currency +symbols and commas; date formats match when they refer to the same calendar date; +duration strings match when they have the same total minutes. + +Extra fields are fine unless they contradict the gold answer. Critical fields +with critical=true are hard gates: if any scoreable critical field is missing or +wrong, the verdict must be "no". 
+ +Return exactly one JSON object, no prose and no code fence: +{ + "correct_fields": , + "total_fields": , + "score": , + "verdict": "yes" | "partial" | "no", + "explanation": "", + "field_verdicts": [ + { + "field_path": "", + "verdict": "exact_match" | "semantic_match" | "wrong" | "missing", + "reason": "" + } + ] +} +""" + + +async def judge_answer_key( + *, + task_instruction: str, + submitted_result: Any, + answer_key: dict[str, Any], + output_schema: dict[str, Any], + model: str, + base_url: str | None, + api_key_env: str, +) -> dict[str, Any]: + context = { + "task_instruction": task_instruction, + "submitted_result": submitted_result, + "evaluation_contract": answer_key.get("evaluator") or {}, + "gold_answer": answer_key.get("gold_answer") or answer_key, + "output_schema": output_schema, + } + response = await judge_client(base_url, api_key_env).chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": JUDGE_PROMPT}, + { + "role": "user", + "content": json.dumps(context, ensure_ascii=False, sort_keys=True), + }, + ], + temperature=JUDGE_TEMPERATURE, + response_format={"type": "json_object"}, + ) + content = response.choices[0].message.content or "{}" + return parse_json_object(content) + + +def judge_client(base_url: str | None, api_key_env: str) -> AsyncOpenAI: + api_key = os.environ.get(api_key_env) + if not api_key: + raise ValueError(f"Missing judge API key env var {api_key_env}") + kwargs: dict[str, Any] = {"api_key": api_key} + if base_url: + kwargs["base_url"] = base_url + default_headers = prime_default_headers(base_url) + if default_headers: + kwargs["default_headers"] = default_headers + return AsyncOpenAI(**kwargs) + + +def score_from_judge_payload(payload: dict[str, Any]) -> float: + correct = payload.get("correct_fields") + total = payload.get("total_fields") + if isinstance(correct, int) and isinstance(total, int) and total > 0: + return max(0.0, min(1.0, correct / total)) + score = payload.get("score") + if 
isinstance(score, (int, float)) and not isinstance(score, bool): + return max(0.0, min(1.0, float(score))) + verdict = str(payload.get("verdict") or "").lower() + if verdict == "yes": + return 1.0 + if verdict == "partial": + return 0.5 + return 0.0 + + +def parse_json_object(content: str) -> dict[str, Any]: + try: + parsed = json.loads(content) + except json.JSONDecodeError: + start = content.find("{") + end = content.rfind("}") + if start < 0 or end <= start: + return {"score": 0.0, "explanation": content[:500], "verdict": "no"} + parsed = json.loads(content[start : end + 1]) + if not isinstance(parsed, dict): + return { + "score": 0.0, + "explanation": "judge returned non-object", + "verdict": "no", + } + return parsed + + +def prime_team_id() -> str | None: + for name in ("PRIME_TEAM_ID", "PI_TEAM_ID", "X_PRIME_TEAM_ID"): + value = os.environ.get(name) + if value: + return value + config_path = Path.home() / ".prime" / "config.json" + try: + if config_path.exists(): + config = json.loads(config_path.read_text()) + if isinstance(config, dict): + value = config.get("team_id") + if value: + return str(value) + except (json.JSONDecodeError, OSError): + return None + return None + + +def prime_default_headers(base_url: str | None) -> dict[str, str]: + if not base_url or "pinference" not in base_url.lower(): + return {} + team_id = prime_team_id() + return {"X-Prime-Team-ID": team_id} if team_id else {} diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/taskset.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/taskset.py new file mode 100644 index 000000000..84af873c4 --- /dev/null +++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/taskset.py @@ -0,0 +1,533 @@ +"""mini-browse-apps-platform-v1: local-app Mini Browse tasks pulled from the Prime hub.""" + +from __future__ import annotations + +import gzip +import hashlib +import json +import os +import shlex +import shutil +import 
sqlite3 +import subprocess +import tempfile +import zlib +from pathlib import Path +from typing import Any + +from pydantic import Field +from verifiers.v1.errors import TasksetError + +import verifiers.v1 as vf + +from .harness.contract import ( + METRICS_PATH, + PROGRESS_PATH, + RESULT_PATH, + TASK_PAYLOAD_PATH, + TRANSCRIPT_PATH, + MiniBrowseTaskPayload, +) +from .harness.diagnostics import read_jsonl_tail +from .judge import judge_answer_key, score_from_judge_payload + +DEFAULT_SANDBOX_IMAGE = ( + "team-cmlr3u2er002zhr01tj8f48ts/" + "mini-browse-apps:destination-autocomplete-tight-20260528-0027" +) +DEFAULT_HUB_ENV_ID = "prime/mini-browse-apps-platform-v1" +DEFAULT_DATASET_FILENAME = "google_flights_10.jsonl.gz" + +APP_PORT = 5173 +CDP_PORT = 18080 +APP_URL = f"http://127.0.0.1:{APP_PORT}" +BROWSER_API_URL = f"http://127.0.0.1:{CDP_PORT}" +WORKDIR = "/workspace" +APP_SEED_PATH = "/task/app_seed.json" +SERVICE_LOG_DIR = "/logs/services" +APP_LOG_PATH = f"{SERVICE_LOG_DIR}/app.log" +CDP_LOG_PATH = f"{SERVICE_LOG_DIR}/cdp.log" +APP_SERVER = "/opt/mini-browse-services/spa_server.py" +CDP_SERVER = "/opt/mini-browse-services/local_cdp_service.py" +APP_ROOT = "/opt/mini-browse-app/dist" + +SETUP_TIMEOUT_SECONDS = 600 +FINALIZE_TIMEOUT_SECONDS = 120 +SCORING_TIMEOUT_SECONDS = 180 +DEFAULT_TIMEOUT_SECONDS = 3600.0 +DEFAULT_SANDBOX_CPU = 2 +DEFAULT_SANDBOX_MEMORY_GB = 4 +DEFAULT_SANDBOX_DISK_GB = 10 +TASKSET_SHUFFLE_SEED = "google_flights_kernel_v1_dense_hard_no_bag_1188_order_v1" + + +class MiniBrowseAppTask(vf.Task): + """One Mini Browse task backed by a sandboxed local web app.""" + + prompt: str + output_schema: dict[str, Any] + answer_key: dict[str, Any] + app_seed_ref: str | None = None + metadata: dict[str, Any] = Field(default_factory=dict) + + +class MiniBrowseAppsConfig(vf.TasksetConfig): + id: str = "mini-browse-apps-platform-v1" + dataset_path: str | None = None + """Explicit local dataset (JSONL/JSONL.GZ); when set, skips the hub pull.""" + hub_env_id: str = 
DEFAULT_HUB_ENV_ID + """Prime hub environment the dataset is pulled from when no `dataset_path` is given.""" + hub_version: str = "latest" + dataset_filename: str = DEFAULT_DATASET_FILENAME + cache_dir: str | None = None + """Where the pulled dataset is cached (default: ~/.cache/verifiers/mini-browse-apps).""" + seed_store_path: str | None = None + task_indices: list[int] | None = None + task_profile: str = "default" + shuffle_tasks: bool = True + taskset_shuffle_seed: str = TASKSET_SHUFFLE_SEED + sandbox_image: str = DEFAULT_SANDBOX_IMAGE + timeout_seconds: float = DEFAULT_TIMEOUT_SECONDS + judge_model: str = "google/gemini-3.1-pro-preview" + judge_base_url: str | None = "https://api.pinference.ai/api/v1" + judge_api_key_env: str = "PRIME_API_KEY" + + +class MiniBrowseAppsTaskset(vf.Taskset[MiniBrowseAppTask, MiniBrowseAppsConfig]): + """Owns local-app rows, sandbox app startup, and submitted-result judging.""" + + NEEDS_CONTAINER = True + + def __init__(self, config: MiniBrowseAppsConfig) -> None: + super().__init__(config) + self._inline_app_seeds: dict[str, dict[str, Any]] = {} + + def load_tasks(self) -> list[MiniBrowseAppTask]: + rows = self.load_rows() + rows = self.filter_rows(rows) + if self.config.shuffle_tasks: + rows = self.stable_shuffle_rows(rows) + if self.config.task_indices is not None: + rows = self.select_task_indices(rows) + if not rows: + raise ValueError("No Mini Browse app tasks were loaded") + return [self.normalize_row(i, row) for i, row in enumerate(rows)] + + async def setup(self, task: MiniBrowseAppTask, runtime: vf.Runtime) -> None: + app_seed = self.app_seed_for_task(task) + await ensure_runtime_dirs(runtime) + await write_runtime_json(runtime, APP_SEED_PATH, app_seed) + public_payload = MiniBrowseTaskPayload( + instruction=task.prompt, + start_url=APP_URL, + output_schema=task.output_schema, + browser_api_url=BROWSER_API_URL, + source="mini-browse-apps-platform-v1", + ) + await runtime.write( + TASK_PAYLOAD_PATH, + 
public_payload.model_dump_json(indent=2).encode("utf-8"), + ) + await start_services(runtime) + await wait_for_services(runtime) + + async def finalize( + self, task: MiniBrowseAppTask, trace: vf.Trace, runtime: vf.Runtime + ) -> None: + del task + result = await read_runtime_json(runtime, RESULT_PATH) + metrics = await read_runtime_json(runtime, METRICS_PATH) + trace.info["mini_browse_result"] = result + trace.info["mini_browse_metrics"] = metrics + trace.info["mini_browse_artifacts"] = { + "result_path": RESULT_PATH, + "transcript_path": TRANSCRIPT_PATH, + "metrics_path": METRICS_PATH, + "progress_path": PROGRESS_PATH, + "task_payload_path": TASK_PAYLOAD_PATH, + "app_seed_path": APP_SEED_PATH, + "app_log_path": APP_LOG_PATH, + "cdp_log_path": CDP_LOG_PATH, + } + if isinstance(result, dict): + trace.info["submitted_result"] = result.get("submitted_result") + if result.get("is_error"): + trace.info["mini_browse_progress_tail"] = await read_jsonl_tail( + runtime, + PROGRESS_PATH, + ) + + @vf.reward(weight=1.0) + async def answer_key(self, task: MiniBrowseAppTask, trace: vf.Trace) -> float: + result = trace_result(trace) + submitted = result.get("submitted_result") + if result.get("is_error") or not submitted: + trace.info["mini_browse_judge"] = { + "verdict": "no", + "explanation": result.get("error") or "missing submitted result", + } + return 0.0 + + judge_payload = await judge_answer_key( + task_instruction=task.prompt, + submitted_result=submitted, + answer_key=task.answer_key, + output_schema=task.output_schema, + model=self.config.judge_model, + base_url=self.config.judge_base_url, + api_key_env=self.config.judge_api_key_env, + ) + trace.info["mini_browse_judge"] = judge_payload + return score_from_judge_payload(judge_payload) + + @vf.metric + async def result_present(self, trace: vf.Trace) -> float: + return float(bool(trace_result(trace))) + + @vf.metric + async def submitted_result_present(self, trace: vf.Trace) -> float: + return 
float(bool(trace_result(trace).get("submitted_result"))) + + @vf.metric + async def agent_error(self, trace: vf.Trace) -> float: + return float(bool(trace_result(trace).get("is_error"))) + + @vf.metric + async def transcript_image_count(self, trace: vf.Trace) -> float: + return metric(trace, "transcript_image_count") + + @vf.metric + async def message_count(self, trace: vf.Trace) -> float: + return metric(trace, "message_count") + + def load_rows(self) -> list[dict[str, Any]]: + path = self.resolved_dataset_path() + if path.suffix == ".gz" or path.suffixes[-2:] == [".jsonl", ".gz"]: + with gzip.open(path, "rt", encoding="utf-8") as handle: + return [json.loads(line) for line in handle if line.strip()] + with path.open("r", encoding="utf-8") as handle: + return [json.loads(line) for line in handle if line.strip()] + + def resolved_dataset_path(self) -> Path: + if self.config.dataset_path: + path = Path(self.config.dataset_path).expanduser() + if not path.exists(): + raise FileNotFoundError(f"Mini Browse app dataset not found: {path}") + return path + return self.ensure_cached_dataset() + + def ensure_cached_dataset(self) -> Path: + cache_root = ( + Path(self.config.cache_dir).expanduser() + if self.config.cache_dir + else Path.home() / ".cache" / "verifiers" / "mini-browse-apps" + ) + cached = cache_root / self.config.hub_version / self.config.dataset_filename + if not cached.exists(): + cached.parent.mkdir(parents=True, exist_ok=True) + self.pull_dataset_into(cached) + return cached + + def pull_dataset_into(self, dest: Path) -> None: + """Pull the env package from the Prime hub into a temp dir and copy the dataset out.""" + with tempfile.TemporaryDirectory(prefix="mini-browse-hub-") as tmp: + result = subprocess.run( + [ + "prime", "env", "pull", self.config.hub_env_id, + "-v", self.config.hub_version, "-t", tmp, "--plain", + ], + capture_output=True, + text=True, + ) + if result.returncode != 0: + detail = (result.stderr or result.stdout).strip()[-1000:] + raise 
RuntimeError( + f"`prime env pull {self.config.hub_env_id}` failed: {detail}" + ) + matches = sorted(Path(tmp).rglob(self.config.dataset_filename)) + if not matches: + raise FileNotFoundError( + f"{self.config.dataset_filename!r} not found in pulled hub env " + f"{self.config.hub_env_id!r}" + ) + staging = dest.with_name(dest.name + ".tmp") + shutil.copyfile(matches[0], staging) + os.replace(staging, dest) + + def filter_rows(self, rows: list[dict[str, Any]]) -> list[dict[str, Any]]: + if self.config.task_profile == "default": + return rows + return [row for row in rows if source_profile(row) == self.config.task_profile] + + def stable_shuffle_rows(self, rows: list[dict[str, Any]]) -> list[dict[str, Any]]: + seed = self.config.taskset_shuffle_seed + + def key(row: dict[str, Any]) -> tuple[str, str]: + info = decode_info(row.get("info") or {}) + task_id = str(row.get("task_id") or info.get("task_name") or "") + digest = hashlib.sha256(f"{seed}:{task_id}".encode("utf-8")).hexdigest() + return digest, task_id + + return sorted(rows, key=key) + + def select_task_indices(self, rows: list[dict[str, Any]]) -> list[dict[str, Any]]: + selected = [] + for index in self.config.task_indices or []: + if index < 0 or index >= len(rows): + raise ValueError( + f"task_indices contains out-of-range index {index}; " + f"filtered taskset has {len(rows)} rows" + ) + selected.append(rows[index]) + return selected + + def normalize_row(self, index: int, row: dict[str, Any]) -> MiniBrowseAppTask: + info = decode_info(row.get("info") or {}) + raw_instruction = info.get("instruction") or row.get("question") + if not isinstance(raw_instruction, str) or not raw_instruction.strip(): + raise ValueError(f"row {index} is missing a task instruction") + output_schema = info.get("output_schema") + if not isinstance(output_schema, dict): + raise ValueError(f"row {index} is missing output_schema") + answer_key = info.get("answer_key") or parse_answer(row.get("answer")) + if not isinstance(answer_key, 
dict): + raise ValueError(f"row {index} is missing answer_key") + + task_name = str(row.get("task_id") or info.get("task_name") or index) + app_seed = info.get("app_seed") + if app_seed is not None and not isinstance(app_seed, dict): + raise ValueError(f"row {index} has non-object app_seed") + app_seed_ref = info.get("app_seed_ref") + if app_seed is not None and app_seed_ref: + self._inline_app_seeds[str(app_seed_ref)] = app_seed + + return MiniBrowseAppTask( + idx=index, + name=task_name, + prompt=raw_instruction.strip(), + image=self.config.sandbox_image, + workdir=WORKDIR, + timeout=vf.TaskTimeout( + setup=SETUP_TIMEOUT_SECONDS, + harness=self.config.timeout_seconds, + finalize=FINALIZE_TIMEOUT_SECONDS, + scoring=SCORING_TIMEOUT_SECONDS, + ), + resources=vf.TaskResources( + cpu=DEFAULT_SANDBOX_CPU, + memory=DEFAULT_SANDBOX_MEMORY_GB, + disk=DEFAULT_SANDBOX_DISK_GB, + ), + output_schema=output_schema, + answer_key=answer_key, + app_seed_ref=str(app_seed_ref) if app_seed_ref else None, + metadata={ + "task_name": info.get("task_name"), + "task_id": row.get("task_id") or answer_key.get("task_id"), + "answer_kind": answer_key.get("answer_kind"), + "source_profile": source_profile(row), + "source_dataset": info.get("source_dataset"), + }, + ) + + def app_seed_for_task(self, task: MiniBrowseAppTask) -> dict[str, Any]: + if not task.app_seed_ref: + raise ValueError(f"Task {task.name} has neither app_seed nor app_seed_ref") + inline_seed = self._inline_app_seeds.get(task.app_seed_ref) + if inline_seed is not None: + return inline_seed + seed_store = self.resolved_seed_store_path() + if seed_store is None: + raise ValueError( + f"Task {task.name} needs seed {task.app_seed_ref}, but no seed store " + "was configured" + ) + return load_seed(seed_store, task.app_seed_ref) + + def resolved_seed_store_path(self) -> Path | None: + if self.config.seed_store_path: + return Path(self.config.seed_store_path).expanduser() + if not self.config.dataset_path: + return None + 
dataset_path = Path(self.config.dataset_path).expanduser() + return seed_store_for_artifact_path(dataset_path) + + +async def ensure_runtime_dirs(runtime: vf.Runtime) -> None: + result = await runtime.run( + [ + "bash", + "-lc", + f"mkdir -p /task {WORKDIR} {SERVICE_LOG_DIR} " + f"{shlex.quote(str(Path(TASK_PAYLOAD_PATH).parent))}", + ], + {}, + ) + if result.exit_code != 0: + raise TasksetError( + f"Mini Browse app setup failed: {combined_output(result)}" + ) + + +async def start_services(runtime: vf.Runtime) -> None: + await runtime.run_background( + [ + "python3", + APP_SERVER, + "--host", + "127.0.0.1", + "--port", + str(APP_PORT), + "--root", + APP_ROOT, + ], + {"TASK_SEED_PATH": APP_SEED_PATH}, + APP_LOG_PATH, + ) + await runtime.run_background( + [ + "python3", + CDP_SERVER, + "--host", + "127.0.0.1", + "--port", + str(CDP_PORT), + "--chrome", + "/usr/bin/chromium", + "--headless", + ], + {}, + CDP_LOG_PATH, + ) + + +async def wait_for_services(runtime: vf.Runtime) -> None: + script = f"""\ +set -e +for i in $(seq 1 90); do + if curl --noproxy '*' -fsS --max-time 2 {APP_URL} >/dev/null \\ + && curl --noproxy '*' -fsS --max-time 2 {BROWSER_API_URL}/healthz >/dev/null; then + echo "services ready" + exit 0 + fi + sleep 1 +done +echo "service readiness failed" +echo "--- process list ---" +ps aux || true +echo "--- app log ---" +tail -120 {APP_LOG_PATH} 2>/dev/null || true +echo "--- cdp log ---" +tail -120 {CDP_LOG_PATH} 2>/dev/null || true +exit 1 +""" + result = await runtime.run(["bash", "-lc", script], {}) + if result.exit_code != 0: + raise TasksetError( + f"Mini Browse app services did not become ready: {combined_output(result)}" + ) + + +async def write_runtime_json(runtime: vf.Runtime, path: str, value: Any) -> None: + data = json.dumps(value, ensure_ascii=False, indent=2).encode("utf-8") + await runtime.write(path, data) + + +async def read_runtime_json(runtime: vf.Runtime, path: str) -> Any: + try: + raw = await runtime.read(path) + except Exception 
as exc: + return {"is_error": True, "error": f"missing runtime artifact {path}: {exc}"} + text = raw.decode("utf-8", errors="replace").strip() + if not text: + return {} + try: + return json.loads(text) + except json.JSONDecodeError: + return { + "is_error": True, + "error": f"invalid JSON artifact {path}: {text[:500]}", + } + + +def trace_result(trace: vf.Trace) -> dict[str, Any]: + result = trace.info.get("mini_browse_result") + return result if isinstance(result, dict) else {} + + +def metric(trace: vf.Trace, key: str) -> float: + metrics = trace.info.get("mini_browse_metrics") + if isinstance(metrics, dict): + value = metrics.get(key) + if isinstance(value, (int, float)) and not isinstance(value, bool): + return float(value) + value = trace_result(trace).get(key) + if isinstance(value, (int, float)) and not isinstance(value, bool): + return float(value) + return 0.0 + + +def decode_info(info: Any) -> dict[str, Any]: + if isinstance(info, str): + return json.loads(info) + return dict(info or {}) + + +def parse_answer(answer: Any) -> Any: + if isinstance(answer, str): + return json.loads(answer) + return answer + + +def source_profile(row: dict[str, Any]) -> str | None: + info = decode_info(row.get("info") or {}) + factory = info.get("factory") or {} + if not isinstance(factory, dict): + return None + profile = factory.get("source_profile") + return str(profile) if profile else None + + +def seed_store_for_artifact_path(path: Path) -> Path: + name = path.name + if name.endswith(".tasks.jsonl.gz"): + return path.with_name(name.removesuffix(".tasks.jsonl.gz") + ".seeds.sqlite") + if name.endswith(".jsonl.gz"): + return path.with_name(name.removesuffix(".jsonl.gz") + ".seeds.sqlite") + if path.suffix: + return path.with_suffix(".seeds.sqlite") + return path.with_name(name + ".seeds.sqlite") + + +def load_seed(seed_store: Path, seed_ref: str) -> dict[str, Any]: + if not seed_store.exists(): + raise FileNotFoundError(f"Mini Browse app seed store not found: 
{seed_store}") + with sqlite3.connect(seed_store) as db: + columns = { + row[1] for row in db.execute("PRAGMA table_info(app_seeds)").fetchall() + } + if "app_seed_zlib" in columns: + row = db.execute( + "SELECT app_seed_zlib FROM app_seeds WHERE seed_id = ?", + (seed_ref,), + ).fetchone() + seed_json = None if row is None else zlib.decompress(row[0]).decode("utf-8") + else: + row = db.execute( + "SELECT app_seed_json FROM app_seeds WHERE seed_id = ?", + (seed_ref,), + ).fetchone() + seed_json = None if row is None else row[0] + if seed_json is None: + raise KeyError(f"Seed {seed_ref!r} not found in {seed_store}") + seed = json.loads(seed_json) + if not isinstance(seed, dict): + raise ValueError(f"Seed {seed_ref!r} in {seed_store} is not an object") + return seed + + +def combined_output(result: vf.ProgramResult) -> str: + return ((result.stdout or "") + (result.stderr or "")).strip()[-2000:] + + +__all__ = ["MiniBrowseAppTask", "MiniBrowseAppsConfig", "MiniBrowseAppsTaskset"] diff --git a/environments/mini_browse_apps_platform_v1/pyproject.toml b/environments/mini_browse_apps_platform_v1/pyproject.toml new file mode 100644 index 000000000..fb8bdcf5f --- /dev/null +++ b/environments/mini_browse_apps_platform_v1/pyproject.toml @@ -0,0 +1,13 @@ +[project] +name = "mini-browse-apps-platform-v1" +version = "0.1.0" +description = "mini-browse-apps-platform-v1 — sandboxed local-app Mini Browse browser tasks (agentic; vision agent; LLM-judge reward)." 
+requires-python = ">=3.10" +dependencies = ["openai", "httpx"] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["mini_browse_apps_platform_v1"] diff --git a/pyproject.toml b/pyproject.toml index 9971dbe8a..ae71425b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,6 +86,7 @@ examples = [ "wordle-v1", "terminal-bench-2-v1", "alphabet-sort-v1", "r2e-gym-v1", "scaleswe-v1", "swelego-v1", "scratchpad-v1", "general-agent-v1", "swebench-verified-v1", + "mini-browse-apps-platform-v1", ] [project.optional-dependencies] @@ -161,6 +162,7 @@ alphabet-sort-v1 = { path = "environments/alphabet_sort_v1", editable = true } scratchpad-v1 = { path = "environments/scratchpad_v1", editable = true } general-agent-v1 = { path = "environments/general_agent_v1", editable = true } swebench-verified-v1 = { path = "environments/swebench_verified_v1", editable = true } +mini-browse-apps-platform-v1 = { path = "environments/mini_browse_apps_platform_v1", editable = true } [tool.uv.exclude-newer-package] # PrimeIntellect-published on PyPI (trusted publisher) diff --git a/uv.lock b/uv.lock index 06e62ff82..f5091d603 100644 --- a/uv.lock +++ b/uv.lock @@ -2400,6 +2400,17 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, ] +[[package]] +name = "mini-browse-apps-platform-v1" +version = "0.1.0" +source = { editable = "environments/mini_browse_apps_platform_v1" } +dependencies = [ + { name = "openai" }, +] + +[package.metadata] +requires-dist = [{ name = "openai" }] + [[package]] name = "mistral-common" version = "1.11.0" @@ -5836,6 +5847,7 @@ examples = [ { name = "glossary-v1" }, { name = "gsm8k-v1" }, { name = "math-env-v1" }, + { name = 
"mini-browse-apps-platform-v1" }, { name = "r2e-gym-v1" }, { name = "reverse-text-v1" }, { name = "scaleswe-v1" }, @@ -5935,6 +5947,7 @@ examples = [ { name = "glossary-v1", editable = "environments/glossary_v1" }, { name = "gsm8k-v1", editable = "environments/gsm8k_v1" }, { name = "math-env-v1", editable = "environments/math_env_v1" }, + { name = "mini-browse-apps-platform-v1", editable = "environments/mini_browse_apps_platform_v1" }, { name = "r2e-gym-v1", editable = "environments/r2e_gym_v1" }, { name = "reverse-text-v1", editable = "environments/reverse_text_v1" }, { name = "scaleswe-v1", editable = "environments/scaleswe_v1" }, From 7d071005a6b08a87dd9f2895e5a45407e01a995c Mon Sep 17 00:00:00 2001 From: Mika Senghaas Date: Fri, 19 Jun 2026 04:57:11 +0000 Subject: [PATCH 2/7] fix(v1): make the browse-apps judge tolerant of unterminated JSON The judge model can return JSON with the root object left unclosed (it stops after the final array); the old fallback grabbed a nested `}` and then crashed the rollout on an unguarded json.loads. Strip code fences, balance open brackets, and fall back to a default verdict instead of raising. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../mini_browse_apps_platform_v1/judge.py | 59 ++++++++++++++----- 1 file changed, 44 insertions(+), 15 deletions(-) diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py index b1228336e..4a1620546 100644 --- a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py +++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py @@ -108,21 +108,50 @@ def score_from_judge_payload(payload: dict[str, Any]) -> float: def parse_json_object(content: str) -> dict[str, Any]: - try: - parsed = json.loads(content) - except json.JSONDecodeError: - start = content.find("{") - end = content.rfind("}") - if start < 0 or end <= start: - return {"score": 0.0, "explanation": content[:500], "verdict": "no"} - parsed = json.loads(content[start : end + 1]) - if not isinstance(parsed, dict): - return { - "score": 0.0, - "explanation": "judge returned non-object", - "verdict": "no", - } - return parsed + # Judge models sometimes return JSON wrapped in a code fence or left unterminated (the + # reasoning model stops after the last array without closing the root object). Try the raw + # text, then the brace span, then a bracket-balanced repair of that span. 
+ fenced = content.strip() + if fenced.startswith("```"): + fenced = fenced.split("```", 2)[1].removeprefix("json").strip() + start = fenced.find("{") + span = fenced[start:] if start >= 0 else "" + for candidate in (content, fenced, span, _balance_json(span)): + if not candidate.strip(): + continue + try: + parsed = json.loads(candidate) + except json.JSONDecodeError: + continue + if isinstance(parsed, dict): + return parsed + return {"score": 0.0, "explanation": content[:500], "verdict": "no"} + + +def _balance_json(text: str) -> str: + """Close an unterminated JSON object/array: append the missing `}`/`]` for any brackets left + open outside of strings, after dropping a dangling trailing comma.""" + stack: list[str] = [] + in_string = escaped = False + for ch in text: + if in_string: + if escaped: + escaped = False + elif ch == "\\": + escaped = True + elif ch == '"': + in_string = False + continue + if ch == '"': + in_string = True + elif ch in "{[": + stack.append("}" if ch == "{" else "]") + elif ch in "}]" and stack: + stack.pop() + trimmed = text.rstrip() + if trimmed.endswith(","): + trimmed = trimmed[:-1] + return trimmed + "".join(reversed(stack)) def prime_team_id() -> str | None: From e020bbec77638364db14f329e1190f507c6946c0 Mon Sep 17 00:00:00 2001 From: Mika Senghaas Date: Fri, 19 Jun 2026 05:02:18 +0000 Subject: [PATCH 3/7] refactor(v1): simplify prime_team_id in the browse-apps judge Walrus for the env lookup, .exists() instead of the nested try/except, and drop the defensive isinstance/str cast (the prime config is always a dict). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../mini_browse_apps_platform_v1/judge.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py index 4a1620546..2860b55e7 100644 --- a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py +++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py @@ -156,19 +156,11 @@ def _balance_json(text: str) -> str: def prime_team_id() -> str | None: for name in ("PRIME_TEAM_ID", "PI_TEAM_ID", "X_PRIME_TEAM_ID"): - value = os.environ.get(name) - if value: + if value := os.environ.get(name): return value - config_path = Path.home() / ".prime" / "config.json" - try: - if config_path.exists(): - config = json.loads(config_path.read_text()) - if isinstance(config, dict): - value = config.get("team_id") - if value: - return str(value) - except (json.JSONDecodeError, OSError): - return None + config = Path.home() / ".prime" / "config.json" + if config.exists(): + return json.loads(config.read_text()).get("team_id") return None From b2ae4665f81480177bb3ac364d684570427a616c Mon Sep 17 00:00:00 2001 From: Mika Senghaas Date: Fri, 19 Jun 2026 05:06:14 +0000 Subject: [PATCH 4/7] refactor(v1): slim browse-apps config + structured-output judge - Judge config is now a JudgeConfig subconfig (model + verifiers BaseClientConfig), so the endpoint, team header, and API key auto-resolve to Prime inference; drop the bespoke prime_team_id / prime_default_headers and the flat judge_* fields. - Judge uses a structured-output (json_schema) model (default openai/gpt-4.1-mini) so the verdict is always valid JSON. 
- Drop config knobs that are framework-internal or belong on the task: per-task timeouts, sandbox_image (now set directly on the task), task shuffling, task_indices / task_profile, seed_store_path (+ the sqlite seed-store path), and the configurable cache dir (now a constant). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../mini_browse_apps_platform_v1/README.md | 23 ++- .../mini_browse_apps_platform_v1/judge.py | 116 +++++++------- .../mini_browse_apps_platform_v1/taskset.py | 147 ++---------------- 3 files changed, 89 insertions(+), 197 deletions(-) diff --git a/environments/mini_browse_apps_platform_v1/README.md b/environments/mini_browse_apps_platform_v1/README.md index b769f268f..7e1b441a1 100644 --- a/environments/mini_browse_apps_platform_v1/README.md +++ b/environments/mini_browse_apps_platform_v1/README.md @@ -21,15 +21,12 @@ time from a **private GitHub repo** (pinned to a commit), caches it under | `agent_token_env` | `MINI_BROWSE_GITHUB_TOKEN` | Env var holding a GitHub token with read access to `agent_repo`. | | `agent_path` | _(unset)_ | Local dir containing `/` — skips the fetch (development). | -So set `export MINI_BROWSE_GITHUB_TOKEN=` and `--harness.agent-ref `, or point -`--harness.agent-path` at a local checkout. - ## Tasks (pulled dynamically) Tasks are **pulled from the Prime hub and cached locally** — nothing is bundled in this package. `load_tasks` pulls the dataset from `prime/mini-browse-apps-platform-v1` (private; via `prime env pull`) into `~/.cache/verifiers/mini-browse-apps//`. Override with `--taskset.dataset_path -` or repoint `--taskset.hub_env_id` / `--taskset.hub_version`. +`, or repoint `--taskset.hub_env_id` / `--taskset.hub_version`. ## Run @@ -48,7 +45,19 @@ uv run eval mini-browse-apps-platform-v1 \ ## Reward & metrics -`answer_key` (weight 1.0) judges the submitted result against the gold answer key (`judge_model`, -default `google/gemini-3.1-pro-preview` via pinference); reward 1.0 == judge verdict "yes". 
Metrics: -`result_present`, `submitted_result_present`, `agent_error`, `transcript_image_count`, +`answer_key` (weight 1.0) judges the submitted result against the gold answer key. The judge uses a +structured-output (`json_schema`) model — default `openai/gpt-4.1-mini` on Prime inference +(auto-resolved); override with `--taskset.judge.model` / `--taskset.judge.client.*`. Reward 1.0 == +all expected fields correct (`verdict: "yes"`); partial credit is `correct_fields / total_fields`. +Metrics: `result_present`, `submitted_result_present`, `agent_error`, `transcript_image_count`, `message_count`. + +## Config (`--taskset.*`) + +| Field | Default | Meaning | +| --- | --- | --- | +| `dataset_path` | `null` | Local dataset override (skips the hub pull). | +| `hub_env_id` | `prime/mini-browse-apps-platform-v1` | Hub env the dataset is pulled from. | +| `hub_version` | `latest` | Hub env version to pull. | +| `judge.model` | `openai/gpt-4.1-mini` | Structured-output judge model. | +| `judge.client` | Prime inference | OpenAI-compatible endpoint for the judge (auto-resolved). 
| diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py index 2860b55e7..1c749c0c6 100644 --- a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py +++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py @@ -1,20 +1,51 @@ -"""LLM judge support for Mini Browse local-app tasks.""" +"""LLM judge for the browse-apps local-app tasks (structured-output verdict).""" from __future__ import annotations import json import os -from pathlib import Path from typing import Any from openai import AsyncOpenAI +from pydantic import Field +from pydantic_config import BaseConfig + +from verifiers.utils.client_utils import load_prime_config +from verifiers.v1.clients.config import BaseClientConfig JUDGE_TEMPERATURE = 0 +# Strict structured output: the judge must return exactly these fields, always valid JSON. +JUDGE_RESPONSE_FORMAT = { + "type": "json_schema", + "json_schema": { + "name": "judge_verdict", + "strict": True, + "schema": { + "type": "object", + "additionalProperties": False, + "properties": { + "correct_fields": {"type": "integer"}, + "total_fields": {"type": "integer"}, + "score": {"type": "number"}, + "verdict": {"type": "string", "enum": ["yes", "partial", "no"]}, + "explanation": {"type": "string"}, + }, + "required": [ + "correct_fields", + "total_fields", + "score", + "verdict", + "explanation", + ], + }, + }, +} + JUDGE_PROMPT = """You evaluate a browser automation agent's submitted result for a deterministic local flight-search task. Use the evaluation contract and gold answer as the source of truth. Score only -expected_fields where score is true or absent. Ignore verifier metadata, ids, +expected fields where score is true or absent. Ignore verifier metadata, ids, internal keys, hidden seed fields, and non-scoreable diagnostics. 
Treat equivalent formatting as correct when the same fact is clearly attached to @@ -27,33 +58,26 @@ with critical=true are hard gates: if any scoreable critical field is missing or wrong, the verdict must be "no". -Return exactly one JSON object, no prose and no code fence: -{ - "correct_fields": , - "total_fields": , - "score": , - "verdict": "yes" | "partial" | "no", - "explanation": "", - "field_verdicts": [ - { - "field_path": "", - "verdict": "exact_match" | "semantic_match" | "wrong" | "missing", - "reason": "" - } - ] -} +Report `correct_fields` / `total_fields` over the scoreable expected fields, a +`score` from 0 to 1, a `verdict`, and a one-sentence `explanation`. """ +class JudgeConfig(BaseConfig): + """The judge model and the OpenAI-compatible endpoint it runs on (Prime auto-resolved).""" + + model: str = "openai/gpt-4.1-mini" + """A model that supports strict structured output (`json_schema`).""" + client: BaseClientConfig = Field(default_factory=BaseClientConfig) + + async def judge_answer_key( *, task_instruction: str, submitted_result: Any, answer_key: dict[str, Any], output_schema: dict[str, Any], - model: str, - base_url: str | None, - api_key_env: str, + config: JudgeConfig, ) -> dict[str, Any]: context = { "task_instruction": task_instruction, @@ -62,8 +86,8 @@ async def judge_answer_key( "gold_answer": answer_key.get("gold_answer") or answer_key, "output_schema": output_schema, } - response = await judge_client(base_url, api_key_env).chat.completions.create( - model=model, + response = await judge_client(config.client).chat.completions.create( + model=config.model, messages=[ {"role": "system", "content": JUDGE_PROMPT}, { @@ -72,23 +96,23 @@ async def judge_answer_key( }, ], temperature=JUDGE_TEMPERATURE, - response_format={"type": "json_object"}, + response_format=JUDGE_RESPONSE_FORMAT, ) content = response.choices[0].message.content or "{}" return parse_json_object(content) -def judge_client(base_url: str | None, api_key_env: str) -> 
AsyncOpenAI: - api_key = os.environ.get(api_key_env) - if not api_key: - raise ValueError(f"Missing judge API key env var {api_key_env}") - kwargs: dict[str, Any] = {"api_key": api_key} - if base_url: - kwargs["base_url"] = base_url - default_headers = prime_default_headers(base_url) - if default_headers: - kwargs["default_headers"] = default_headers - return AsyncOpenAI(**kwargs) +def judge_client(config: BaseClientConfig) -> AsyncOpenAI: + # base_url + team header are resolved by BaseClientConfig; the key falls back to the Prime + # CLI config for pinference (mirrors verifiers' resolve_client). + api_key = os.environ.get(config.api_key_var) + if not api_key and config.api_key_var == "PRIME_API_KEY": + api_key = load_prime_config().get("api_key") + return AsyncOpenAI( + base_url=config.base_url, + api_key=api_key or "EMPTY", + default_headers=config.headers or None, + ) def score_from_judge_payload(payload: dict[str, Any]) -> float: @@ -108,9 +132,8 @@ def score_from_judge_payload(payload: dict[str, Any]) -> float: def parse_json_object(content: str) -> dict[str, Any]: - # Judge models sometimes return JSON wrapped in a code fence or left unterminated (the - # reasoning model stops after the last array without closing the root object). Try the raw - # text, then the brace span, then a bracket-balanced repair of that span. + # Strict structured output is always valid JSON; this stays tolerant (code fences, an + # unterminated object) as a backstop for an overridden/non-conforming judge model. 
fenced = content.strip() if fenced.startswith("```"): fenced = fenced.split("```", 2)[1].removeprefix("json").strip() @@ -152,20 +175,3 @@ def _balance_json(text: str) -> str: if trimmed.endswith(","): trimmed = trimmed[:-1] return trimmed + "".join(reversed(stack)) - - -def prime_team_id() -> str | None: - for name in ("PRIME_TEAM_ID", "PI_TEAM_ID", "X_PRIME_TEAM_ID"): - if value := os.environ.get(name): - return value - config = Path.home() / ".prime" / "config.json" - if config.exists(): - return json.loads(config.read_text()).get("team_id") - return None - - -def prime_default_headers(base_url: str | None) -> dict[str, str]: - if not base_url or "pinference" not in base_url.lower(): - return {} - team_id = prime_team_id() - return {"X-Prime-Team-ID": team_id} if team_id else {} diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/taskset.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/taskset.py index 84af873c4..abdfc95f7 100644 --- a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/taskset.py +++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/taskset.py @@ -1,17 +1,14 @@ -"""mini-browse-apps-platform-v1: local-app Mini Browse tasks pulled from the Prime hub.""" +"""mini-browse-apps-platform-v1: local-app browser tasks pulled from the Prime hub.""" from __future__ import annotations import gzip -import hashlib import json import os import shlex import shutil -import sqlite3 import subprocess import tempfile -import zlib from pathlib import Path from typing import Any @@ -29,7 +26,7 @@ MiniBrowseTaskPayload, ) from .harness.diagnostics import read_jsonl_tail -from .judge import judge_answer_key, score_from_judge_payload +from .judge import JudgeConfig, judge_answer_key, score_from_judge_payload DEFAULT_SANDBOX_IMAGE = ( "team-cmlr3u2er002zhr01tj8f48ts/" @@ -37,6 +34,7 @@ ) DEFAULT_HUB_ENV_ID = "prime/mini-browse-apps-platform-v1" DEFAULT_DATASET_FILENAME = 
"google_flights_10.jsonl.gz" +DATASET_CACHE_DIR = Path.home() / ".cache" / "verifiers" / "mini-browse-apps" APP_PORT = 5173 CDP_PORT = 18080 @@ -51,14 +49,9 @@ CDP_SERVER = "/opt/mini-browse-services/local_cdp_service.py" APP_ROOT = "/opt/mini-browse-app/dist" -SETUP_TIMEOUT_SECONDS = 600 -FINALIZE_TIMEOUT_SECONDS = 120 -SCORING_TIMEOUT_SECONDS = 180 -DEFAULT_TIMEOUT_SECONDS = 3600.0 DEFAULT_SANDBOX_CPU = 2 DEFAULT_SANDBOX_MEMORY_GB = 4 DEFAULT_SANDBOX_DISK_GB = 10 -TASKSET_SHUFFLE_SEED = "google_flights_kernel_v1_dense_hard_no_bag_1188_order_v1" class MiniBrowseAppTask(vf.Task): @@ -79,18 +72,7 @@ class MiniBrowseAppsConfig(vf.TasksetConfig): """Prime hub environment the dataset is pulled from when no `dataset_path` is given.""" hub_version: str = "latest" dataset_filename: str = DEFAULT_DATASET_FILENAME - cache_dir: str | None = None - """Where the pulled dataset is cached (default: ~/.cache/verifiers/mini-browse-apps).""" - seed_store_path: str | None = None - task_indices: list[int] | None = None - task_profile: str = "default" - shuffle_tasks: bool = True - taskset_shuffle_seed: str = TASKSET_SHUFFLE_SEED - sandbox_image: str = DEFAULT_SANDBOX_IMAGE - timeout_seconds: float = DEFAULT_TIMEOUT_SECONDS - judge_model: str = "google/gemini-3.1-pro-preview" - judge_base_url: str | None = "https://api.pinference.ai/api/v1" - judge_api_key_env: str = "PRIME_API_KEY" + judge: JudgeConfig = Field(default_factory=JudgeConfig) class MiniBrowseAppsTaskset(vf.Taskset[MiniBrowseAppTask, MiniBrowseAppsConfig]): @@ -104,11 +86,6 @@ def __init__(self, config: MiniBrowseAppsConfig) -> None: def load_tasks(self) -> list[MiniBrowseAppTask]: rows = self.load_rows() - rows = self.filter_rows(rows) - if self.config.shuffle_tasks: - rows = self.stable_shuffle_rows(rows) - if self.config.task_indices is not None: - rows = self.select_task_indices(rows) if not rows: raise ValueError("No Mini Browse app tasks were loaded") return [self.normalize_row(i, row) for i, row in enumerate(rows)] 
@@ -173,9 +150,7 @@ async def answer_key(self, task: MiniBrowseAppTask, trace: vf.Trace) -> float: submitted_result=submitted, answer_key=task.answer_key, output_schema=task.output_schema, - model=self.config.judge_model, - base_url=self.config.judge_base_url, - api_key_env=self.config.judge_api_key_env, + config=self.config.judge, ) trace.info["mini_browse_judge"] = judge_payload return score_from_judge_payload(judge_payload) @@ -217,12 +192,7 @@ def resolved_dataset_path(self) -> Path: return self.ensure_cached_dataset() def ensure_cached_dataset(self) -> Path: - cache_root = ( - Path(self.config.cache_dir).expanduser() - if self.config.cache_dir - else Path.home() / ".cache" / "verifiers" / "mini-browse-apps" - ) - cached = cache_root / self.config.hub_version / self.config.dataset_filename + cached = DATASET_CACHE_DIR / self.config.hub_version / self.config.dataset_filename if not cached.exists(): cached.parent.mkdir(parents=True, exist_ok=True) self.pull_dataset_into(cached) @@ -254,33 +224,6 @@ def pull_dataset_into(self, dest: Path) -> None: shutil.copyfile(matches[0], staging) os.replace(staging, dest) - def filter_rows(self, rows: list[dict[str, Any]]) -> list[dict[str, Any]]: - if self.config.task_profile == "default": - return rows - return [row for row in rows if source_profile(row) == self.config.task_profile] - - def stable_shuffle_rows(self, rows: list[dict[str, Any]]) -> list[dict[str, Any]]: - seed = self.config.taskset_shuffle_seed - - def key(row: dict[str, Any]) -> tuple[str, str]: - info = decode_info(row.get("info") or {}) - task_id = str(row.get("task_id") or info.get("task_name") or "") - digest = hashlib.sha256(f"{seed}:{task_id}".encode("utf-8")).hexdigest() - return digest, task_id - - return sorted(rows, key=key) - - def select_task_indices(self, rows: list[dict[str, Any]]) -> list[dict[str, Any]]: - selected = [] - for index in self.config.task_indices or []: - if index < 0 or index >= len(rows): - raise ValueError( - f"task_indices 
contains out-of-range index {index}; " - f"filtered taskset has {len(rows)} rows" - ) - selected.append(rows[index]) - return selected - def normalize_row(self, index: int, row: dict[str, Any]) -> MiniBrowseAppTask: info = decode_info(row.get("info") or {}) raw_instruction = info.get("instruction") or row.get("question") @@ -305,14 +248,8 @@ def normalize_row(self, index: int, row: dict[str, Any]) -> MiniBrowseAppTask: idx=index, name=task_name, prompt=raw_instruction.strip(), - image=self.config.sandbox_image, + image=DEFAULT_SANDBOX_IMAGE, workdir=WORKDIR, - timeout=vf.TaskTimeout( - setup=SETUP_TIMEOUT_SECONDS, - harness=self.config.timeout_seconds, - finalize=FINALIZE_TIMEOUT_SECONDS, - scoring=SCORING_TIMEOUT_SECONDS, - ), resources=vf.TaskResources( cpu=DEFAULT_SANDBOX_CPU, memory=DEFAULT_SANDBOX_MEMORY_GB, @@ -325,32 +262,19 @@ def normalize_row(self, index: int, row: dict[str, Any]) -> MiniBrowseAppTask: "task_name": info.get("task_name"), "task_id": row.get("task_id") or answer_key.get("task_id"), "answer_kind": answer_key.get("answer_kind"), - "source_profile": source_profile(row), "source_dataset": info.get("source_dataset"), }, ) def app_seed_for_task(self, task: MiniBrowseAppTask) -> dict[str, Any]: if not task.app_seed_ref: - raise ValueError(f"Task {task.name} has neither app_seed nor app_seed_ref") - inline_seed = self._inline_app_seeds.get(task.app_seed_ref) - if inline_seed is not None: - return inline_seed - seed_store = self.resolved_seed_store_path() - if seed_store is None: + raise ValueError(f"Task {task.name} has no app_seed") + seed = self._inline_app_seeds.get(task.app_seed_ref) + if seed is None: raise ValueError( - f"Task {task.name} needs seed {task.app_seed_ref}, but no seed store " - "was configured" + f"Task {task.name} references seed {task.app_seed_ref} not present inline" ) - return load_seed(seed_store, task.app_seed_ref) - - def resolved_seed_store_path(self) -> Path | None: - if self.config.seed_store_path: - return 
Path(self.config.seed_store_path).expanduser() - if not self.config.dataset_path: - return None - dataset_path = Path(self.config.dataset_path).expanduser() - return seed_store_for_artifact_path(dataset_path) + return seed async def ensure_runtime_dirs(runtime: vf.Runtime) -> None: @@ -479,53 +403,6 @@ def parse_answer(answer: Any) -> Any: return answer -def source_profile(row: dict[str, Any]) -> str | None: - info = decode_info(row.get("info") or {}) - factory = info.get("factory") or {} - if not isinstance(factory, dict): - return None - profile = factory.get("source_profile") - return str(profile) if profile else None - - -def seed_store_for_artifact_path(path: Path) -> Path: - name = path.name - if name.endswith(".tasks.jsonl.gz"): - return path.with_name(name.removesuffix(".tasks.jsonl.gz") + ".seeds.sqlite") - if name.endswith(".jsonl.gz"): - return path.with_name(name.removesuffix(".jsonl.gz") + ".seeds.sqlite") - if path.suffix: - return path.with_suffix(".seeds.sqlite") - return path.with_name(name + ".seeds.sqlite") - - -def load_seed(seed_store: Path, seed_ref: str) -> dict[str, Any]: - if not seed_store.exists(): - raise FileNotFoundError(f"Mini Browse app seed store not found: {seed_store}") - with sqlite3.connect(seed_store) as db: - columns = { - row[1] for row in db.execute("PRAGMA table_info(app_seeds)").fetchall() - } - if "app_seed_zlib" in columns: - row = db.execute( - "SELECT app_seed_zlib FROM app_seeds WHERE seed_id = ?", - (seed_ref,), - ).fetchone() - seed_json = None if row is None else zlib.decompress(row[0]).decode("utf-8") - else: - row = db.execute( - "SELECT app_seed_json FROM app_seeds WHERE seed_id = ?", - (seed_ref,), - ).fetchone() - seed_json = None if row is None else row[0] - if seed_json is None: - raise KeyError(f"Seed {seed_ref!r} not found in {seed_store}") - seed = json.loads(seed_json) - if not isinstance(seed, dict): - raise ValueError(f"Seed {seed_ref!r} in {seed_store} is not an object") - return seed - - def 
combined_output(result: vf.ProgramResult) -> str: return ((result.stdout or "") + (result.stderr or "")).strip()[-2000:] From ec17662f368a3ea1de83297a317017d84573a236 Mon Sep 17 00:00:00 2001 From: Mika Senghaas Date: Fri, 19 Jun 2026 05:10:52 +0000 Subject: [PATCH 5/7] refactor(v1): browse-agent subconfig + file-based model client - Group the harness agent-source fields under an `agent` subconfig (agent.repo / agent.ref / agent.path / ...), dropping the agent_ prefix. - Pass the model endpoint/key/model to the in-sandbox agent via a JSON file (--model-client) instead of OPENAI_* env vars; program.py builds the client and passes it to the agent explicitly, keeping the secret out of the process env. - Drop the redundant harness path config fields (use the shared contract paths directly) and inline taskset constants that only fed config defaults. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../mini_browse_apps_platform_v1/README.md | 10 +- .../harness/__init__.py | 118 +++++++++--------- .../harness/program.py | 10 +- .../mini_browse_apps_platform_v1/taskset.py | 6 +- 4 files changed, 78 insertions(+), 66 deletions(-) diff --git a/environments/mini_browse_apps_platform_v1/README.md b/environments/mini_browse_apps_platform_v1/README.md index 7e1b441a1..e5fb92280 100644 --- a/environments/mini_browse_apps_platform_v1/README.md +++ b/environments/mini_browse_apps_platform_v1/README.md @@ -16,10 +16,10 @@ time from a **private GitHub repo** (pinned to a commit), caches it under | Field | Default | Meaning | | --- | --- | --- | -| `agent_repo` | `PrimeIntellect-ai/plex-mini-browse` | Private `owner/name` to fetch the agent from. | -| `agent_ref` | _(unset)_ | **Pinned commit sha to fetch (required unless `agent_path` is set).** | -| `agent_token_env` | `MINI_BROWSE_GITHUB_TOKEN` | Env var holding a GitHub token with read access to `agent_repo`. | -| `agent_path` | _(unset)_ | Local dir containing `/` — skips the fetch (development). 
| +| `agent.repo` | `PrimeIntellect-ai/plex-mini-browse` | Private `owner/name` to fetch the agent from. | +| `agent.ref` | _(unset)_ | **Pinned commit sha to fetch (required unless `agent.path` is set).** | +| `agent.token_env` | `MINI_BROWSE_GITHUB_TOKEN` | Env var holding a GitHub token with read access to `agent.repo`. | +| `agent.path` | _(unset)_ | Local dir containing the agent package — skips the fetch (development). | ## Tasks (pulled dynamically) @@ -38,7 +38,7 @@ export MINI_BROWSE_GITHUB_TOKEN= uv run eval mini-browse-apps-platform-v1 \ --harness.id mini-browse-apps-platform-v1 \ --harness.runtime.type prime \ - --harness.agent-ref \ + --harness.agent.ref \ -m \ -n 1 -r 1 -c 1 ``` diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py index 8ded780df..1dc043c1d 100644 --- a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py +++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py @@ -3,12 +3,13 @@ The browser agent is proprietary and is NOT vendored in this repo. It is fetched at run time from a private, auth-gated GitHub repo (pinned to a commit), cached locally, then tarred and staged into the sandbox, where `program.py` (a uv script) imports and runs it. For local -development, point `agent_path` at a checkout instead of fetching. +development, point `agent.path` at a checkout instead of fetching. 
""" from __future__ import annotations import io +import json import os import shlex import shutil @@ -18,6 +19,8 @@ from typing import Literal import httpx +from pydantic import Field +from pydantic_config import BaseConfig from verifiers.v1.clients import RolloutContext from verifiers.v1.errors import HarnessError from verifiers.v1.harness import Harness, HarnessConfig @@ -39,29 +42,35 @@ AGENT_RUNTIME = "/opt/browse-agent-runtime" AGENT_TARBALL = "/tmp/vf-browse-agent-runtime.tgz" +AGENT_CACHE_DIR = Path.home() / ".cache" / "verifiers" / "browse-agent" +MODEL_CLIENT_PATH = "/tmp/vf-browse-model-client.json" CoordinateMode = Literal["relative_1000", "absolute", "auto"] +class AgentConfig(BaseConfig): + """The proprietary browser agent — fetched at run time from a private repo, not vendored.""" + + repo: str = "PrimeIntellect-ai/plex-mini-browse" + """Private GitHub repo (owner/name) the agent is fetched from.""" + ref: str = "" + """Pinned commit sha to fetch (required unless `path` is set).""" + package: str = "mini_browse" + """Importable package dir within the repo (and on the sandbox PYTHONPATH) to stage.""" + token_env: str = "MINI_BROWSE_GITHUB_TOKEN" + """Env var holding a GitHub token with read access to `repo`.""" + path: str | None = None + """Local dir containing `/` — when set, skips the GitHub fetch (development).""" + cache_dir: str | None = None + """Where fetched revisions are cached (default: ~/.cache/verifiers/browse-agent).""" + + class MiniBrowseHarnessConfig(HarnessConfig): """Reusable browser harness; fetches its proprietary agent from a private repo.""" id: str = "mini-browse-apps-platform-v1" runtime: RuntimeConfig = DockerConfig(image="python:3.12-slim-bookworm") - # --- proprietary agent source (not vendored; fetched at run time) --- - agent_repo: str = "PrimeIntellect-ai/plex-mini-browse" - """Private GitHub repo (owner/name) the agent is fetched from.""" - agent_ref: str = "" - """Pinned commit sha to fetch (required unless 
`agent_path` is set).""" - agent_package: str = "mini_browse" - """Importable package dir within the repo (and on the sandbox PYTHONPATH) to stage.""" - agent_token_env: str = "MINI_BROWSE_GITHUB_TOKEN" - """Env var holding a GitHub token with read access to `agent_repo`.""" - agent_path: str | None = None - """Local dir containing `/` — when set, skips the GitHub fetch (dev).""" - agent_cache_dir: str | None = None - """Where fetched agent revisions are cached (default: ~/.cache/verifiers/browse-agent).""" - # --- agent behavior --- + agent: AgentConfig = Field(default_factory=AgentConfig) max_steps: int = 75 coordinate_mode: CoordinateMode = "relative_1000" keep_last_images: int = 3 @@ -71,12 +80,6 @@ class MiniBrowseHarnessConfig(HarnessConfig): browser_start_jitter_seconds: float = 0.0 browser_start_max_in_flight: int = 0 record_frames: bool = False - task_payload_path: str = TASK_PAYLOAD_PATH - result_path: str = RESULT_PATH - transcript_path: str = TRANSCRIPT_PATH - metrics_path: str = METRICS_PATH - progress_path: str = PROGRESS_PATH - workspace_root: str = WORKSPACE_ROOT class MiniBrowseHarness(Harness[MiniBrowseHarnessConfig]): @@ -108,11 +111,14 @@ async def launch( raise ValueError("Browser harness requires a string task prompt") await self._stage_agent(runtime) + await runtime.write( + MODEL_CLIENT_PATH, + json.dumps( + {"base_url": endpoint, "api_key": secret, "model": ctx.model} + ).encode("utf-8"), + ) env = { **self.config.env, - "OPENAI_BASE_URL": endpoint, - "OPENAI_API_KEY": secret, - "OPENAI_MODEL": ctx.model, "PYTHONPATH": self._pythonpath(), "MINI_BROWSE_COORDINATE_MODE": self.config.coordinate_mode, "MINI_BROWSE_KEEP_LAST_IMAGES": str(self.config.keep_last_images), @@ -131,26 +137,28 @@ async def launch( "MINI_BROWSE_BROWSER_START_MAX_IN_FLIGHT": str( self.config.browser_start_max_in_flight ), - "MINI_BROWSE_PROGRESS_PATH": self.config.progress_path, + "MINI_BROWSE_PROGRESS_PATH": PROGRESS_PATH, } if self.config.record_frames: 
env["MINI_BROWSE_RECORD_FRAMES_DIR"] = "/logs/mini_browse/frames" args = [ "--task", - self.config.task_payload_path, + TASK_PAYLOAD_PATH, + "--model-client", + MODEL_CLIENT_PATH, "--result", - self.config.result_path, + RESULT_PATH, "--transcript", - self.config.transcript_path, + TRANSCRIPT_PATH, "--metrics", - self.config.metrics_path, + METRICS_PATH, "--progress", - self.config.progress_path, + PROGRESS_PATH, "--max-steps", str(self.config.max_steps), "--workspace-root", - self.config.workspace_root, + WORKSPACE_ROOT, ] return await runtime.run_uv_script(PROGRAM_SOURCE, args=args, env=env) @@ -168,43 +176,42 @@ async def _stage_agent(self, runtime: Runtime) -> None: ) def _agent_tarball(self) -> bytes: - package = self._ensure_agent() / self.config.agent_package + package = self._ensure_agent() / self.config.agent.package if not package.is_dir(): raise HarnessError( - f"agent package {self.config.agent_package!r} not found under {package.parent}" + f"agent package {self.config.agent.package!r} not found under {package.parent}" ) buffer = io.BytesIO() with tarfile.open(fileobj=buffer, mode="w:gz") as archive: - archive.add(package, arcname=self.config.agent_package) + archive.add(package, arcname=self.config.agent.package) return buffer.getvalue() def _ensure_agent(self) -> Path: - """Return a dir that contains `/` — a local checkout or the fetch cache.""" - if self.config.agent_path: - return Path(self.config.agent_path).expanduser() - if not self.config.agent_ref: + """Return a dir that contains `/` — a local checkout or the fetch cache.""" + agent = self.config.agent + if agent.path: + return Path(agent.path).expanduser() + if not agent.ref: raise HarnessError( - "set --harness.agent-ref to a pinned commit sha " - "(or --harness.agent-path to a local checkout for development)" + "set --harness.agent.ref to a pinned commit sha " + "(or --harness.agent.path to a local checkout for development)" ) cache_root = ( - Path(self.config.agent_cache_dir).expanduser() - 
if self.config.agent_cache_dir - else Path.home() / ".cache" / "verifiers" / "browse-agent" + Path(agent.cache_dir).expanduser() if agent.cache_dir else AGENT_CACHE_DIR ) - dest = cache_root / self.config.agent_ref - if not (dest / self.config.agent_package).exists(): + dest = cache_root / agent.ref + if not (dest / agent.package).exists(): self._download_agent(dest) return dest def _download_agent(self, dest: Path) -> None: - token = os.environ.get(self.config.agent_token_env) + agent = self.config.agent + token = os.environ.get(agent.token_env) if not token: raise HarnessError( - f"missing ${self.config.agent_token_env} to fetch the private agent repo " - f"{self.config.agent_repo!r}" + f"missing ${agent.token_env} to fetch the private agent repo {agent.repo!r}" ) - url = f"https://api.github.com/repos/{self.config.agent_repo}/tarball/{self.config.agent_ref}" + url = f"https://api.github.com/repos/{agent.repo}/tarball/{agent.ref}" headers = { "Authorization": f"Bearer {token}", "Accept": "application/vnd.github+json", @@ -219,8 +226,7 @@ def _download_agent(self, dest: Path) -> None: if resp.status_code != 200: resp.read() raise HarnessError( - f"fetching {self.config.agent_repo}@{self.config.agent_ref} failed: " - f"HTTP {resp.status_code}" + f"fetching {agent.repo}@{agent.ref} failed: HTTP {resp.status_code}" ) with open(archive, "wb") as handle: for chunk in resp.iter_bytes(): @@ -229,18 +235,17 @@ def _download_agent(self, dest: Path) -> None: extract.mkdir() with tarfile.open(archive) as tar: tar.extractall(extract, filter="data") - matches = sorted(extract.glob(f"*/{self.config.agent_package}")) + matches = sorted(extract.glob(f"*/{agent.package}")) if not matches: raise HarnessError( - f"{self.config.agent_package!r} not found in " - f"{self.config.agent_repo}@{self.config.agent_ref}" + f"{agent.package!r} not found in {agent.repo}@{agent.ref}" ) dest.mkdir(parents=True, exist_ok=True) - staging = dest / (self.config.agent_package + ".tmp") + staging = 
dest / (agent.package + ".tmp") if staging.exists(): shutil.rmtree(staging) shutil.copytree(matches[0], staging) - os.replace(staging, dest / self.config.agent_package) + os.replace(staging, dest / agent.package) def _pythonpath(self) -> str: existing = self.config.env.get("PYTHONPATH", "") @@ -255,6 +260,7 @@ def load_harness(config: MiniBrowseHarnessConfig) -> MiniBrowseHarness: __all__ = [ + "AgentConfig", "MiniBrowseHarness", "MiniBrowseHarnessConfig", "MiniBrowseTaskPayload", diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/program.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/program.py index c0d6ef1c2..09ee45d3d 100644 --- a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/program.py +++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/program.py @@ -396,7 +396,13 @@ async def _run(args: argparse.Namespace) -> int: ) if conversation is not None and not isinstance(conversation, list): raise ValueError("Mini Browse conversation payload must be a list") - model = os.environ.get("OPENAI_MODEL", "intercepted/model") + from openai import AsyncOpenAI + + model_client = _read_json(Path(args.model_client)) + model = model_client["model"] + client = AsyncOpenAI( + base_url=model_client["base_url"], api_key=model_client["api_key"] + ) coordinate_mode = os.environ.get("MINI_BROWSE_COORDINATE_MODE", "relative_1000") payload: dict[str, Any] @@ -415,6 +421,7 @@ async def _run(args: argparse.Namespace) -> int: url=start_url, output_schema=output_schema, model=model, + client=client, max_steps=int(args.max_steps), workspace_root=workspace_root, include_builtin_tools=_env_bool("MINI_BROWSE_INCLUDE_BUILTIN_TOOLS"), @@ -542,6 +549,7 @@ async def _run(args: argparse.Namespace) -> int: def _parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Run the Mini Browse harness.") parser.add_argument("--task", 
required=True) + parser.add_argument("--model-client", required=True) parser.add_argument("--result", required=True) parser.add_argument("--transcript", required=True) parser.add_argument("--metrics", required=True) diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/taskset.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/taskset.py index abdfc95f7..14255bdb7 100644 --- a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/taskset.py +++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/taskset.py @@ -32,8 +32,6 @@ "team-cmlr3u2er002zhr01tj8f48ts/" "mini-browse-apps:destination-autocomplete-tight-20260528-0027" ) -DEFAULT_HUB_ENV_ID = "prime/mini-browse-apps-platform-v1" -DEFAULT_DATASET_FILENAME = "google_flights_10.jsonl.gz" DATASET_CACHE_DIR = Path.home() / ".cache" / "verifiers" / "mini-browse-apps" APP_PORT = 5173 @@ -68,10 +66,10 @@ class MiniBrowseAppsConfig(vf.TasksetConfig): id: str = "mini-browse-apps-platform-v1" dataset_path: str | None = None """Explicit local dataset (JSONL/JSONL.GZ); when set, skips the hub pull.""" - hub_env_id: str = DEFAULT_HUB_ENV_ID + hub_env_id: str = "prime/mini-browse-apps-platform-v1" """Prime hub environment the dataset is pulled from when no `dataset_path` is given.""" hub_version: str = "latest" - dataset_filename: str = DEFAULT_DATASET_FILENAME + dataset_filename: str = "google_flights_10.jsonl.gz" judge: JudgeConfig = Field(default_factory=JudgeConfig) From bd8dc823873400fd5991936079160af78905955c Mon Sep 17 00:00:00 2001 From: Mika Senghaas Date: Fri, 19 Jun 2026 05:25:58 +0000 Subject: [PATCH 6/7] refactor(v1): drop harness max_steps; cap rollouts via --max-turns The framework enforces --max-turns at the interception layer for any harness, so the harness-specific max_steps knob was redundant. Remove it; program.py keeps its own default step backstop, and rollouts are capped with --max-turns. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../mini_browse_apps_platform_v1/harness/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py index 1dc043c1d..cb7073d7e 100644 --- a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py +++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py @@ -71,7 +71,6 @@ class MiniBrowseHarnessConfig(HarnessConfig): id: str = "mini-browse-apps-platform-v1" runtime: RuntimeConfig = DockerConfig(image="python:3.12-slim-bookworm") agent: AgentConfig = Field(default_factory=AgentConfig) - max_steps: int = 75 coordinate_mode: CoordinateMode = "relative_1000" keep_last_images: int = 3 image_compaction_at_tokens: int = 45_000 @@ -155,8 +154,6 @@ async def launch( METRICS_PATH, "--progress", PROGRESS_PATH, - "--max-steps", - str(self.config.max_steps), "--workspace-root", WORKSPACE_ROOT, ] From b7d15e609922c0cac484d848c9a1fccb46ae87eb Mon Sep 17 00:00:00 2001 From: Mika Senghaas Date: Fri, 19 Jun 2026 05:39:00 +0000 Subject: [PATCH 7/7] chore(v1): point harness at the private agent repo; scrub naming - Default the agent source to PrimeIntellect-ai/mini-browse pinned at 157b449 (the private browser-agent repo), so the env fetches it out of the box. - Rename the proxy env var to MINI_BROWSE_HTTP_PROXY and say "private" rather than "proprietary" in the harness/README. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- environments/mini_browse_apps_platform_v1/README.md | 8 ++++---- .../mini_browse_apps_platform_v1/harness/__init__.py | 12 ++++++------ .../mini_browse_apps_platform_v1/harness/program.py | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/environments/mini_browse_apps_platform_v1/README.md b/environments/mini_browse_apps_platform_v1/README.md index e5fb92280..fcd40bdb4 100644 --- a/environments/mini_browse_apps_platform_v1/README.md +++ b/environments/mini_browse_apps_platform_v1/README.md @@ -7,17 +7,17 @@ LLM judge scores the submission against a deterministic answer key. The model must be **multimodal** (the agent's only input is screenshots). -## Proprietary agent (fetched at run time) +## Private agent (fetched at run time) -The browser agent is **proprietary and not vendored in this repo**. The harness fetches it at run +The browser agent is **private and not vendored in this repo**. The harness fetches it at run time from a **private GitHub repo** (pinned to a commit), caches it under `~/.cache/verifiers/browse-agent/<ref>/`, then stages it into the sandbox. Configure via `--harness.*`: | Field | Default | Meaning | | --- | --- | --- | -| `agent.repo` | `PrimeIntellect-ai/plex-mini-browse` | Private `owner/name` to fetch the agent from. | -| `agent.ref` | _(unset)_ | **Pinned commit sha to fetch (required unless `agent.path` is set).** | +| `agent.repo` | `PrimeIntellect-ai/mini-browse` | Private `owner/name` to fetch the agent from. | +| `agent.ref` | `157b449` | Pinned commit sha to fetch (`agent.path` skips the fetch). | | `agent.token_env` | `MINI_BROWSE_GITHUB_TOKEN` | Env var holding a GitHub token with read access to `agent.repo`. | | `agent.path` | _(unset)_ | Local dir containing the agent package — skips the fetch (development).
| diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py index cb7073d7e..78f67a779 100644 --- a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py +++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py @@ -1,6 +1,6 @@ """Browser-app harness: stages a privately-distributed browser agent into the sandbox. -The browser agent is proprietary and is NOT vendored in this repo. It is fetched at run time +The browser agent is private and is NOT vendored in this repo. It is fetched at run time from a private, auth-gated GitHub repo (pinned to a commit), cached locally, then tarred and staged into the sandbox, where `program.py` (a uv script) imports and runs it. For local development, point `agent.path` at a checkout instead of fetching. @@ -49,12 +49,12 @@ class AgentConfig(BaseConfig): - """The proprietary browser agent — fetched at run time from a private repo, not vendored.""" + """The private browser agent — fetched at run time from a private repo, not vendored.""" - repo: str = "PrimeIntellect-ai/plex-mini-browse" + repo: str = "PrimeIntellect-ai/mini-browse" """Private GitHub repo (owner/name) the agent is fetched from.""" - ref: str = "" - """Pinned commit sha to fetch (required unless `path` is set).""" + ref: str = "157b449" + """Pinned commit sha to fetch (`path` skips the fetch for local development).""" package: str = "mini_browse" """Importable package dir within the repo (and on the sandbox PYTHONPATH) to stage.""" token_env: str = "MINI_BROWSE_GITHUB_TOKEN" @@ -66,7 +66,7 @@ class AgentConfig(BaseConfig): class MiniBrowseHarnessConfig(HarnessConfig): - """Reusable browser harness; fetches its proprietary agent from a private repo.""" + """Reusable browser harness; fetches its agent from a private GitHub repo.""" id: str = 
"mini-browse-apps-platform-v1" runtime: RuntimeConfig = DockerConfig(image="python:3.12-slim-bookworm") diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/program.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/program.py index 09ee45d3d..d4e4bae93 100644 --- a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/program.py +++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/program.py @@ -384,7 +384,7 @@ async def _run(args: argparse.Namespace) -> int: os.environ["MINI_BROWSE_BROWSER_API_URL"] = browser_api_url http_proxy = str(task_payload.get("http_proxy") or "").strip() if http_proxy: - os.environ["PERPLEXITY_TAILSCALE_HTTP_PROXY"] = http_proxy + os.environ["MINI_BROWSE_HTTP_PROXY"] = http_proxy source = str(task_payload.get("source") or "verifiers-mini-browse") task_preamble = str( task_payload.get("task_preamble")