diff --git a/environments/mini_browse_apps_platform_v1/README.md b/environments/mini_browse_apps_platform_v1/README.md new file mode 100644 index 000000000..fcd40bdb4 --- /dev/null +++ b/environments/mini_browse_apps_platform_v1/README.md @@ -0,0 +1,63 @@ +# mini-browse-apps-platform-v1 + +Sandboxed local-app **browser-agent** environment. Each task boots a local single-page web app +(SPA server + headless-Chromium CDP service) inside a per-task Docker image; a browser agent drives +it by **screenshots → vision model → click/type actions**, then submits a structured JSON result. An +LLM judge scores the submission against a deterministic answer key. + +The model must be **multimodal** (the agent's only input is screenshots). + +## Private agent (fetched at run time) + +The browser agent is **private and not vendored in this repo**. The harness fetches it at run +time from a **private GitHub repo** (pinned to a commit), caches it under +`~/.cache/verifiers/browse-agent/<ref>/`, then stages it into the sandbox. Configure via +`--harness.*`: + +| Field | Default | Meaning | +| --- | --- | --- | +| `agent.repo` | `PrimeIntellect-ai/mini-browse` | Private `owner/name` to fetch the agent from. | +| `agent.ref` | `157b449` | Pinned commit sha to fetch (`agent.path` skips the fetch). | +| `agent.token_env` | `MINI_BROWSE_GITHUB_TOKEN` | Env var holding a GitHub token with read access to `agent.repo`. | +| `agent.path` | _(unset)_ | Local dir containing the agent package — skips the fetch (development). | + +## Tasks (pulled dynamically) + +Tasks are **pulled from the Prime hub and cached locally** — nothing is bundled in this package. +`load_tasks` pulls the dataset from `prime/mini-browse-apps-platform-v1` (private; via `prime env +pull`) into `~/.cache/verifiers/mini-browse-apps/<version>/`. Override with `--taskset.dataset_path +<path>`, or repoint `--taskset.hub_env_id` / `--taskset.hub_version`.
+ +## Run + +The taskset and harness are co-packaged (resolved via `__all__`), so `--harness.id` matches the +taskset id. The task image is a Prime-registry image, so use the `prime` runtime: + +```bash +export MINI_BROWSE_GITHUB_TOKEN=<token> +uv run eval mini-browse-apps-platform-v1 \ + --harness.id mini-browse-apps-platform-v1 \ + --harness.runtime.type prime \ + --harness.agent.ref <sha> \ + -m <model> \ + -n 1 -r 1 -c 1 +``` + +## Reward & metrics + +`answer_key` (weight 1.0) judges the submitted result against the gold answer key. The judge uses a +structured-output (`json_schema`) model — default `openai/gpt-4.1-mini` on Prime inference +(auto-resolved); override with `--taskset.judge.model` / `--taskset.judge.client.*`. Reward 1.0 == +all expected fields correct (`verdict: "yes"`); partial credit is `correct_fields / total_fields`. +Metrics: `result_present`, `submitted_result_present`, `agent_error`, `transcript_image_count`, +`message_count`. + +## Config (`--taskset.*`) + +| Field | Default | Meaning | +| --- | --- | --- | +| `dataset_path` | `null` | Local dataset override (skips the hub pull). | +| `hub_env_id` | `prime/mini-browse-apps-platform-v1` | Hub env the dataset is pulled from. | +| `hub_version` | `latest` | Hub env version to pull. | +| `judge.model` | `openai/gpt-4.1-mini` | Structured-output judge model. | +| `judge.client` | Prime inference | OpenAI-compatible endpoint for the judge (auto-resolved). | diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/__init__.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/__init__.py new file mode 100644 index 000000000..9a4e10f35 --- /dev/null +++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/__init__.py @@ -0,0 +1,16 @@ +"""mini-browse-apps-platform-v1 — sandboxed local-app Mini Browse browser tasks (v1).
+ +Co-packages the taskset and its browser harness; both are resolved by id from this module's +`__all__` (`--taskset.id` / `--harness.id mini-browse-apps-platform-v1`). +""" + +from .harness import MiniBrowseHarness, MiniBrowseHarnessConfig +from .taskset import MiniBrowseAppsConfig, MiniBrowseAppsTaskset, MiniBrowseAppTask + +__all__ = [ + "MiniBrowseAppsTaskset", + "MiniBrowseAppsConfig", + "MiniBrowseAppTask", + "MiniBrowseHarness", + "MiniBrowseHarnessConfig", +] diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/.gitignore b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/.gitignore new file mode 100644 index 000000000..887009b99 --- /dev/null +++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/.gitignore @@ -0,0 +1,2 @@ +# The browser agent is proprietary and fetched at run time from a private repo — never commit it. +vendor/ diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py new file mode 100644 index 000000000..78f67a779 --- /dev/null +++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py @@ -0,0 +1,265 @@ +"""Browser-app harness: stages a privately-distributed browser agent into the sandbox. + +The browser agent is private and is NOT vendored in this repo. It is fetched at run time +from a private, auth-gated GitHub repo (pinned to a commit), cached locally, then tarred and +staged into the sandbox, where `program.py` (a uv script) imports and runs it. For local +development, point `agent.path` at a checkout instead of fetching. 
class AgentConfig(BaseConfig):
    """The private browser agent — fetched at run time from a private repo, not vendored."""

    repo: str = "PrimeIntellect-ai/mini-browse"
    """Private GitHub repo (owner/name) the agent is fetched from."""
    ref: str = "157b449"
    """Pinned commit sha to fetch (`path` skips the fetch for local development)."""
    package: str = "mini_browse"
    """Importable package dir within the repo (and on the sandbox PYTHONPATH) to stage."""
    token_env: str = "MINI_BROWSE_GITHUB_TOKEN"
    """Env var holding a GitHub token with read access to `repo`."""
    path: str | None = None
    """Local dir containing `<package>/` — when set, skips the GitHub fetch (development)."""
    cache_dir: str | None = None
    """Where fetched revisions are cached (default: ~/.cache/verifiers/browse-agent)."""


class MiniBrowseHarnessConfig(HarnessConfig):
    """Reusable browser harness; fetches its agent from a private GitHub repo."""

    id: str = "mini-browse-apps-platform-v1"
    runtime: RuntimeConfig = DockerConfig(image="python:3.12-slim-bookworm")
    agent: AgentConfig = Field(default_factory=AgentConfig)
    # The remaining fields are forwarded to the sandboxed program as MINI_BROWSE_*
    # environment variables by `MiniBrowseHarness.launch`.
    coordinate_mode: CoordinateMode = "relative_1000"
    keep_last_images: int = 3
    image_compaction_at_tokens: int = 45_000
    include_builtin_tools: bool = False
    browser_start_min_interval_seconds: float = 0.0
    browser_start_jitter_seconds: float = 0.0
    browser_start_max_in_flight: int = 0
    record_frames: bool = False


class MiniBrowseHarness(Harness[MiniBrowseHarnessConfig]):
    """Stages the privately-fetched browser agent and executes its agent loop."""

    SUPPORTS_TASK_TOOLS = False
    SUPPORTS_MESSAGE_PROMPT = False

    async def launch(
        self,
        ctx: RolloutContext,
        trace: Trace,
        runtime: Runtime,
        endpoint: str,
        secret: str,
        mcp_urls: dict[str, str],
    ) -> ProgramResult:
        """Validate the task, stage the agent, and run `program.py` in the sandbox.

        Raises:
            ValueError: if MCP task tools are supplied, the task carries a system
                prompt, or the task prompt is not a string.
            HarnessError: if the agent cannot be fetched or staged.
        """
        if mcp_urls:
            names = ", ".join(sorted(mcp_urls))
            raise ValueError(
                f"Browser harness does not expose v1 MCP task tools: {names}"
            )
        if trace.task.system_prompt:
            raise ValueError(
                "Browser harness owns the system prompt; put task-specific instructions "
                "in task.prompt or the task payload."
            )
        if not isinstance(trace.task.prompt, str):
            raise ValueError("Browser harness requires a string task prompt")

        await self._stage_agent(runtime)
        # The program reads the model endpoint, key, and model name from this file
        # (passed below as --model-client).
        await runtime.write(
            MODEL_CLIENT_PATH,
            json.dumps(
                {"base_url": endpoint, "api_key": secret, "model": ctx.model}
            ).encode("utf-8"),
        )
        env = {
            **self.config.env,
            "PYTHONPATH": self._pythonpath(),
            "MINI_BROWSE_COORDINATE_MODE": self.config.coordinate_mode,
            "MINI_BROWSE_KEEP_LAST_IMAGES": str(self.config.keep_last_images),
            "MINI_BROWSE_IMAGE_COMPACTION_AT_TOKENS": str(
                self.config.image_compaction_at_tokens
            ),
            "MINI_BROWSE_INCLUDE_BUILTIN_TOOLS": (
                "1" if self.config.include_builtin_tools else "0"
            ),
            "MINI_BROWSE_BROWSER_START_MIN_INTERVAL_SECONDS": str(
                self.config.browser_start_min_interval_seconds
            ),
            "MINI_BROWSE_BROWSER_START_JITTER_SECONDS": str(
                self.config.browser_start_jitter_seconds
            ),
            "MINI_BROWSE_BROWSER_START_MAX_IN_FLIGHT": str(
                self.config.browser_start_max_in_flight
            ),
            "MINI_BROWSE_PROGRESS_PATH": PROGRESS_PATH,
        }
        if self.config.record_frames:
            env["MINI_BROWSE_RECORD_FRAMES_DIR"] = "/logs/mini_browse/frames"

        args = [
            "--task",
            TASK_PAYLOAD_PATH,
            "--model-client",
            MODEL_CLIENT_PATH,
            "--result",
            RESULT_PATH,
            "--transcript",
            TRANSCRIPT_PATH,
            "--metrics",
            METRICS_PATH,
            "--progress",
            PROGRESS_PATH,
            "--workspace-root",
            WORKSPACE_ROOT,
        ]
        return await runtime.run_uv_script(PROGRAM_SOURCE, args=args, env=env)

    async def _stage_agent(self, runtime: Runtime) -> None:
        """Upload the agent tarball and unpack it into a fresh `AGENT_RUNTIME` dir.

        Raises:
            HarnessError: if the unpack command exits non-zero.
        """
        # NOTE(review): `_agent_tarball` is synchronous, so a cold-cache fetch blocks
        # the event loop for the duration of the download.
        await runtime.write(AGENT_TARBALL, self._agent_tarball())
        command = (
            f"rm -rf {shlex.quote(AGENT_RUNTIME)} && "
            f"mkdir -p {shlex.quote(AGENT_RUNTIME)} && "
            f"tar -xzf {shlex.quote(AGENT_TARBALL)} -C {shlex.quote(AGENT_RUNTIME)}"
        )
        result = await runtime.run(["sh", "-c", command], {})
        if result.exit_code != 0:
            raise HarnessError(
                f"agent staging failed: {result.stderr.strip()[-500:]}"
            )

    def _agent_tarball(self) -> bytes:
        """Return a gzipped tar of the agent package, rooted at the package name.

        Raises:
            HarnessError: if the package directory is missing from the agent dir.
        """
        package = self._ensure_agent() / self.config.agent.package
        if not package.is_dir():
            raise HarnessError(
                f"agent package {self.config.agent.package!r} not found under {package.parent}"
            )
        buffer = io.BytesIO()
        with tarfile.open(fileobj=buffer, mode="w:gz") as archive:
            archive.add(package, arcname=self.config.agent.package)
        return buffer.getvalue()

    def _ensure_agent(self) -> Path:
        """Return a dir that contains `<package>/` — a local checkout or the fetch cache.

        Raises:
            HarnessError: if neither `agent.path` nor `agent.ref` is set, or the
                download fails.
        """
        agent = self.config.agent
        if agent.path:
            return Path(agent.path).expanduser()
        if not agent.ref:
            raise HarnessError(
                "set --harness.agent.ref to a pinned commit sha "
                "(or --harness.agent.path to a local checkout for development)"
            )
        cache_root = (
            Path(agent.cache_dir).expanduser() if agent.cache_dir else AGENT_CACHE_DIR
        )
        dest = cache_root / agent.ref
        if not (dest / agent.package).exists():
            self._download_agent(dest)
        return dest

    def _download_agent(self, dest: Path) -> None:
        """Fetch `agent.repo@agent.ref` from GitHub and publish the package under *dest*.

        The package is copied into a uniquely named staging directory and then
        renamed into place, so concurrent fetches of the same ref cannot delete or
        overwrite each other's partially copied tree.

        Raises:
            HarnessError: if the token env var is unset, GitHub answers with a
                non-200 status, or the package is absent from the archive.
        """
        agent = self.config.agent
        token = os.environ.get(agent.token_env)
        if not token:
            raise HarnessError(
                f"missing ${agent.token_env} to fetch the private agent repo {agent.repo!r}"
            )
        url = f"https://api.github.com/repos/{agent.repo}/tarball/{agent.ref}"
        headers = {
            "Authorization": f"Bearer {token}",
            "Accept": "application/vnd.github+json",
            "X-GitHub-Api-Version": "2022-11-28",
        }
        with tempfile.TemporaryDirectory(prefix="browse-agent-") as tmp:
            archive = Path(tmp) / "agent.tar.gz"
            # httpx drops the Authorization header on the cross-host redirect to codeload.
            with httpx.stream(
                "GET", url, headers=headers, follow_redirects=True, timeout=120
            ) as resp:
                if resp.status_code != 200:
                    resp.read()
                    raise HarnessError(
                        f"fetching {agent.repo}@{agent.ref} failed: HTTP {resp.status_code}"
                    )
                with open(archive, "wb") as handle:
                    for chunk in resp.iter_bytes():
                        handle.write(chunk)
            extract = Path(tmp) / "extract"
            extract.mkdir()
            with tarfile.open(archive) as tar:
                tar.extractall(extract, filter="data")
            # The archive has a single top-level directory; find the package below it.
            matches = sorted(extract.glob(f"*/{agent.package}"))
            if not matches:
                raise HarnessError(
                    f"{agent.package!r} not found in {agent.repo}@{agent.ref}"
                )
            dest.mkdir(parents=True, exist_ok=True)
            target = dest / agent.package
            staging = Path(
                tempfile.mkdtemp(prefix=f"{agent.package}.", suffix=".tmp", dir=dest)
            )
            try:
                shutil.copytree(matches[0], staging, dirs_exist_ok=True)
                try:
                    os.replace(staging, target)
                except OSError:
                    # A directory cannot be renamed over a non-empty one. If the
                    # target exists, another fetch of this pinned ref published
                    # first and its copy is equivalent; otherwise re-raise.
                    if not target.is_dir():
                        raise
            finally:
                # No-op after a successful rename; removes the leftover otherwise.
                shutil.rmtree(staging, ignore_errors=True)

    def _pythonpath(self) -> str:
        """Return the sandbox PYTHONPATH: the agent runtime dir, then any configured value."""
        existing = self.config.env.get("PYTHONPATH", "")
        entries = [AGENT_RUNTIME]
        if existing:
            entries.append(existing)
        return ":".join(entries)


def load_harness(config: MiniBrowseHarnessConfig) -> MiniBrowseHarness:
    """Build the harness for *config*."""
    return MiniBrowseHarness(config)
"/logs/mini_browse/metrics.json" +PROGRESS_PATH = "/logs/mini_browse/progress.jsonl" +WORKSPACE_ROOT = "/workspace/mini-browse" + + +class MiniBrowseTaskPayload(BaseModel): + """Sandbox-visible task payload for the Mini Browse harness.""" + + model_config = ConfigDict(extra="forbid") + + instruction: str + output_schema: dict[str, Any] + browser_api_url: str + start_url: str = "about:blank" + http_proxy: str | None = None + source: str = "verifiers-mini-browse" + task_preamble: str | None = None diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/diagnostics.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/diagnostics.py new file mode 100644 index 000000000..c4b7535fd --- /dev/null +++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/diagnostics.py @@ -0,0 +1,51 @@ +"""Small helpers for surfacing Mini Browse sandbox diagnostics.""" + +from __future__ import annotations + +import json +from collections import deque +from typing import Any + + +async def read_jsonl_tail( + runtime: Any, + path: str, + *, + max_lines: int = 80, + max_chars: int = 20_000, +) -> dict[str, Any]: + """Read a bounded JSONL tail from a sandbox artifact.""" + + try: + raw = await runtime.read(path) + except Exception as exc: + return {"path": path, "is_error": True, "error": str(exc)} + + text = raw.decode("utf-8", errors="replace") + original_chars = len(text) + if max_chars > 0 and original_chars > max_chars: + text = text[-max_chars:] + first_newline = text.find("\n") + if first_newline >= 0: + text = text[first_newline + 1 :] + + events: deque[Any] = deque(maxlen=max(0, max_lines)) + parse_errors = 0 + for line in text.splitlines(): + line = line.strip() + if not line: + continue + try: + events.append(json.loads(line)) + except json.JSONDecodeError: + parse_errors += 1 + events.append(line[:1000]) + + return { + "path": path, + "is_error": False, + "events": list(events), + 
"event_count": len(events), + "parse_errors": parse_errors, + "truncated": original_chars > len(text), + } diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/program.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/program.py new file mode 100644 index 000000000..d4e4bae93 --- /dev/null +++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/program.py @@ -0,0 +1,568 @@ +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "aiohttp>=3.11", +# "openai>=2.0", +# "orjson>=3.10", +# "pillow>=11.0", +# "pydantic>=2.0", +# "pypdf>=5.4", +# "pypdfium2>=4.30", +# "python-pptx>=1.0", +# ] +# /// +from __future__ import annotations + +import argparse +import asyncio +import json +import os +import re +import time +from pathlib import Path +from typing import Any + +ERROR_CATEGORY_CODES = { + "none": 0, + "harness_disconnect": 1, + "request_too_large_bytes": 2, + "request_too_large_tokens": 3, + "model_rate_limit": 4, + "model_auth": 5, + "model_bad_request": 6, + "model_internal_error": 7, + "max_steps_exceeded": 8, + "browser_or_sandbox": 9, + "agent_logic_error": 10, + "unknown": 11, + "model_endpoint_gone": 12, + "model_connection_failure": 13, +} + +TOOL_ERROR_PREFIXES = ( + "ValidationError", + "KeyError", + "ValueError", + "RuntimeError", + "AttributeError", + "TypeError", + "Unknown tool", +) + +TOOL_ERROR_BREAKDOWN_NAMES = ("computer", "read_page", "find", "get_page_text") +HTTP_STATUS_RE = re.compile( + r"(?:Error code:|status(?: code)?[=:]?)\s*(\d{3})", re.IGNORECASE +) +DEFAULT_PROGRESS_PATH = "/logs/mini_browse/progress.jsonl" + + +def _read_json(path: Path) -> Any: + return json.loads(path.read_text()) + + +def _read_optional_json(path: Path) -> Any: + if not path.exists(): + return None + return _read_json(path) + + +def _json_safe(value: Any) -> Any: + try: + json.dumps(value) + return value + except TypeError: + return repr(value) + + +def 
_write_progress(progress_path: Path, event: str, **fields: Any) -> None: + try: + progress_path.parent.mkdir(parents=True, exist_ok=True) + payload = { + "event": event, + "timestamp": time.time(), + **{key: _json_safe(value) for key, value in fields.items()}, + } + with progress_path.open("a", encoding="utf-8") as handle: + handle.write(json.dumps(payload, ensure_ascii=False, sort_keys=True) + "\n") + handle.flush() + except Exception: + return + + +def _env_float(name: str, default: float = 0.0) -> float: + raw = os.environ.get(name, "").strip() + if not raw: + return default + try: + return max(0.0, float(raw)) + except ValueError: + return default + + +def _env_int(name: str, default: int = 0) -> int: + raw = os.environ.get(name, "").strip() + if not raw: + return default + try: + return max(0, int(raw)) + except ValueError: + return default + + +def _env_bool(name: str, default: bool = False) -> bool: + raw = os.environ.get(name) + if raw is None: + return default + return raw.strip().lower() not in {"", "0", "false", "no", "off"} + + +def _http_status_from_exception(exc: BaseException | None) -> int | None: + if exc is None: + return None + status = getattr(exc, "status_code", None) + try: + return int(status) if status is not None else None + except (TypeError, ValueError): + return None + + +def _http_status_from_text(text: str | None) -> int | None: + if not text: + return None + match = HTTP_STATUS_RE.search(text) + if not match: + return None + try: + return int(match.group(1)) + except ValueError: + return None + + +def _classify_exception(exc: BaseException) -> str: + if isinstance(exc, asyncio.CancelledError): + return "harness_disconnect" + + try: + import openai + + if isinstance(exc, openai.RateLimitError): + return "model_rate_limit" + if isinstance(exc, (openai.AuthenticationError, openai.PermissionDeniedError)): + return "model_auth" + if isinstance(exc, openai.BadRequestError): + text = str(exc).lower() + bytes_markers = ( + "request entity too 
large", + "payload too large", + "413 request", + "413 payload", + ) + if any(marker in text for marker in bytes_markers): + return "request_too_large_bytes" + token_markers = ( + "context length", + "maximum context", + "too many tokens", + "context_length_exceeded", + "context window", + "input is too long", + ) + if any(marker in text for marker in token_markers): + return "request_too_large_tokens" + return "model_bad_request" + status = _http_status_from_exception(exc) + if isinstance(exc, openai.NotFoundError) or status == 404: + return "model_endpoint_gone" + if isinstance(exc, openai.InternalServerError) or ( + status is not None and 500 <= status < 600 + ): + return "model_internal_error" + if isinstance(exc, (openai.APIConnectionError, openai.APITimeoutError)): + return "model_connection_failure" + if isinstance(exc, openai.APIError): + if status == 404: + return "model_endpoint_gone" + if status is not None and 500 <= status < 600: + return "model_internal_error" + return "model_connection_failure" + except ImportError: + pass + + try: + import aiohttp + + if isinstance(exc, aiohttp.ClientError): + return "model_connection_failure" + except ImportError: + pass + + if isinstance(exc, TimeoutError): + return "model_connection_failure" + if isinstance(exc, ConnectionError): + return "model_connection_failure" + if isinstance(exc, OSError): + return "browser_or_sandbox" + if isinstance( + exc, (KeyError, TypeError, AttributeError, ValueError, RuntimeError, IndexError) + ): + return "agent_logic_error" + return "unknown" + + +def _diagnose(exc: BaseException | None, error_text: str | None) -> dict[str, Any]: + if exc is not None: + category = _classify_exception(exc) + error_type = type(exc).__name__ + excerpt = str(exc)[:1200] + http_status = _http_status_from_exception(exc) or _http_status_from_text( + excerpt + ) + elif error_text: + text = str(error_text) + error_type = text.split(":", 1)[0][:120] if ":" in text else text[:120] + excerpt = text[:1200] + 
http_status = _http_status_from_text(text) + if http_status == 404: + category = "model_endpoint_gone" + elif http_status is not None and 500 <= http_status < 600: + category = "model_internal_error" + elif "maximum steps exceeded" in text.lower(): + category = "max_steps_exceeded" + else: + category = "unknown" + else: + return { + "error_type": None, + "error_category": "none", + "error_category_code": ERROR_CATEGORY_CODES["none"], + "error_excerpt": None, + "error_http_status": None, + } + return { + "error_type": error_type, + "error_category": category, + "error_category_code": ERROR_CATEGORY_CODES[category], + "error_excerpt": excerpt, + "error_http_status": http_status, + } + + +def _count_image_parts(messages: list[dict[str, Any]]) -> int: + count = 0 + for message in messages: + content = message.get("content") if isinstance(message, dict) else None + if isinstance(content, list): + for part in content: + if isinstance(part, dict) and part.get("type") == "image_url": + count += 1 + return count + + +def _json_size_bytes(value: Any) -> int: + try: + return len(json.dumps(value, ensure_ascii=False).encode("utf-8")) + except Exception: + return 0 + + +def _summarize_tool_errors(messages: list[dict[str, Any]]) -> dict[str, Any]: + total = 0 + validation = 0 + streak = 0 + max_streak = 0 + by_tool: dict[str, int] = {} + unique_kinds: set[str] = set() + id_to_tool: dict[str, str] = {} + + for message in messages: + if not isinstance(message, dict): + continue + role = message.get("role") + if role == "assistant": + tool_calls = message.get("tool_calls") or [] + if not isinstance(tool_calls, list): + continue + for tool_call in tool_calls: + if isinstance(tool_call, str): + try: + tool_call = json.loads(tool_call) + except json.JSONDecodeError: + continue + if not isinstance(tool_call, dict): + continue + tool_call_id = tool_call.get("id") + function = tool_call.get("function") + if isinstance(function, dict): + name = function.get("name") + else: + name = 
tool_call.get("name") + if isinstance(tool_call_id, str) and isinstance(name, str): + id_to_tool[tool_call_id] = name + elif role == "tool": + content = message.get("content") + if not isinstance(content, str): + streak = 0 + continue + stripped = content.lstrip() + if not any(stripped.startswith(prefix) for prefix in TOOL_ERROR_PREFIXES): + streak = 0 + continue + + total += 1 + if stripped.startswith("ValidationError"): + validation += 1 + tool_name = id_to_tool.get(message.get("tool_call_id") or "", "unknown") + by_tool[tool_name] = by_tool.get(tool_name, 0) + 1 + unique_kinds.add(stripped.split("\n", 1)[0][:200]) + streak += 1 + max_streak = max(max_streak, streak) + + return { + "tool_error_count": total, + "tool_error_validation": validation, + "tool_error_max_streak": max_streak, + "tool_error_unique_kinds": len(unique_kinds), + "tool_error_by_tool": by_tool, + } + + +def _load_task_payload(path: Path) -> dict[str, Any]: + payload = _read_json(path) + if not isinstance(payload, dict): + raise ValueError(f"Mini Browse task payload must be an object: {path}") + instruction = payload.get("instruction") + if not isinstance(instruction, str) or not instruction.strip(): + raise ValueError("Mini Browse task payload requires non-empty instruction") + output_schema = payload.get("output_schema") + if not isinstance(output_schema, dict): + raise ValueError("Mini Browse task payload requires object output_schema") + return payload + + +async def _run(args: argparse.Namespace) -> int: + from mini_browse import run_bcu_task + + task_path = Path(args.task) + result_path = Path(args.result) + transcript_path = Path(args.transcript) + metrics_path = Path(args.metrics) + progress_path = Path(args.progress) + workspace_root = Path(args.workspace_root) + + result_path.parent.mkdir(parents=True, exist_ok=True) + transcript_path.parent.mkdir(parents=True, exist_ok=True) + metrics_path.parent.mkdir(parents=True, exist_ok=True) + progress_path.parent.mkdir(parents=True, 
exist_ok=True) + workspace_root.mkdir(parents=True, exist_ok=True) + os.environ["MINI_BROWSE_PROGRESS_PATH"] = str(progress_path) + _write_progress( + progress_path, + "harness_program_start", + task_path=str(task_path), + result_path=str(result_path), + transcript_path=str(transcript_path), + metrics_path=str(metrics_path), + workspace_root=str(workspace_root), + ) + + task_payload = _load_task_payload(task_path) + _write_progress( + progress_path, + "task_payload_loaded", + source=task_payload.get("source"), + start_url=task_payload.get("start_url"), + instruction_chars=len(task_payload.get("instruction") or ""), + output_schema_keys=sorted((task_payload.get("output_schema") or {}).keys()), + has_browser_api_url=bool(task_payload.get("browser_api_url")), + has_http_proxy=bool(task_payload.get("http_proxy")), + ) + instruction = task_payload["instruction"].strip() + output_schema = task_payload["output_schema"] + start_url = str(task_payload.get("start_url") or "about:blank") + browser_api_url = str(task_payload.get("browser_api_url") or "").strip() + if browser_api_url: + os.environ["MINI_BROWSE_BROWSER_API_URL"] = browser_api_url + http_proxy = str(task_payload.get("http_proxy") or "").strip() + if http_proxy: + os.environ["MINI_BROWSE_HTTP_PROXY"] = http_proxy + source = str(task_payload.get("source") or "verifiers-mini-browse") + task_preamble = str( + task_payload.get("task_preamble") + or os.environ.get("MINI_BROWSE_TASK_PREAMBLE") + or "" + ) + conversation = ( + _read_optional_json(Path(args.conversation)) if args.conversation else None + ) + if conversation is not None and not isinstance(conversation, list): + raise ValueError("Mini Browse conversation payload must be a list") + from openai import AsyncOpenAI + + model_client = _read_json(Path(args.model_client)) + model = model_client["model"] + client = AsyncOpenAI( + base_url=model_client["base_url"], api_key=model_client["api_key"] + ) + coordinate_mode = os.environ.get("MINI_BROWSE_COORDINATE_MODE", 
"relative_1000") + + payload: dict[str, Any] + messages: list[dict[str, Any]] = [] + exc_caught: BaseException | None = None + try: + _write_progress( + progress_path, + "run_bcu_task_start", + model=model, + coordinate_mode=coordinate_mode, + max_steps=int(args.max_steps), + ) + run_result = await run_bcu_task( + task=instruction, + url=start_url, + output_schema=output_schema, + model=model, + client=client, + max_steps=int(args.max_steps), + workspace_root=workspace_root, + include_builtin_tools=_env_bool("MINI_BROWSE_INCLUDE_BUILTIN_TOOLS"), + source=source, + task_preamble=task_preamble, + coordinate_mode=coordinate_mode, + conversation=conversation, + browser_start_min_interval_seconds=_env_float( + "MINI_BROWSE_BROWSER_START_MIN_INTERVAL_SECONDS" + ), + browser_start_jitter_seconds=_env_float( + "MINI_BROWSE_BROWSER_START_JITTER_SECONDS" + ), + browser_start_max_in_flight=_env_int( + "MINI_BROWSE_BROWSER_START_MAX_IN_FLIGHT" + ), + ) + _write_progress( + progress_path, + "run_bcu_task_done", + is_error=run_result.is_error, + submitted_result_present=bool(run_result.submitted_result), + message_count=len(run_result.messages), + browser_session_id=run_result.browser_session_id, + ) + messages = run_result.messages + payload = { + "response": run_result.response, + "is_error": run_result.is_error, + "error": run_result.error, + "is_cancelled": run_result.is_cancelled, + "browser_session_id": run_result.browser_session_id, + "tab_group_id": run_result.tab_group_id, + "submitted_result": _json_safe(run_result.submitted_result), + "workspace_root": run_result.workspace_root, + "message_count": len(messages), + "coordinate_mode": coordinate_mode, + } + except BaseException as exc: + exc_caught = exc + _write_progress( + progress_path, + "run_bcu_task_exception", + error_type=type(exc).__name__, + error_excerpt=str(exc)[:500], + is_cancelled=isinstance(exc, asyncio.CancelledError), + ) + payload = { + "response": "", + "is_error": True, + "error": 
f"{type(exc).__name__}: {exc}", + "is_cancelled": isinstance(exc, asyncio.CancelledError), + "browser_session_id": None, + "tab_group_id": None, + "submitted_result": None, + "workspace_root": str(workspace_root), + "message_count": len(messages), + "coordinate_mode": coordinate_mode, + } + + diagnostics = _diagnose(exc_caught, payload.get("error")) + payload.update(diagnostics) + payload["transcript_image_count"] = _count_image_parts(messages) + payload["transcript_json_bytes"] = _json_size_bytes(messages) + payload.update(_summarize_tool_errors(messages)) + + submitted = payload.get("submitted_result") + response = payload.get("response") + answered = bool(submitted) or bool(isinstance(response, str) and response.strip()) + category = payload.get("error_category") + metrics = { + "answered": float(answered and not payload.get("is_error")), + "is_error": float(bool(payload.get("is_error"))), + "message_count": float(payload.get("message_count") or 0), + "submitted_result_present": float(bool(submitted)), + "has_browser_session": float(bool(payload.get("browser_session_id"))), + "error_category_code": float(payload.get("error_category_code") or 0), + "error_http_status": float(payload.get("error_http_status") or 0), + "transcript_image_count": float(payload.get("transcript_image_count") or 0), + "transcript_json_bytes": float(payload.get("transcript_json_bytes") or 0), + "tool_error_count": float(payload.get("tool_error_count") or 0), + "tool_error_validation": float(payload.get("tool_error_validation") or 0), + "tool_error_max_streak": float(payload.get("tool_error_max_streak") or 0), + "tool_error_unique_kinds": float(payload.get("tool_error_unique_kinds") or 0), + } + for category_name in ERROR_CATEGORY_CODES: + if category_name == "none": + continue + metrics[f"error_{category_name}"] = float(category == category_name) + by_tool = payload.get("tool_error_by_tool") or {} + for tool_name in TOOL_ERROR_BREAKDOWN_NAMES: + metrics[f"tool_error_{tool_name}"] = 
float(by_tool.get(tool_name, 0)) + + result_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2)) + transcript_path.write_text(json.dumps(messages, ensure_ascii=False, indent=2)) + metrics_path.write_text(json.dumps(metrics, ensure_ascii=False, indent=2)) + _write_progress( + progress_path, + "harness_program_artifacts_written", + error_category=category, + is_error=payload.get("is_error"), + result_path=str(result_path), + transcript_path=str(transcript_path), + metrics_path=str(metrics_path), + ) + + print( + json.dumps( + { + "result_path": str(result_path), + "metrics_path": str(metrics_path), + "transcript_path": str(transcript_path), + "progress_path": str(progress_path), + "error_category": category, + "error_type": payload.get("error_type"), + "error_excerpt": payload.get("error_excerpt"), + } + ) + ) + if exc_caught is not None and not isinstance(exc_caught, Exception): + raise exc_caught + return 0 + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Run the Mini Browse harness.") + parser.add_argument("--task", required=True) + parser.add_argument("--model-client", required=True) + parser.add_argument("--result", required=True) + parser.add_argument("--transcript", required=True) + parser.add_argument("--metrics", required=True) + parser.add_argument("--progress", default=DEFAULT_PROGRESS_PATH) + parser.add_argument("--conversation") + parser.add_argument("--max-steps", type=int, default=75) + parser.add_argument("--workspace-root", default="/workspace/mini-browse") + return parser.parse_args() + + +def main() -> int: + return asyncio.run(_run(_parse_args())) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py new file mode 100644 index 000000000..1c749c0c6 --- /dev/null +++ 
"""LLM judge for the browse-apps local-app tasks (structured-output verdict)."""

from __future__ import annotations

import json
import os
from typing import Any

from openai import AsyncOpenAI
from pydantic import Field
from pydantic_config import BaseConfig

from verifiers.utils.client_utils import load_prime_config
from verifiers.v1.clients.config import BaseClientConfig

JUDGE_TEMPERATURE = 0

# Strict structured output: the judge must return exactly these fields, always valid JSON.
JUDGE_RESPONSE_FORMAT = {
    "type": "json_schema",
    "json_schema": {
        "name": "judge_verdict",
        "strict": True,
        "schema": {
            "type": "object",
            "additionalProperties": False,
            "properties": {
                "correct_fields": {"type": "integer"},
                "total_fields": {"type": "integer"},
                "score": {"type": "number"},
                "verdict": {"type": "string", "enum": ["yes", "partial", "no"]},
                "explanation": {"type": "string"},
            },
            "required": [
                "correct_fields",
                "total_fields",
                "score",
                "verdict",
                "explanation",
            ],
        },
    },
}

JUDGE_PROMPT = """You evaluate a browser automation agent's submitted result for a deterministic local flight-search task.

Use the evaluation contract and gold answer as the source of truth. Score only
expected fields where score is true or absent. Ignore verifier metadata, ids,
internal keys, hidden seed fields, and non-scoreable diagnostics.

Treat equivalent formatting as correct when the same fact is clearly attached to
the same flight leg, date, row, provider, fare, comparison slot, or outcome.
Examples: "Nonstop" equals "0 stops"; prices match after removing currency
symbols and commas; date formats match when they refer to the same calendar date;
duration strings match when they have the same total minutes.

Extra fields are fine unless they contradict the gold answer. Critical fields
with critical=true are hard gates: if any scoreable critical field is missing or
wrong, the verdict must be "no".

Report `correct_fields` / `total_fields` over the scoreable expected fields, a
`score` from 0 to 1, a `verdict`, and a one-sentence `explanation`.
"""


class JudgeConfig(BaseConfig):
    """The judge model and the OpenAI-compatible endpoint it runs on (Prime auto-resolved)."""

    model: str = "openai/gpt-4.1-mini"
    """A model that supports strict structured output (`json_schema`)."""
    client: BaseClientConfig = Field(default_factory=BaseClientConfig)


async def judge_answer_key(
    *,
    task_instruction: str,
    submitted_result: Any,
    answer_key: dict[str, Any],
    output_schema: dict[str, Any],
    config: JudgeConfig,
) -> dict[str, Any]:
    """Ask the judge model to grade one submitted result and return its parsed verdict.

    The user message is a single JSON document holding the task instruction, the
    submission, the evaluation contract (``answer_key["evaluator"]``), the gold answer
    (``answer_key["gold_answer"]``, or the whole answer key when that entry is missing or
    empty) and the output schema.

    Args:
        task_instruction: The instruction the agent was given.
        submitted_result: The agent's submitted structured result.
        answer_key: The answer key for the task.
        output_schema: The JSON schema the submission was expected to follow.
        config: Judge model name and client settings.

    Returns:
        The judge reply parsed by ``parse_json_object``. An empty reply (no choices or
        no content) is parsed as ``"{}"`` and therefore yields an empty dict.
    """
    context = {
        "task_instruction": task_instruction,
        "submitted_result": submitted_result,
        "evaluation_contract": answer_key.get("evaluator") or {},
        "gold_answer": answer_key.get("gold_answer") or answer_key,
        "output_schema": output_schema,
    }
    # A fresh client is built for every judge call; own it with `async with` so its HTTP
    # connection pool is closed when the call finishes instead of leaking until GC.
    async with judge_client(config.client) as client:
        response = await client.chat.completions.create(
            model=config.model,
            messages=[
                {"role": "system", "content": JUDGE_PROMPT},
                {
                    "role": "user",
                    "content": json.dumps(context, ensure_ascii=False, sort_keys=True),
                },
            ],
            temperature=JUDGE_TEMPERATURE,
            response_format=JUDGE_RESPONSE_FORMAT,
        )
    # An endpoint may answer with an empty `choices` list; treat that like an empty reply
    # rather than raising IndexError inside the reward function.
    choices = response.choices or []
    content = (choices[0].message.content if choices else None) or "{}"
    return parse_json_object(content)
def judge_client(config: BaseClientConfig) -> AsyncOpenAI:
    """Build the async OpenAI-compatible client the judge request is sent through.

    Args:
        config: Client settings; supplies ``api_key_var``, ``base_url`` and ``headers``.

    Returns:
        A new ``AsyncOpenAI`` client. When no key can be found the literal ``"EMPTY"``
        is used as the API key.
    """
    # base_url + team header are resolved by BaseClientConfig; the key falls back to the Prime
    # CLI config for pinference (mirrors verifiers' resolve_client).
    api_key = os.environ.get(config.api_key_var)
    if not api_key and config.api_key_var == "PRIME_API_KEY":
        api_key = load_prime_config().get("api_key")
    return AsyncOpenAI(
        base_url=config.base_url,
        api_key=api_key or "EMPTY",
        default_headers=config.headers or None,
    )


def score_from_judge_payload(payload: dict[str, Any]) -> float:
    """Turn a judge verdict payload into a reward in ``[0.0, 1.0]``.

    Sources are tried in order: ``correct_fields / total_fields`` (when both are real
    integers and ``total_fields > 0``), then a numeric ``score``, then the ``verdict``
    string (``"yes"`` -> 1.0, ``"partial"`` -> 0.5, anything else -> 0.0).

    Args:
        payload: The parsed judge reply.

    Returns:
        The clamped reward.
    """
    import math

    # NOTE(review): the field ratio wins even when `verdict` is "no", so a failed
    # critical-field gate still earns partial credit -- confirm that this is intended.
    correct = payload.get("correct_fields")
    total = payload.get("total_fields")
    # bool is a subclass of int; without this exclusion True/True would score 1.0.
    counts_are_ints = all(
        isinstance(value, int) and not isinstance(value, bool)
        for value in (correct, total)
    )
    if counts_are_ints and total > 0:
        return max(0.0, min(1.0, correct / total))

    score = payload.get("score")
    if isinstance(score, (int, float)) and not isinstance(score, bool):
        value = float(score)
        # min(1.0, nan) evaluates to 1.0, so an unchecked NaN would earn full reward.
        if not math.isnan(value):
            return max(0.0, min(1.0, value))

    verdict = str(payload.get("verdict") or "").lower()
    if verdict == "yes":
        return 1.0
    if verdict == "partial":
        return 0.5
    return 0.0


def _try_load_object(candidate: str) -> dict[str, Any] | None:
    """Return ``candidate`` parsed as a JSON object, or None if it is blank, invalid or not a dict."""
    if not candidate.strip():
        return None
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        return None
    return parsed if isinstance(parsed, dict) else None


def parse_json_object(content: str) -> dict[str, Any]:
    """Extract a JSON object from a judge reply, tolerating common formatting damage.

    Attempts, in order: the raw text, the text with a surrounding code fence removed, the
    text from the first ``{`` onwards, the first complete object at that position (which
    ignores trailing prose), and finally the same span with unterminated brackets/strings
    closed by ``_balance_json``.

    Args:
        content: The raw judge reply.

    Returns:
        The first dict any attempt yields; otherwise a failing verdict
        ``{"score": 0.0, "explanation": <first 500 chars>, "verdict": "no"}``.
    """
    # Strict structured output is always valid JSON; this stays tolerant (code fences, an
    # unterminated object) as a backstop for an overridden/non-conforming judge model.
    fenced = content.strip()
    if fenced.startswith("```"):
        fenced = fenced.split("```", 2)[1].removeprefix("json").strip()
    start = fenced.find("{")
    span = fenced[start:] if start >= 0 else ""

    for candidate in (content, fenced, span):
        parsed = _try_load_object(candidate)
        if parsed is not None:
            return parsed

    if span:
        # `{...} followed by prose` is rejected by json.loads ("Extra data"); raw_decode
        # stops at the end of the first complete value instead.
        try:
            leading, _end = json.JSONDecoder().raw_decode(span)
        except json.JSONDecodeError:
            leading = None
        if isinstance(leading, dict):
            return leading
        repaired = _try_load_object(_balance_json(span))
        if repaired is not None:
            return repaired

    return {"score": 0.0, "explanation": content[:500], "verdict": "no"}


def _balance_json(text: str) -> str:
    """Close an unterminated JSON object/array.

    Appends the missing ``}``/``]`` for brackets left open outside of strings. If the
    text stops inside a string, that string is closed first (a dangling escape backslash
    is dropped); otherwise trailing whitespace and one dangling trailing comma are
    removed before the closers are appended.
    """
    stack: list[str] = []
    in_string = escaped = False
    for ch in text:
        if in_string:
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
            continue
        if ch == '"':
            in_string = True
        elif ch in "{[":
            stack.append("}" if ch == "{" else "]")
        elif ch in "}]" and stack:
            stack.pop()

    if in_string:
        # Truncated mid-string: whitespace and commas here are string content, so keep
        # them; only a lone trailing backslash has to go before the closing quote.
        repaired = (text[:-1] if escaped else text) + '"'
    else:
        repaired = text.rstrip()
        if repaired.endswith(","):
            repaired = repaired[:-1]
    return repaired + "".join(reversed(stack))
"""mini-browse-apps-platform-v1: local-app browser tasks pulled from the Prime hub."""

from __future__ import annotations

import gzip
import json
import os
import shlex
import shutil
import subprocess
import tempfile
from pathlib import Path
from typing import Any

from pydantic import Field
from verifiers.v1.errors import TasksetError

import verifiers.v1 as vf

from .harness.contract import (
    METRICS_PATH,
    PROGRESS_PATH,
    RESULT_PATH,
    TASK_PAYLOAD_PATH,
    TRANSCRIPT_PATH,
    MiniBrowseTaskPayload,
)
from .harness.diagnostics import read_jsonl_tail
from .judge import JudgeConfig, judge_answer_key, score_from_judge_payload

DEFAULT_SANDBOX_IMAGE = (
    "team-cmlr3u2er002zhr01tj8f48ts/"
    "mini-browse-apps:destination-autocomplete-tight-20260528-0027"
)
DATASET_CACHE_DIR = Path.home() / ".cache" / "verifiers" / "mini-browse-apps"

APP_PORT = 5173
CDP_PORT = 18080
APP_URL = f"http://127.0.0.1:{APP_PORT}"
BROWSER_API_URL = f"http://127.0.0.1:{CDP_PORT}"
WORKDIR = "/workspace"
APP_SEED_PATH = "/task/app_seed.json"
SERVICE_LOG_DIR = "/logs/services"
APP_LOG_PATH = f"{SERVICE_LOG_DIR}/app.log"
CDP_LOG_PATH = f"{SERVICE_LOG_DIR}/cdp.log"
APP_SERVER = "/opt/mini-browse-services/spa_server.py"
CDP_SERVER = "/opt/mini-browse-services/local_cdp_service.py"
APP_ROOT = "/opt/mini-browse-app/dist"

DEFAULT_SANDBOX_CPU = 2
DEFAULT_SANDBOX_MEMORY_GB = 4
DEFAULT_SANDBOX_DISK_GB = 10


class MiniBrowseAppTask(vf.Task):
    """One Mini Browse task backed by a sandboxed local web app."""

    prompt: str
    output_schema: dict[str, Any]
    answer_key: dict[str, Any]
    app_seed_ref: str | None = None
    metadata: dict[str, Any] = Field(default_factory=dict)


class MiniBrowseAppsConfig(vf.TasksetConfig):
    id: str = "mini-browse-apps-platform-v1"
    dataset_path: str | None = None
    """Explicit local dataset (JSONL/JSONL.GZ); when set, skips the hub pull."""
    hub_env_id: str = "prime/mini-browse-apps-platform-v1"
    """Prime hub environment the dataset is pulled from when no `dataset_path` is given."""
    hub_version: str = "latest"
    dataset_filename: str = "google_flights_10.jsonl.gz"
    judge: JudgeConfig = Field(default_factory=JudgeConfig)


class MiniBrowseAppsTaskset(vf.Taskset[MiniBrowseAppTask, MiniBrowseAppsConfig]):
    """Owns local-app rows, sandbox app startup, and submitted-result judging."""

    NEEDS_CONTAINER = True

    def __init__(self, config: MiniBrowseAppsConfig) -> None:
        super().__init__(config)
        # Seeds found inline in dataset rows, keyed by the ref stored on each task.
        self._inline_app_seeds: dict[str, dict[str, Any]] = {}

    def load_tasks(self) -> list[MiniBrowseAppTask]:
        """Load every dataset row and normalize it into a task.

        Raises:
            ValueError: If the dataset contains no rows.
        """
        rows = self.load_rows()
        if not rows:
            raise ValueError("No Mini Browse app tasks were loaded")
        return [self.normalize_row(i, row) for i, row in enumerate(rows)]

    async def setup(self, task: MiniBrowseAppTask, runtime: vf.Runtime) -> None:
        """Write the app seed and task payload into the runtime, then start the services."""
        app_seed = self.app_seed_for_task(task)
        await ensure_runtime_dirs(runtime)
        await write_runtime_json(runtime, APP_SEED_PATH, app_seed)
        public_payload = MiniBrowseTaskPayload(
            instruction=task.prompt,
            start_url=APP_URL,
            output_schema=task.output_schema,
            browser_api_url=BROWSER_API_URL,
            source="mini-browse-apps-platform-v1",
        )
        await runtime.write(
            TASK_PAYLOAD_PATH,
            public_payload.model_dump_json(indent=2).encode("utf-8"),
        )
        await start_services(runtime)
        await wait_for_services(runtime)

    async def finalize(
        self, task: MiniBrowseAppTask, trace: vf.Trace, runtime: vf.Runtime
    ) -> None:
        """Copy the harness result/metrics artifacts from the runtime into ``trace.info``."""
        del task
        result = await read_runtime_json(runtime, RESULT_PATH)
        metrics = await read_runtime_json(runtime, METRICS_PATH)
        trace.info["mini_browse_result"] = result
        trace.info["mini_browse_metrics"] = metrics
        trace.info["mini_browse_artifacts"] = {
            "result_path": RESULT_PATH,
            "transcript_path": TRANSCRIPT_PATH,
            "metrics_path": METRICS_PATH,
            "progress_path": PROGRESS_PATH,
            "task_payload_path": TASK_PAYLOAD_PATH,
            "app_seed_path": APP_SEED_PATH,
            "app_log_path": APP_LOG_PATH,
            "cdp_log_path": CDP_LOG_PATH,
        }
        if isinstance(result, dict):
            trace.info["submitted_result"] = result.get("submitted_result")
            if result.get("is_error"):
                # Only failed runs carry the progress tail, to help diagnose the failure.
                trace.info["mini_browse_progress_tail"] = await read_jsonl_tail(
                    runtime,
                    PROGRESS_PATH,
                )

    @vf.reward(weight=1.0)
    async def answer_key(self, task: MiniBrowseAppTask, trace: vf.Trace) -> float:
        """Score the submitted result with the LLM judge; 0.0 on error or no submission."""
        result = trace_result(trace)
        submitted = result.get("submitted_result")
        if result.get("is_error") or not submitted:
            trace.info["mini_browse_judge"] = {
                "verdict": "no",
                "explanation": result.get("error") or "missing submitted result",
            }
            return 0.0

        judge_payload = await judge_answer_key(
            task_instruction=task.prompt,
            submitted_result=submitted,
            answer_key=task.answer_key,
            output_schema=task.output_schema,
            config=self.config.judge,
        )
        trace.info["mini_browse_judge"] = judge_payload
        return score_from_judge_payload(judge_payload)

    @vf.metric
    async def result_present(self, trace: vf.Trace) -> float:
        return float(bool(trace_result(trace)))

    @vf.metric
    async def submitted_result_present(self, trace: vf.Trace) -> float:
        return float(bool(trace_result(trace).get("submitted_result")))

    @vf.metric
    async def agent_error(self, trace: vf.Trace) -> float:
        return float(bool(trace_result(trace).get("is_error")))

    @vf.metric
    async def transcript_image_count(self, trace: vf.Trace) -> float:
        return metric(trace, "transcript_image_count")

    @vf.metric
    async def message_count(self, trace: vf.Trace) -> float:
        return metric(trace, "message_count")

    def load_rows(self) -> list[dict[str, Any]]:
        """Read the dataset as JSONL (gzip-compressed when the path ends in ``.gz``)."""
        path = self.resolved_dataset_path()
        if path.suffix == ".gz":
            stream = gzip.open(path, "rt", encoding="utf-8")
        else:
            stream = path.open("r", encoding="utf-8")
        with stream as handle:
            return [json.loads(line) for line in handle if line.strip()]

    def resolved_dataset_path(self) -> Path:
        """Return the explicit ``dataset_path`` if configured, else the cached hub dataset.

        Raises:
            FileNotFoundError: If the configured ``dataset_path`` does not exist.
        """
        if self.config.dataset_path:
            path = Path(self.config.dataset_path).expanduser()
            if not path.exists():
                raise FileNotFoundError(f"Mini Browse app dataset not found: {path}")
            return path
        return self.ensure_cached_dataset()

    def ensure_cached_dataset(self) -> Path:
        """Return the cached dataset file, pulling it from the hub on first use."""
        # Key the cache on the hub env as well as the version: with the version alone,
        # repointing `hub_env_id` would silently reuse another env's cached file.
        env_slug = self.config.hub_env_id.replace("/", "__")
        cached = (
            DATASET_CACHE_DIR
            / env_slug
            / self.config.hub_version
            / self.config.dataset_filename
        )
        if not cached.exists():
            cached.parent.mkdir(parents=True, exist_ok=True)
            self.pull_dataset_into(cached)
        return cached

    def pull_dataset_into(self, dest: Path) -> None:
        """Pull the env package from the Prime hub into a temp dir and copy the dataset out.

        Raises:
            FileNotFoundError: If the ``prime`` CLI is not installed, or the pulled
                package does not contain ``dataset_filename``.
            RuntimeError: If ``prime env pull`` exits with a non-zero status.
        """
        with tempfile.TemporaryDirectory(prefix="mini-browse-hub-") as tmp:
            command = [
                "prime", "env", "pull", self.config.hub_env_id,
                "-v", self.config.hub_version, "-t", tmp, "--plain",
            ]
            try:
                result = subprocess.run(command, capture_output=True, text=True)
            except FileNotFoundError as exc:
                raise FileNotFoundError(
                    "`prime` CLI not found on PATH; install it or pass "
                    "--taskset.dataset_path to use a local dataset"
                ) from exc
            if result.returncode != 0:
                detail = (result.stderr or result.stdout).strip()[-1000:]
                raise RuntimeError(
                    f"`prime env pull {self.config.hub_env_id}` failed: {detail}"
                )
            matches = sorted(Path(tmp).rglob(self.config.dataset_filename))
            if not matches:
                raise FileNotFoundError(
                    f"{self.config.dataset_filename!r} not found in pulled hub env "
                    f"{self.config.hub_env_id!r}"
                )
            dest.parent.mkdir(parents=True, exist_ok=True)
            # A uniquely named staging file lets concurrent pulls proceed without
            # clobbering each other's half-written copy before the atomic rename.
            fd, staging_name = tempfile.mkstemp(
                prefix=f"{dest.name}.", suffix=".tmp", dir=dest.parent
            )
            os.close(fd)
            staging = Path(staging_name)
            try:
                shutil.copyfile(matches[0], staging)
                os.replace(staging, dest)
            finally:
                # No-op after a successful rename; removes the leftover on failure.
                staging.unlink(missing_ok=True)

    def normalize_row(self, index: int, row: dict[str, Any]) -> MiniBrowseAppTask:
        """Validate one dataset row and convert it into a ``MiniBrowseAppTask``.

        An inline ``app_seed`` is remembered on the taskset so ``app_seed_for_task`` can
        find it later.

        Raises:
            ValueError: If the row lacks an instruction, ``output_schema`` or answer key,
                or carries a non-object ``app_seed``.
        """
        info = decode_info(row.get("info") or {})
        raw_instruction = info.get("instruction") or row.get("question")
        if not isinstance(raw_instruction, str) or not raw_instruction.strip():
            raise ValueError(f"row {index} is missing a task instruction")
        output_schema = info.get("output_schema")
        if not isinstance(output_schema, dict):
            raise ValueError(f"row {index} is missing output_schema")
        answer_key = info.get("answer_key") or parse_answer(row.get("answer"))
        if not isinstance(answer_key, dict):
            raise ValueError(f"row {index} is missing answer_key")

        task_name = str(row.get("task_id") or info.get("task_name") or index)
        app_seed = info.get("app_seed")
        if app_seed is not None and not isinstance(app_seed, dict):
            raise ValueError(f"row {index} has non-object app_seed")
        app_seed_ref = info.get("app_seed_ref")
        seed_key = str(app_seed_ref) if app_seed_ref else None
        if app_seed is not None:
            # An inline seed without an explicit ref used to be dropped, and setup then
            # failed with "has no app_seed"; give it a per-row key instead.
            if seed_key is None:
                seed_key = f"inline-row-{index}"
            self._inline_app_seeds[seed_key] = app_seed

        return MiniBrowseAppTask(
            idx=index,
            name=task_name,
            prompt=raw_instruction.strip(),
            image=DEFAULT_SANDBOX_IMAGE,
            workdir=WORKDIR,
            resources=vf.TaskResources(
                cpu=DEFAULT_SANDBOX_CPU,
                memory=DEFAULT_SANDBOX_MEMORY_GB,
                disk=DEFAULT_SANDBOX_DISK_GB,
            ),
            output_schema=output_schema,
            answer_key=answer_key,
            app_seed_ref=seed_key,
            metadata={
                "task_name": info.get("task_name"),
                "task_id": row.get("task_id") or answer_key.get("task_id"),
                "answer_kind": answer_key.get("answer_kind"),
                "source_dataset": info.get("source_dataset"),
            },
        )

    def app_seed_for_task(self, task: MiniBrowseAppTask) -> dict[str, Any]:
        """Return the inline app seed recorded for ``task``.

        Raises:
            ValueError: If the task has no seed ref or the ref was never seen inline.
        """
        if not task.app_seed_ref:
            raise ValueError(f"Task {task.name} has no app_seed")
        seed = self._inline_app_seeds.get(task.app_seed_ref)
        if seed is None:
            raise ValueError(
                f"Task {task.name} references seed {task.app_seed_ref} not present inline"
            )
        return seed
async def ensure_runtime_dirs(runtime: vf.Runtime) -> None:
    """Create the task, work, service-log and task-payload directories in the runtime.

    Raises:
        TasksetError: If the ``mkdir`` command exits with a non-zero status.
    """
    result = await runtime.run(
        [
            "bash",
            "-lc",
            f"mkdir -p /task {WORKDIR} {SERVICE_LOG_DIR} "
            f"{shlex.quote(str(Path(TASK_PAYLOAD_PATH).parent))}",
        ],
        {},
    )
    if result.exit_code != 0:
        raise TasksetError(
            f"Mini Browse app setup failed: {combined_output(result)}"
        )


async def start_services(runtime: vf.Runtime) -> None:
    """Launch the SPA server and the local CDP service as background processes."""
    await runtime.run_background(
        [
            "python3",
            APP_SERVER,
            "--host",
            "127.0.0.1",
            "--port",
            str(APP_PORT),
            "--root",
            APP_ROOT,
        ],
        {"TASK_SEED_PATH": APP_SEED_PATH},
        APP_LOG_PATH,
    )
    await runtime.run_background(
        [
            "python3",
            CDP_SERVER,
            "--host",
            "127.0.0.1",
            "--port",
            str(CDP_PORT),
            "--chrome",
            "/usr/bin/chromium",
            "--headless",
        ],
        {},
        CDP_LOG_PATH,
    )


async def wait_for_services(runtime: vf.Runtime) -> None:
    """Poll both services (up to 90 attempts, one second apart) until they respond.

    On failure the script prints the process list and the tail of both service logs,
    which end up in the raised error message.

    Raises:
        TasksetError: If the services are not reachable after the last attempt.
    """
    script = f"""\
set -e
for i in $(seq 1 90); do
  if curl --noproxy '*' -fsS --max-time 2 {APP_URL} >/dev/null \\
    && curl --noproxy '*' -fsS --max-time 2 {BROWSER_API_URL}/healthz >/dev/null; then
    echo "services ready"
    exit 0
  fi
  sleep 1
done
echo "service readiness failed"
echo "--- process list ---"
ps aux || true
echo "--- app log ---"
tail -120 {APP_LOG_PATH} 2>/dev/null || true
echo "--- cdp log ---"
tail -120 {CDP_LOG_PATH} 2>/dev/null || true
exit 1
"""
    result = await runtime.run(["bash", "-lc", script], {})
    if result.exit_code != 0:
        raise TasksetError(
            f"Mini Browse app services did not become ready: {combined_output(result)}"
        )


async def write_runtime_json(runtime: vf.Runtime, path: str, value: Any) -> None:
    """Serialize ``value`` as indented UTF-8 JSON and write it to ``path`` in the runtime."""
    data = json.dumps(value, ensure_ascii=False, indent=2).encode("utf-8")
    await runtime.write(path, data)


async def read_runtime_json(runtime: vf.Runtime, path: str) -> Any:
    """Read and parse a JSON artifact from the runtime without ever raising.

    Returns:
        The parsed JSON value; ``{}`` for an empty file; or an
        ``{"is_error": True, "error": ...}`` dict when the file cannot be read or
        does not contain valid JSON.
    """
    try:
        raw = await runtime.read(path)
    except Exception as exc:
        # Deliberately broad: a missing artifact must degrade to an error payload so
        # finalize/scoring can still run.
        return {"is_error": True, "error": f"missing runtime artifact {path}: {exc}"}
    text = raw.decode("utf-8", errors="replace").strip()
    if not text:
        return {}
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        return {
            "is_error": True,
            "error": f"invalid JSON artifact {path}: {text[:500]}",
        }


def trace_result(trace: vf.Trace) -> dict[str, Any]:
    """Return ``trace.info["mini_browse_result"]`` when it is a dict, else ``{}``."""
    result = trace.info.get("mini_browse_result")
    return result if isinstance(result, dict) else {}


def metric(trace: vf.Trace, key: str) -> float:
    """Look up a numeric metric, preferring the metrics artifact over the result payload.

    Returns:
        The value as a float, or 0.0 when neither source holds a non-boolean number
        under ``key``.
    """
    metrics = trace.info.get("mini_browse_metrics")
    sources = (metrics if isinstance(metrics, dict) else {}, trace_result(trace))
    for source in sources:
        value = source.get(key)
        # bool is an int subclass; a True/False flag is not a metric value.
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            return float(value)
    return 0.0


def decode_info(info: Any) -> dict[str, Any]:
    """Return a row's ``info`` field as a dict, decoding it from JSON text if needed.

    Args:
        info: A JSON string, a mapping, or a falsy value.

    Returns:
        A new dict; empty when ``info`` is falsy or is the JSON text ``null``.

    Raises:
        ValueError: If the JSON text is invalid or decodes to something other than an
            object.
    """
    if isinstance(info, str):
        decoded = json.loads(info)
        if decoded is None:
            return {}
        if not isinstance(decoded, dict):
            # Fail here with a clear message instead of a later AttributeError on .get().
            raise ValueError(
                f"info must decode to a JSON object, got {type(decoded).__name__}"
            )
        return decoded
    return dict(info or {})


def parse_answer(answer: Any) -> Any:
    """Decode ``answer`` from JSON when it is a string; return it unchanged otherwise."""
    if isinstance(answer, str):
        return json.loads(answer)
    return answer


def combined_output(result: vf.ProgramResult) -> str:
    """Return the last 2000 characters of the stripped stdout + stderr concatenation."""
    return ((result.stdout or "") + (result.stderr or "")).strip()[-2000:]
-> str: + return ((result.stdout or "") + (result.stderr or "")).strip()[-2000:] + + +__all__ = ["MiniBrowseAppTask", "MiniBrowseAppsConfig", "MiniBrowseAppsTaskset"] diff --git a/environments/mini_browse_apps_platform_v1/pyproject.toml b/environments/mini_browse_apps_platform_v1/pyproject.toml new file mode 100644 index 000000000..fb8bdcf5f --- /dev/null +++ b/environments/mini_browse_apps_platform_v1/pyproject.toml @@ -0,0 +1,13 @@ +[project] +name = "mini-browse-apps-platform-v1" +version = "0.1.0" +description = "mini-browse-apps-platform-v1 — sandboxed local-app Mini Browse browser tasks (agentic; vision agent; LLM-judge reward)." +requires-python = ">=3.10" +dependencies = ["openai", "httpx"] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["mini_browse_apps_platform_v1"] diff --git a/pyproject.toml b/pyproject.toml index 9971dbe8a..ae71425b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,6 +86,7 @@ examples = [ "wordle-v1", "terminal-bench-2-v1", "alphabet-sort-v1", "r2e-gym-v1", "scaleswe-v1", "swelego-v1", "scratchpad-v1", "general-agent-v1", "swebench-verified-v1", + "mini-browse-apps-platform-v1", ] [project.optional-dependencies] @@ -161,6 +162,7 @@ alphabet-sort-v1 = { path = "environments/alphabet_sort_v1", editable = true } scratchpad-v1 = { path = "environments/scratchpad_v1", editable = true } general-agent-v1 = { path = "environments/general_agent_v1", editable = true } swebench-verified-v1 = { path = "environments/swebench_verified_v1", editable = true } +mini-browse-apps-platform-v1 = { path = "environments/mini_browse_apps_platform_v1", editable = true } [tool.uv.exclude-newer-package] # PrimeIntellect-published on PyPI (trusted publisher) diff --git a/uv.lock b/uv.lock index 06e62ff82..f5091d603 100644 --- a/uv.lock +++ b/uv.lock @@ -2400,6 +2400,17 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, ] +[[package]] +name = "mini-browse-apps-platform-v1" +version = "0.1.0" +source = { editable = "environments/mini_browse_apps_platform_v1" } +dependencies = [ + { name = "openai" }, +] + +[package.metadata] +requires-dist = [{ name = "openai" }] + [[package]] name = "mistral-common" version = "1.11.0" @@ -5836,6 +5847,7 @@ examples = [ { name = "glossary-v1" }, { name = "gsm8k-v1" }, { name = "math-env-v1" }, + { name = "mini-browse-apps-platform-v1" }, { name = "r2e-gym-v1" }, { name = "reverse-text-v1" }, { name = "scaleswe-v1" }, @@ -5935,6 +5947,7 @@ examples = [ { name = "glossary-v1", editable = "environments/glossary_v1" }, { name = "gsm8k-v1", editable = "environments/gsm8k_v1" }, { name = "math-env-v1", editable = "environments/math_env_v1" }, + { name = "mini-browse-apps-platform-v1", editable = "environments/mini_browse_apps_platform_v1" }, { name = "r2e-gym-v1", editable = "environments/r2e_gym_v1" }, { name = "reverse-text-v1", editable = "environments/reverse_text_v1" }, { name = "scaleswe-v1", editable = "environments/scaleswe_v1" },