From 0631af56b77f6a2e4a715b78e8224a6ba8b2ee66 Mon Sep 17 00:00:00 2001
From: Mika Senghaas <mail@mikasenghaas.de>
Date: Fri, 19 Jun 2026 04:36:36 +0000
Subject: [PATCH 1/7] feat(v1): add mini-browse-apps-platform-v1 environment

Sandboxed local-app browser-agent environment ported to the current v1 API.
Tasks are pulled dynamically from the Prime hub and cached locally. The browser
agent is proprietary and fetched at run time from a private repo (not vendored);
the harness stages it into the sandbox. Co-packages the taskset and harness.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../mini_browse_apps_platform_v1/README.md    |  54 ++
 .../mini_browse_apps_platform_v1/__init__.py  |  16 +
 .../harness/.gitignore                        |   2 +
 .../harness/__init__.py                       | 262 ++++++++
 .../harness/contract.py                       |  28 +
 .../harness/diagnostics.py                    |  51 ++
 .../harness/program.py                        | 560 ++++++++++++++++++
 .../mini_browse_apps_platform_v1/judge.py     | 150 +++++
 .../mini_browse_apps_platform_v1/taskset.py   | 533 +++++++++++++++++
 .../pyproject.toml                            |  13 +
 pyproject.toml                                |   2 +
 uv.lock                                       |  13 +
 12 files changed, 1684 insertions(+)
 create mode 100644 environments/mini_browse_apps_platform_v1/README.md
 create mode 100644 environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/__init__.py
 create mode 100644 environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/.gitignore
 create mode 100644 environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py
 create mode 100644 environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/contract.py
 create mode 100644 environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/diagnostics.py
 create mode 100644 environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/program.py
 create mode 100644 environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py
 create mode 100644 environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/taskset.py
 create mode 100644 environments/mini_browse_apps_platform_v1/pyproject.toml
diff --git a/environments/mini_browse_apps_platform_v1/README.md b/environments/mini_browse_apps_platform_v1/README.md
new file mode 100644
index 000000000..b769f268f
--- /dev/null
+++ b/environments/mini_browse_apps_platform_v1/README.md
@@ -0,0 +1,54 @@
+# mini-browse-apps-platform-v1
+
+Sandboxed local-app **browser-agent** environment. Each task boots a local single-page web app
+(SPA server + headless-Chromium CDP service) inside a per-task Docker image; a browser agent drives
+it by **screenshots → vision model → click/type actions**, then submits a structured JSON result. An
+LLM judge scores the submission against a deterministic answer key.
+
+The model must be **multimodal** (the agent's only input is screenshots).
+
+## Proprietary agent (fetched at run time)
+
+The browser agent is **proprietary and not vendored in this repo**. The harness fetches it at run
+time from a **private GitHub repo** (pinned to a commit), caches it under
+`~/.cache/verifiers/browse-agent/<sha>/`, then stages it into the sandbox. Configure via
+`--harness.*`:
+
+| Field | Default | Meaning |
+| --- | --- | --- |
+| `agent_repo` | `PrimeIntellect-ai/plex-mini-browse` | Private `owner/name` to fetch the agent from. |
+| `agent_ref` | _(unset)_ | **Pinned commit sha to fetch (required unless `agent_path` is set).** |
+| `agent_token_env` | `MINI_BROWSE_GITHUB_TOKEN` | Env var holding a GitHub token with read access to `agent_repo`. |
+| `agent_path` | _(unset)_ | Local dir containing `<agent_package>/` — skips the fetch (development). |
+
+So either run `export MINI_BROWSE_GITHUB_TOKEN=<token>` and pass `--harness.agent-ref <sha>`, or
+point `--harness.agent-path` at a local checkout (no token or ref needed).
+
+## Tasks (pulled dynamically)
+
+Tasks are **pulled from the Prime hub and cached locally** — nothing is bundled in this package.
+`load_tasks` pulls the dataset from `prime/mini-browse-apps-platform-v1` (private; via `prime env
+pull`) into `~/.cache/verifiers/mini-browse-apps/<version>/`. Override with `--taskset.dataset_path
+<file>` or repoint `--taskset.hub_env_id` / `--taskset.hub_version`.
+
+## Run
+
+The taskset and harness are co-packaged (resolved via `__all__`), so `--harness.id` matches the
+taskset id. The task image is a Prime-registry image, so use the `prime` runtime:
+
+```bash
+export MINI_BROWSE_GITHUB_TOKEN=<token>
+uv run eval mini-browse-apps-platform-v1 \
+  --harness.id mini-browse-apps-platform-v1 \
+  --harness.runtime.type prime \
+  --harness.agent-ref <agent-commit-sha> \
+  -m <multimodal-model> \
+  -n 1 -r 1 -c 1
+```
+
+## Reward & metrics
+
+`answer_key` (weight 1.0) judges the submitted result against the gold answer key (`judge_model`,
+default `google/gemini-3.1-pro-preview` via pinference); reward 1.0 == judge verdict "yes". Metrics:
+`result_present`, `submitted_result_present`, `agent_error`, `transcript_image_count`,
+`message_count`.
diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/__init__.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/__init__.py
new file mode 100644
index 000000000..9a4e10f35
--- /dev/null
+++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/__init__.py
@@ -0,0 +1,16 @@
+"""mini-browse-apps-platform-v1 — sandboxed local-app Mini Browse browser tasks (v1).
+
+Co-packages the taskset and its browser harness; both are resolved by id from this module's
+`__all__` (`--taskset.id` / `--harness.id mini-browse-apps-platform-v1`).
+"""
+
+from .harness import MiniBrowseHarness, MiniBrowseHarnessConfig
+from .taskset import MiniBrowseAppsConfig, MiniBrowseAppsTaskset, MiniBrowseAppTask
+
+__all__ = [
+    "MiniBrowseAppsTaskset",
+    "MiniBrowseAppsConfig",
+    "MiniBrowseAppTask",
+    "MiniBrowseHarness",
+    "MiniBrowseHarnessConfig",
+]
diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/.gitignore b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/.gitignore
new file mode 100644
index 000000000..887009b99
--- /dev/null
+++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/.gitignore
@@ -0,0 +1,2 @@
+# The browser agent is proprietary and fetched at run time from a private repo — never commit it.
+vendor/
diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py
new file mode 100644
index 000000000..8ded780df
--- /dev/null
+++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py
@@ -0,0 +1,262 @@
+"""Browser-app harness: stages a privately-distributed browser agent into the sandbox.
+
+The browser agent is proprietary and is NOT vendored in this repo. It is fetched at run time
+from a private, auth-gated GitHub repo (pinned to a commit), cached locally, then tarred and
+staged into the sandbox, where `program.py` (a uv script) imports and runs it. For local
+development, point `agent_path` at a checkout instead of fetching.
+"""
+
+from __future__ import annotations
+
+import io
+import os
+import shlex
+import shutil
+import tarfile
+import tempfile
+from pathlib import Path
+from typing import Literal
+
+import httpx
+from verifiers.v1.clients import RolloutContext
+from verifiers.v1.errors import HarnessError
+from verifiers.v1.harness import Harness, HarnessConfig
+from verifiers.v1.runtimes import DockerConfig, ProgramResult, Runtime, RuntimeConfig
+from verifiers.v1.trace import Trace
+
+from .contract import (
+    METRICS_PATH,
+    MiniBrowseTaskPayload,
+    PROGRESS_PATH,
+    RESULT_PATH,
+    TASK_PAYLOAD_PATH,
+    TRANSCRIPT_PATH,
+    WORKSPACE_ROOT,
+)
+from .diagnostics import read_jsonl_tail
+
+PROGRAM_SOURCE = (Path(__file__).resolve().parent / "program.py").read_text()
+
+AGENT_RUNTIME = "/opt/browse-agent-runtime"
+AGENT_TARBALL = "/tmp/vf-browse-agent-runtime.tgz"
+
+CoordinateMode = Literal["relative_1000", "absolute", "auto"]
+
+
+class MiniBrowseHarnessConfig(HarnessConfig):
+    """Config for the browser harness; its proprietary agent is fetched from a private repo."""
+
+    id: str = "mini-browse-apps-platform-v1"
+    runtime: RuntimeConfig = DockerConfig(image="python:3.12-slim-bookworm")
+    # --- proprietary agent source (not vendored; fetched at run time) ---
+    agent_repo: str = "PrimeIntellect-ai/plex-mini-browse"
+    """Private GitHub repo (owner/name) the agent is fetched from."""
+    agent_ref: str = ""
+    """Pinned commit sha to fetch (required unless `agent_path` is set)."""
+    agent_package: str = "mini_browse"
+    """Importable package dir within the repo (and on the sandbox PYTHONPATH) to stage."""
+    agent_token_env: str = "MINI_BROWSE_GITHUB_TOKEN"
+    """Env var holding a GitHub token with read access to `agent_repo`."""
+    agent_path: str | None = None
+    """Local dir containing `<agent_package>/` — when set, skips the GitHub fetch (dev)."""
+    agent_cache_dir: str | None = None
+    """Where fetched agent revisions are cached (default: ~/.cache/verifiers/browse-agent)."""
+    # --- agent behavior ---
+    max_steps: int = 75  # forwarded to program.py as --max-steps
+    coordinate_mode: CoordinateMode = "relative_1000"  # -> MINI_BROWSE_COORDINATE_MODE
+    keep_last_images: int = 3  # -> MINI_BROWSE_KEEP_LAST_IMAGES
+    image_compaction_at_tokens: int = 45_000  # -> MINI_BROWSE_IMAGE_COMPACTION_AT_TOKENS
+    include_builtin_tools: bool = False  # -> MINI_BROWSE_INCLUDE_BUILTIN_TOOLS ("1"/"0")
+    browser_start_min_interval_seconds: float = 0.0  # -> MINI_BROWSE_BROWSER_START_*
+    browser_start_jitter_seconds: float = 0.0
+    browser_start_max_in_flight: int = 0
+    record_frames: bool = False  # when true, MINI_BROWSE_RECORD_FRAMES_DIR is exported
+    task_payload_path: str = TASK_PAYLOAD_PATH  # --task
+    result_path: str = RESULT_PATH  # --result
+    transcript_path: str = TRANSCRIPT_PATH  # --transcript
+    metrics_path: str = METRICS_PATH  # --metrics
+    progress_path: str = PROGRESS_PATH  # --progress and MINI_BROWSE_PROGRESS_PATH
+    workspace_root: str = WORKSPACE_ROOT  # --workspace-root
+
+
+class MiniBrowseHarness(Harness[MiniBrowseHarnessConfig]):
+    """Stages the privately-fetched browser agent and executes its agent loop."""
+
+    SUPPORTS_TASK_TOOLS = False
+    SUPPORTS_MESSAGE_PROMPT = False
+
+    async def launch(
+        self,
+        ctx: RolloutContext,
+        trace: Trace,
+        runtime: Runtime,
+        endpoint: str,
+        secret: str,
+        mcp_urls: dict[str, str],
+    ) -> ProgramResult:  # validate the task, stage the agent, then run program.py
+        if mcp_urls:  # MCP task tools are rejected (SUPPORTS_TASK_TOOLS is False)
+            names = ", ".join(sorted(mcp_urls))
+            raise ValueError(
+                f"Browser harness does not expose v1 MCP task tools: {names}"
+            )
+        if trace.task.system_prompt:  # task-level system prompts are rejected
+            raise ValueError(
+                "Browser harness owns the system prompt; put task-specific instructions "
+                "in task.prompt or the task payload."
+            )
+        if not isinstance(trace.task.prompt, str):
+            raise ValueError("Browser harness requires a string task prompt")
+
+        await self._stage_agent(runtime)  # upload and unpack the agent first
+        env = {
+            **self.config.env,  # listed first, so the keys below win on conflict
+            "OPENAI_BASE_URL": endpoint,
+            "OPENAI_API_KEY": secret,
+            "OPENAI_MODEL": ctx.model,
+            "PYTHONPATH": self._pythonpath(),
+            "MINI_BROWSE_COORDINATE_MODE": self.config.coordinate_mode,
+            "MINI_BROWSE_KEEP_LAST_IMAGES": str(self.config.keep_last_images),
+            "MINI_BROWSE_IMAGE_COMPACTION_AT_TOKENS": str(
+                self.config.image_compaction_at_tokens
+            ),
+            "MINI_BROWSE_INCLUDE_BUILTIN_TOOLS": (
+                "1" if self.config.include_builtin_tools else "0"
+            ),
+            "MINI_BROWSE_BROWSER_START_MIN_INTERVAL_SECONDS": str(
+                self.config.browser_start_min_interval_seconds
+            ),
+            "MINI_BROWSE_BROWSER_START_JITTER_SECONDS": str(
+                self.config.browser_start_jitter_seconds
+            ),
+            "MINI_BROWSE_BROWSER_START_MAX_IN_FLIGHT": str(
+                self.config.browser_start_max_in_flight
+            ),
+            "MINI_BROWSE_PROGRESS_PATH": self.config.progress_path,
+        }
+        if self.config.record_frames:  # the frames dir is only exported on request
+            env["MINI_BROWSE_RECORD_FRAMES_DIR"] = "/logs/mini_browse/frames"
+
+        args = [
+            "--task",
+            self.config.task_payload_path,
+            "--result",
+            self.config.result_path,
+            "--transcript",
+            self.config.transcript_path,
+            "--metrics",
+            self.config.metrics_path,
+            "--progress",
+            self.config.progress_path,
+            "--max-steps",
+            str(self.config.max_steps),
+            "--workspace-root",
+            self.config.workspace_root,
+        ]
+        return await runtime.run_uv_script(PROGRAM_SOURCE, args=args, env=env)
+
+    async def _stage_agent(self, runtime: Runtime) -> None:
+        await runtime.write(AGENT_TARBALL, self._agent_tarball())
+        command = (
+            f"rm -rf {shlex.quote(AGENT_RUNTIME)} && "
+            f"mkdir -p {shlex.quote(AGENT_RUNTIME)} && "
+            f"tar -xzf {shlex.quote(AGENT_TARBALL)} -C {shlex.quote(AGENT_RUNTIME)}"
+        )
+        result = await runtime.run(["sh", "-c", command], {})
+        if result.exit_code != 0:
+            raise HarnessError(
+                f"agent staging failed: {result.stderr.strip()[-500:]}"
+            )
+
+    def _agent_tarball(self) -> bytes:
+        package = self._ensure_agent() / self.config.agent_package
+        if not package.is_dir():
+            raise HarnessError(
+                f"agent package {self.config.agent_package!r} not found under {package.parent}"
+            )
+        buffer = io.BytesIO()
+        with tarfile.open(fileobj=buffer, mode="w:gz") as archive:
+            archive.add(package, arcname=self.config.agent_package)
+        return buffer.getvalue()
+
+    def _ensure_agent(self) -> Path:
+        """Return a dir that contains `<agent_package>/` — a local checkout or the fetch cache."""
+        if self.config.agent_path:
+            return Path(self.config.agent_path).expanduser()
+        if not self.config.agent_ref:
+            raise HarnessError(
+                "set --harness.agent-ref to a pinned commit sha "
+                "(or --harness.agent-path to a local checkout for development)"
+            )
+        cache_root = (
+            Path(self.config.agent_cache_dir).expanduser()
+            if self.config.agent_cache_dir
+            else Path.home() / ".cache" / "verifiers" / "browse-agent"
+        )
+        dest = cache_root / self.config.agent_ref
+        if not (dest / self.config.agent_package).exists():
+            self._download_agent(dest)
+        return dest
+
+    def _download_agent(self, dest: Path) -> None:
+        token = os.environ.get(self.config.agent_token_env)
+        if not token:
+            raise HarnessError(
+                f"missing ${self.config.agent_token_env} to fetch the private agent repo "
+                f"{self.config.agent_repo!r}"
+            )
+        url = f"https://api.github.com/repos/{self.config.agent_repo}/tarball/{self.config.agent_ref}"
+        headers = {
+            "Authorization": f"Bearer {token}",
+            "Accept": "application/vnd.github+json",
+            "X-GitHub-Api-Version": "2022-11-28",
+        }
+        with tempfile.TemporaryDirectory(prefix="browse-agent-") as tmp:
+            archive = Path(tmp) / "agent.tar.gz"
+            # httpx drops the Authorization header on the cross-host redirect to codeload.
+            with httpx.stream(
+                "GET", url, headers=headers, follow_redirects=True, timeout=120
+            ) as resp:
+                if resp.status_code != 200:
+                    resp.read()
+                    raise HarnessError(
+                        f"fetching {self.config.agent_repo}@{self.config.agent_ref} failed: "
+                        f"HTTP {resp.status_code}"
+                    )
+                with open(archive, "wb") as handle:
+                    for chunk in resp.iter_bytes():
+                        handle.write(chunk)
+            extract = Path(tmp) / "extract"
+            extract.mkdir()
+            with tarfile.open(archive) as tar:
+                tar.extractall(extract, filter="data")
+            matches = sorted(extract.glob(f"*/{self.config.agent_package}"))
+            if not matches:
+                raise HarnessError(
+                    f"{self.config.agent_package!r} not found in "
+                    f"{self.config.agent_repo}@{self.config.agent_ref}"
+                )
+            dest.mkdir(parents=True, exist_ok=True)
+            staging = dest / (self.config.agent_package + ".tmp")
+            if staging.exists():
+                shutil.rmtree(staging)
+            shutil.copytree(matches[0], staging)
+            os.replace(staging, dest / self.config.agent_package)
+
+    def _pythonpath(self) -> str:
+        existing = self.config.env.get("PYTHONPATH", "")
+        entries = [AGENT_RUNTIME]
+        if existing:
+            entries.append(existing)
+        return ":".join(entries)
+
+
+def load_harness(config: MiniBrowseHarnessConfig) -> MiniBrowseHarness:
+    return MiniBrowseHarness(config)  # plain factory: no work beyond construction
+
+
+__all__ = [
+    "MiniBrowseHarness",
+    "MiniBrowseHarnessConfig",
+    "MiniBrowseTaskPayload",
+    "read_jsonl_tail",
+]
diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/contract.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/contract.py
new file mode 100644
index 000000000..5c514ce07
--- /dev/null
+++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/contract.py
@@ -0,0 +1,28 @@
+"""Public payload contract consumed by the Mini Browse harness."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from pydantic import BaseModel, ConfigDict
+
+TASK_PAYLOAD_PATH = "/task/mini_browse/task.json"
+RESULT_PATH = "/task/mini_browse/result.json"
+TRANSCRIPT_PATH = "/logs/mini_browse/transcript.json"
+METRICS_PATH = "/logs/mini_browse/metrics.json"
+PROGRESS_PATH = "/logs/mini_browse/progress.jsonl"
+WORKSPACE_ROOT = "/workspace/mini-browse"
+
+
+class MiniBrowseTaskPayload(BaseModel):
+    """Sandbox-visible task payload for the Mini Browse harness."""
+
+    model_config = ConfigDict(extra="forbid")  # unknown keys are rejected
+
+    instruction: str  # program.py requires this to be non-blank
+    output_schema: dict[str, Any]  # program.py requires a JSON object
+    browser_api_url: str  # exported as MINI_BROWSE_BROWSER_API_URL when non-empty
+    start_url: str = "about:blank"
+    http_proxy: str | None = None  # exported as PERPLEXITY_TAILSCALE_HTTP_PROXY when set
+    source: str = "verifiers-mini-browse"
+    task_preamble: str | None = None
diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/diagnostics.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/diagnostics.py
new file mode 100644
index 000000000..c4b7535fd
--- /dev/null
+++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/diagnostics.py
@@ -0,0 +1,51 @@
+"""Small helpers for surfacing Mini Browse sandbox diagnostics."""
+
+from __future__ import annotations
+
+import json
+from collections import deque
+from typing import Any
+
+
+async def read_jsonl_tail(
+    runtime: Any,
+    path: str,
+    *,
+    max_lines: int = 80,
+    max_chars: int = 20_000,
+) -> dict[str, Any]:
+    """Read a bounded JSONL tail from a sandbox artifact."""
+
+    try:
+        raw = await runtime.read(path)
+    except Exception as exc:
+        return {"path": path, "is_error": True, "error": str(exc)}
+
+    text = raw.decode("utf-8", errors="replace")
+    original_chars = len(text)
+    if max_chars > 0 and original_chars > max_chars:
+        text = text[-max_chars:]
+        first_newline = text.find("\n")
+        if first_newline >= 0:
+            text = text[first_newline + 1 :]
+
+    events: deque[Any] = deque(maxlen=max(0, max_lines))
+    parse_errors = 0
+    for line in text.splitlines():
+        line = line.strip()
+        if not line:
+            continue
+        try:
+            events.append(json.loads(line))
+        except json.JSONDecodeError:
+            parse_errors += 1
+            events.append(line[:1000])
+
+    return {
+        "path": path,
+        "is_error": False,
+        "events": list(events),
+        "event_count": len(events),
+        "parse_errors": parse_errors,
+        "truncated": original_chars > len(text),
+    }
diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/program.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/program.py
new file mode 100644
index 000000000..c0d6ef1c2
--- /dev/null
+++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/program.py
@@ -0,0 +1,560 @@
+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#   "aiohttp>=3.11",
+#   "openai>=2.0",
+#   "orjson>=3.10",
+#   "pillow>=11.0",
+#   "pydantic>=2.0",
+#   "pypdf>=5.4",
+#   "pypdfium2>=4.30",
+#   "python-pptx>=1.0",
+# ]
+# ///
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import os
+import re
+import time
+from pathlib import Path
+from typing import Any
+
+ERROR_CATEGORY_CODES = {
+    "none": 0,
+    "harness_disconnect": 1,
+    "request_too_large_bytes": 2,
+    "request_too_large_tokens": 3,
+    "model_rate_limit": 4,
+    "model_auth": 5,
+    "model_bad_request": 6,
+    "model_internal_error": 7,
+    "max_steps_exceeded": 8,
+    "browser_or_sandbox": 9,
+    "agent_logic_error": 10,
+    "unknown": 11,
+    "model_endpoint_gone": 12,
+    "model_connection_failure": 13,
+}
+
+TOOL_ERROR_PREFIXES = (
+    "ValidationError",
+    "KeyError",
+    "ValueError",
+    "RuntimeError",
+    "AttributeError",
+    "TypeError",
+    "Unknown tool",
+)
+
+TOOL_ERROR_BREAKDOWN_NAMES = ("computer", "read_page", "find", "get_page_text")
+HTTP_STATUS_RE = re.compile(
+    r"(?:Error code:|status(?: code)?[=:]?)\s*(\d{3})", re.IGNORECASE
+)
+DEFAULT_PROGRESS_PATH = "/logs/mini_browse/progress.jsonl"
+
+
+def _read_json(path: Path) -> Any:
+    return json.loads(path.read_text())
+
+
+def _read_optional_json(path: Path) -> Any:
+    if not path.exists():
+        return None
+    return _read_json(path)
+
+
+def _json_safe(value: Any) -> Any:
+    try:
+        json.dumps(value)
+        return value
+    except TypeError:
+        return repr(value)
+
+
+def _write_progress(progress_path: Path, event: str, **fields: Any) -> None:
+    try:  # append one JSON line (event, timestamp, and the given fields) to progress_path
+        progress_path.parent.mkdir(parents=True, exist_ok=True)
+        payload = {
+            "event": event,
+            "timestamp": time.time(),
+            **{key: _json_safe(value) for key, value in fields.items()},
+        }
+        with progress_path.open("a", encoding="utf-8") as handle:
+            handle.write(json.dumps(payload, ensure_ascii=False, sort_keys=True) + "\n")
+            handle.flush()
+    except Exception:  # best-effort: any failure to record progress is swallowed
+        return
+
+
+def _env_float(name: str, default: float = 0.0) -> float:
+    raw = os.environ.get(name, "").strip()
+    if not raw:
+        return default
+    try:
+        return max(0.0, float(raw))
+    except ValueError:
+        return default
+
+
+def _env_int(name: str, default: int = 0) -> int:
+    raw = os.environ.get(name, "").strip()
+    if not raw:
+        return default
+    try:
+        return max(0, int(raw))
+    except ValueError:
+        return default
+
+
+def _env_bool(name: str, default: bool = False) -> bool:
+    raw = os.environ.get(name)
+    if raw is None:
+        return default
+    return raw.strip().lower() not in {"", "0", "false", "no", "off"}
+
+
+def _http_status_from_exception(exc: BaseException | None) -> int | None:
+    if exc is None:
+        return None
+    status = getattr(exc, "status_code", None)
+    try:
+        return int(status) if status is not None else None
+    except (TypeError, ValueError):
+        return None
+
+
+def _http_status_from_text(text: str | None) -> int | None:
+    if not text:
+        return None
+    match = HTTP_STATUS_RE.search(text)
+    if not match:
+        return None
+    try:
+        return int(match.group(1))
+    except ValueError:
+        return None
+
+
+def _classify_exception(exc: BaseException) -> str:
+    if isinstance(exc, asyncio.CancelledError):
+        return "harness_disconnect"
+
+    try:
+        import openai
+
+        if isinstance(exc, openai.RateLimitError):
+            return "model_rate_limit"
+        if isinstance(exc, (openai.AuthenticationError, openai.PermissionDeniedError)):
+            return "model_auth"
+        if isinstance(exc, openai.BadRequestError):
+            text = str(exc).lower()
+            bytes_markers = (
+                "request entity too large",
+                "payload too large",
+                "413 request",
+                "413 payload",
+            )
+            if any(marker in text for marker in bytes_markers):
+                return "request_too_large_bytes"
+            token_markers = (
+                "context length",
+                "maximum context",
+                "too many tokens",
+                "context_length_exceeded",
+                "context window",
+                "input is too long",
+            )
+            if any(marker in text for marker in token_markers):
+                return "request_too_large_tokens"
+            return "model_bad_request"
+        status = _http_status_from_exception(exc)
+        if isinstance(exc, openai.NotFoundError) or status == 404:
+            return "model_endpoint_gone"
+        if isinstance(exc, openai.InternalServerError) or (
+            status is not None and 500 <= status < 600
+        ):
+            return "model_internal_error"
+        if isinstance(exc, (openai.APIConnectionError, openai.APITimeoutError)):
+            return "model_connection_failure"
+        if isinstance(exc, openai.APIError):
+            if status == 404:
+                return "model_endpoint_gone"
+            if status is not None and 500 <= status < 600:
+                return "model_internal_error"
+            return "model_connection_failure"
+    except ImportError:
+        pass
+
+    try:
+        import aiohttp
+
+        if isinstance(exc, aiohttp.ClientError):
+            return "model_connection_failure"
+    except ImportError:
+        pass
+
+    if isinstance(exc, TimeoutError):
+        return "model_connection_failure"
+    if isinstance(exc, ConnectionError):
+        return "model_connection_failure"
+    if isinstance(exc, OSError):
+        return "browser_or_sandbox"
+    if isinstance(
+        exc, (KeyError, TypeError, AttributeError, ValueError, RuntimeError, IndexError)
+    ):
+        return "agent_logic_error"
+    return "unknown"
+
+
+def _diagnose(exc: BaseException | None, error_text: str | None) -> dict[str, Any]:
+    if exc is not None:  # an exception takes precedence over error_text
+        category = _classify_exception(exc)
+        error_type = type(exc).__name__
+        excerpt = str(exc)[:1200]
+        http_status = _http_status_from_exception(exc) or _http_status_from_text(
+            excerpt
+        )
+    elif error_text:  # no exception: classify from the message text alone
+        text = str(error_text)
+        error_type = text.split(":", 1)[0][:120] if ":" in text else text[:120]
+        excerpt = text[:1200]
+        http_status = _http_status_from_text(text)
+        if http_status == 404:
+            category = "model_endpoint_gone"
+        elif http_status is not None and 500 <= http_status < 600:
+            category = "model_internal_error"
+        elif "maximum steps exceeded" in text.lower():
+            category = "max_steps_exceeded"
+        else:
+            category = "unknown"
+    else:  # nothing to report: emit the "none" category
+        return {
+            "error_type": None,
+            "error_category": "none",
+            "error_category_code": ERROR_CATEGORY_CODES["none"],
+            "error_excerpt": None,
+            "error_http_status": None,
+        }
+    return {
+        "error_type": error_type,
+        "error_category": category,
+        "error_category_code": ERROR_CATEGORY_CODES[category],
+        "error_excerpt": excerpt,
+        "error_http_status": http_status,
+    }
+
+
+def _count_image_parts(messages: list[dict[str, Any]]) -> int:
+    count = 0
+    for message in messages:
+        content = message.get("content") if isinstance(message, dict) else None
+        if isinstance(content, list):
+            for part in content:
+                if isinstance(part, dict) and part.get("type") == "image_url":
+                    count += 1
+    return count
+
+
+def _json_size_bytes(value: Any) -> int:  # UTF-8 byte length of value's JSON encoding
+    try:
+        return len(json.dumps(value, ensure_ascii=False).encode("utf-8"))
+    except Exception:  # unserializable or unencodable values report 0
+        return 0
+
+
+def _summarize_tool_errors(messages: list[dict[str, Any]]) -> dict[str, Any]:
+    total = 0
+    validation = 0
+    streak = 0
+    max_streak = 0
+    by_tool: dict[str, int] = {}
+    unique_kinds: set[str] = set()
+    id_to_tool: dict[str, str] = {}
+
+    for message in messages:
+        if not isinstance(message, dict):
+            continue
+        role = message.get("role")
+        if role == "assistant":
+            tool_calls = message.get("tool_calls") or []
+            if not isinstance(tool_calls, list):
+                continue
+            for tool_call in tool_calls:
+                if isinstance(tool_call, str):
+                    try:
+                        tool_call = json.loads(tool_call)
+                    except json.JSONDecodeError:
+                        continue
+                if not isinstance(tool_call, dict):
+                    continue
+                tool_call_id = tool_call.get("id")
+                function = tool_call.get("function")
+                if isinstance(function, dict):
+                    name = function.get("name")
+                else:
+                    name = tool_call.get("name")
+                if isinstance(tool_call_id, str) and isinstance(name, str):
+                    id_to_tool[tool_call_id] = name
+        elif role == "tool":
+            content = message.get("content")
+            if not isinstance(content, str):
+                streak = 0
+                continue
+            stripped = content.lstrip()
+            if not any(stripped.startswith(prefix) for prefix in TOOL_ERROR_PREFIXES):
+                streak = 0
+                continue
+
+            total += 1
+            if stripped.startswith("ValidationError"):
+                validation += 1
+            tool_name = id_to_tool.get(message.get("tool_call_id") or "", "unknown")
+            by_tool[tool_name] = by_tool.get(tool_name, 0) + 1
+            unique_kinds.add(stripped.split("\n", 1)[0][:200])
+            streak += 1
+            max_streak = max(max_streak, streak)
+
+    return {
+        "tool_error_count": total,
+        "tool_error_validation": validation,
+        "tool_error_max_streak": max_streak,
+        "tool_error_unique_kinds": len(unique_kinds),
+        "tool_error_by_tool": by_tool,
+    }
+
+
+def _load_task_payload(path: Path) -> dict[str, Any]:
+    payload = _read_json(path)
+    if not isinstance(payload, dict):
+        raise ValueError(f"Mini Browse task payload must be an object: {path}")
+    instruction = payload.get("instruction")
+    if not isinstance(instruction, str) or not instruction.strip():
+        raise ValueError("Mini Browse task payload requires non-empty instruction")
+    output_schema = payload.get("output_schema")
+    if not isinstance(output_schema, dict):
+        raise ValueError("Mini Browse task payload requires object output_schema")
+    return payload
+
+
+async def _run(args: argparse.Namespace) -> int:
+    """Run one task with the browser agent and write result/transcript/metrics."""
+    from mini_browse import run_bcu_task
+
+    task_path = Path(args.task)
+    result_path = Path(args.result)
+    transcript_path = Path(args.transcript)
+    metrics_path = Path(args.metrics)
+    progress_path = Path(args.progress)
+    workspace_root = Path(args.workspace_root)
+
+    for artifact_path in (result_path, transcript_path, metrics_path, progress_path):
+        artifact_path.parent.mkdir(parents=True, exist_ok=True)
+    workspace_root.mkdir(parents=True, exist_ok=True)
+    os.environ["MINI_BROWSE_PROGRESS_PATH"] = str(progress_path)
+    _write_progress(
+        progress_path,
+        "harness_program_start",
+        task_path=str(task_path),
+        result_path=str(result_path),
+        transcript_path=str(transcript_path),
+        metrics_path=str(metrics_path),
+        workspace_root=str(workspace_root),
+    )
+
+    task_payload = _load_task_payload(task_path)
+    _write_progress(
+        progress_path,
+        "task_payload_loaded",
+        source=task_payload.get("source"),
+        start_url=task_payload.get("start_url"),
+        instruction_chars=len(task_payload.get("instruction") or ""),
+        output_schema_keys=sorted((task_payload.get("output_schema") or {}).keys()),
+        has_browser_api_url=bool(task_payload.get("browser_api_url")),
+        has_http_proxy=bool(task_payload.get("http_proxy")),
+    )
+    instruction = task_payload["instruction"].strip()
+    output_schema = task_payload["output_schema"]
+    start_url = str(task_payload.get("start_url") or "about:blank")
+    browser_api_url = str(task_payload.get("browser_api_url") or "").strip()
+    if browser_api_url:
+        os.environ["MINI_BROWSE_BROWSER_API_URL"] = browser_api_url
+    http_proxy = str(task_payload.get("http_proxy") or "").strip()
+    if http_proxy:
+        os.environ["PERPLEXITY_TAILSCALE_HTTP_PROXY"] = http_proxy
+    source = str(task_payload.get("source") or "verifiers-mini-browse")
+    task_preamble = str(
+        task_payload.get("task_preamble")
+        or os.environ.get("MINI_BROWSE_TASK_PREAMBLE")
+        or ""
+    )
+    conversation = (
+        _read_optional_json(Path(args.conversation)) if args.conversation else None
+    )
+    if conversation is not None and not isinstance(conversation, list):
+        raise ValueError("Mini Browse conversation payload must be a list")
+    model = os.environ.get("OPENAI_MODEL", "intercepted/model")
+    coordinate_mode = os.environ.get("MINI_BROWSE_COORDINATE_MODE", "relative_1000")
+
+    payload: dict[str, Any]
+    messages: list[dict[str, Any]] = []
+    exc_caught: BaseException | None = None
+    try:
+        _write_progress(
+            progress_path,
+            "run_bcu_task_start",
+            model=model,
+            coordinate_mode=coordinate_mode,
+            max_steps=int(args.max_steps),
+        )
+        run_result = await run_bcu_task(
+            task=instruction,
+            url=start_url,
+            output_schema=output_schema,
+            model=model,
+            max_steps=int(args.max_steps),
+            workspace_root=workspace_root,
+            include_builtin_tools=_env_bool("MINI_BROWSE_INCLUDE_BUILTIN_TOOLS"),
+            source=source,
+            task_preamble=task_preamble,
+            coordinate_mode=coordinate_mode,
+            conversation=conversation,
+            browser_start_min_interval_seconds=_env_float(
+                "MINI_BROWSE_BROWSER_START_MIN_INTERVAL_SECONDS"
+            ),
+            browser_start_jitter_seconds=_env_float(
+                "MINI_BROWSE_BROWSER_START_JITTER_SECONDS"
+            ),
+            browser_start_max_in_flight=_env_int(
+                "MINI_BROWSE_BROWSER_START_MAX_IN_FLIGHT"
+            ),
+        )
+        _write_progress(
+            progress_path,
+            "run_bcu_task_done",
+            is_error=run_result.is_error,
+            submitted_result_present=bool(run_result.submitted_result),
+            message_count=len(run_result.messages),
+            browser_session_id=run_result.browser_session_id,
+        )
+        messages = run_result.messages
+        payload = {
+            "response": run_result.response,
+            "is_error": run_result.is_error,
+            "error": run_result.error,
+            "is_cancelled": run_result.is_cancelled,
+            "browser_session_id": run_result.browser_session_id,
+            "tab_group_id": run_result.tab_group_id,
+            "submitted_result": _json_safe(run_result.submitted_result),
+            "workspace_root": run_result.workspace_root,
+            "message_count": len(messages),
+            "coordinate_mode": coordinate_mode,
+        }
+    except BaseException as exc:
+        exc_caught = exc
+        _write_progress(
+            progress_path,
+            "run_bcu_task_exception",
+            error_type=type(exc).__name__,
+            error_excerpt=str(exc)[:500],
+            is_cancelled=isinstance(exc, asyncio.CancelledError),
+        )
+        payload = {
+            "response": "",
+            "is_error": True,
+            "error": f"{type(exc).__name__}: {exc}",
+            "is_cancelled": isinstance(exc, asyncio.CancelledError),
+            "browser_session_id": None,
+            "tab_group_id": None,
+            "submitted_result": None,
+            "workspace_root": str(workspace_root),
+            "message_count": len(messages),
+            "coordinate_mode": coordinate_mode,
+        }
+
+    diagnostics = _diagnose(exc_caught, payload.get("error"))
+    payload.update(diagnostics)
+    payload["transcript_image_count"] = _count_image_parts(messages)
+    payload["transcript_json_bytes"] = _json_size_bytes(messages)
+    payload.update(_summarize_tool_errors(messages))
+
+    submitted = payload.get("submitted_result")
+    response = payload.get("response")
+    answered = bool(submitted) or bool(isinstance(response, str) and response.strip())
+    category = payload.get("error_category")
+    metrics = {
+        "answered": float(answered and not payload.get("is_error")),
+        "is_error": float(bool(payload.get("is_error"))),
+        "message_count": float(payload.get("message_count") or 0),
+        "submitted_result_present": float(bool(submitted)),
+        "has_browser_session": float(bool(payload.get("browser_session_id"))),
+        "error_category_code": float(payload.get("error_category_code") or 0),
+        "error_http_status": float(payload.get("error_http_status") or 0),
+        "transcript_image_count": float(payload.get("transcript_image_count") or 0),
+        "transcript_json_bytes": float(payload.get("transcript_json_bytes") or 0),
+        "tool_error_count": float(payload.get("tool_error_count") or 0),
+        "tool_error_validation": float(payload.get("tool_error_validation") or 0),
+        "tool_error_max_streak": float(payload.get("tool_error_max_streak") or 0),
+        "tool_error_unique_kinds": float(payload.get("tool_error_unique_kinds") or 0),
+    }
+    for category_name in ERROR_CATEGORY_CODES:
+        if category_name != "none":
+            metrics[f"error_{category_name}"] = float(category == category_name)
+    by_tool = payload.get("tool_error_by_tool") or {}
+    for tool_name in TOOL_ERROR_BREAKDOWN_NAMES:
+        metrics[f"tool_error_{tool_name}"] = float(by_tool.get(tool_name, 0))
+
+    for output_path, value in zip(
+        (result_path, transcript_path, metrics_path), (payload, messages, metrics)
+    ):
+        text = json.dumps(value, ensure_ascii=False, indent=2)
+        output_path.write_text(text, encoding="utf-8")  # never the locale default
+    _write_progress(
+        progress_path,
+        "harness_program_artifacts_written",
+        error_category=category,
+        is_error=payload.get("is_error"),
+        result_path=str(result_path),
+        transcript_path=str(transcript_path),
+        metrics_path=str(metrics_path),
+    )
+
+    print(
+        json.dumps(
+            {
+                "result_path": str(result_path),
+                "metrics_path": str(metrics_path),
+                "transcript_path": str(transcript_path),
+                "progress_path": str(progress_path),
+                "error_category": category,
+                "error_type": payload.get("error_type"),
+                "error_excerpt": payload.get("error_excerpt"),
+            }
+        )
+    )
+    if exc_caught is not None and not isinstance(exc_caught, Exception):
+        raise exc_caught
+    return 0
+
+
+def _parse_args() -> argparse.Namespace:
+    """Parse the harness command line (artifact paths, step budget, workspace)."""
+    cli = argparse.ArgumentParser(description="Run the Mini Browse harness.")
+    # These four artifact paths have no default and must be supplied.
+    for flag in ("--task", "--result", "--transcript", "--metrics"):
+        cli.add_argument(flag, required=True)
+    cli.add_argument("--progress", default=DEFAULT_PROGRESS_PATH)
+    cli.add_argument("--conversation")
+    cli.add_argument("--max-steps", type=int, default=75)
+    cli.add_argument("--workspace-root", default="/workspace/mini-browse")
+    return cli.parse_args()
+
+
+def main() -> int:
+    return asyncio.run(_run(_parse_args()))  # becomes the process exit code
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py
new file mode 100644
index 000000000..b1228336e
--- /dev/null
+++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py
@@ -0,0 +1,150 @@
+"""LLM judge support for Mini Browse local-app tasks."""
+
+from __future__ import annotations
+
+import json
+import os
+from pathlib import Path
+from typing import Any
+
+from openai import AsyncOpenAI
+
+JUDGE_TEMPERATURE = 0
+
+JUDGE_PROMPT = """You evaluate a browser automation agent's submitted result for a deterministic local flight-search task.
+
+Use the evaluation contract and gold answer as the source of truth. Score only
+expected_fields where score is true or absent. Ignore verifier metadata, ids,
+internal keys, hidden seed fields, and non-scoreable diagnostics.
+
+Treat equivalent formatting as correct when the same fact is clearly attached to
+the same flight leg, date, row, provider, fare, comparison slot, or outcome.
+Examples: "Nonstop" equals "0 stops"; prices match after removing currency
+symbols and commas; date formats match when they refer to the same calendar date;
+duration strings match when they have the same total minutes.
+
+Extra fields are fine unless they contradict the gold answer. Critical fields
+with critical=true are hard gates: if any scoreable critical field is missing or
+wrong, the verdict must be "no".
+
+Return exactly one JSON object, no prose and no code fence:
+{
+  "correct_fields": <integer>,
+  "total_fields": <integer>,
+  "score": <number from 0 to 1>,
+  "verdict": "yes" | "partial" | "no",
+  "explanation": "<one concise sentence>",
+  "field_verdicts": [
+    {
+      "field_path": "<expected field path>",
+      "verdict": "exact_match" | "semantic_match" | "wrong" | "missing",
+      "reason": "<short reason>"
+    }
+  ]
+}
+"""
+
+
+async def judge_answer_key(
+    *,
+    task_instruction: str,
+    submitted_result: Any,
+    answer_key: dict[str, Any],
+    output_schema: dict[str, Any],
+    model: str,
+    base_url: str | None,
+    api_key_env: str,
+) -> dict[str, Any]:
+    """Ask the judge model to grade ``submitted_result`` and return its parsed JSON."""
+    judge_input = {
+        "task_instruction": task_instruction,
+        "submitted_result": submitted_result,
+        "evaluation_contract": answer_key.get("evaluator") or {},
+        "gold_answer": answer_key.get("gold_answer") or answer_key,
+        "output_schema": output_schema,
+    }
+    client = judge_client(base_url, api_key_env)
+    user_content = json.dumps(judge_input, ensure_ascii=False, sort_keys=True)
+    completion = await client.chat.completions.create(
+        model=model,
+        messages=[
+            {"role": "system", "content": JUDGE_PROMPT},
+            {"role": "user", "content": user_content},
+        ],
+        temperature=JUDGE_TEMPERATURE,
+        response_format={"type": "json_object"},
+    )
+    # A missing message body is graded like an empty JSON object.
+    return parse_json_object(completion.choices[0].message.content or "{}")
+
+
+def judge_client(base_url: str | None, api_key_env: str) -> AsyncOpenAI:
+    """Build the judge's OpenAI client; the API key is read from ``api_key_env``."""
+    api_key = os.environ.get(api_key_env)
+    if not api_key:
+        raise ValueError(f"Missing judge API key env var {api_key_env}")
+    options: dict[str, Any] = {"api_key": api_key}
+    if base_url:
+        options["base_url"] = base_url
+    if headers := prime_default_headers(base_url):
+        options["default_headers"] = headers
+    return AsyncOpenAI(**options)
+
+
+def score_from_judge_payload(payload: dict[str, Any]) -> float:
+    """Map a judge payload to [0, 1]: field ratio, else ``score``, else ``verdict``."""
+    correct, total = payload.get("correct_fields"), payload.get("total_fields")
+    # bool is an int subclass, so type() keeps true/false out of the ratio.
+    if type(correct) is int and type(total) is int and total > 0:
+        return max(0.0, min(1.0, correct / total))
+    score = payload.get("score")
+    # NaN compares unequal to itself; without this it would clamp up to 1.0.
+    if type(score) in (int, float) and score == score:
+        return max(0.0, min(1.0, float(score)))
+    verdict = str(payload.get("verdict") or "").lower()
+    if verdict == "yes":
+        return 1.0
+    return 0.5 if verdict == "partial" else 0.0
+
+
+def parse_json_object(content: str) -> dict[str, Any]:
+    """Parse the judge reply as a JSON object; malformed text scores as "no"."""
+    failed = {"score": 0.0, "explanation": content[:500], "verdict": "no"}
+    try:
+        parsed = json.loads(content)
+    except json.JSONDecodeError:
+        start, end = content.find("{"), content.rfind("}")
+        if start < 0 or end <= start:
+            return failed
+        try:  # the outermost {...} span may itself be invalid JSON
+            parsed = json.loads(content[start : end + 1])
+        except json.JSONDecodeError:
+            return failed
+    if not isinstance(parsed, dict):
+        return {**failed, "explanation": "judge returned non-object"}
+    return parsed
+
+
+def prime_team_id() -> str | None:
+    """Return the Prime team id from the environment or ``~/.prime/config.json``."""
+    # Environment variables take precedence over the on-disk config file.
+    for name in ("PRIME_TEAM_ID", "PI_TEAM_ID", "X_PRIME_TEAM_ID"):
+        value = os.environ.get(name)
+        if value:
+            return value
+    config_path = Path.home() / ".prime" / "config.json"
+    try:
+        config = json.loads(config_path.read_text(encoding="utf-8"))
+    except (OSError, ValueError):
+        # Missing or unreadable file, or bytes that are not UTF-8 JSON
+        # (UnicodeDecodeError and JSONDecodeError are both ValueError).
+        return None
+    team_id = config.get("team_id") if isinstance(config, dict) else None
+    return str(team_id) if team_id else None
+
+
+def prime_default_headers(base_url: str | None) -> dict[str, str]:
+    """Return the team-id header, but only for base URLs containing "pinference"."""
+    is_prime = base_url is not None and "pinference" in base_url.lower()
+    team_id = prime_team_id() if is_prime else None
+    return {"X-Prime-Team-ID": team_id} if team_id else {}
diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/taskset.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/taskset.py
new file mode 100644
index 000000000..84af873c4
--- /dev/null
+++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/taskset.py
@@ -0,0 +1,533 @@
+"""mini-browse-apps-platform-v1: local-app Mini Browse tasks pulled from the Prime hub."""
+
+from __future__ import annotations
+
+import gzip
+import hashlib
+import json
+import os
+import shlex
+import shutil
+import sqlite3
+import subprocess
+import tempfile
+import zlib
+from pathlib import Path
+from typing import Any
+
+from pydantic import Field
+from verifiers.v1.errors import TasksetError
+
+import verifiers.v1 as vf
+
+from .harness.contract import (
+    METRICS_PATH,
+    PROGRESS_PATH,
+    RESULT_PATH,
+    TASK_PAYLOAD_PATH,
+    TRANSCRIPT_PATH,
+    MiniBrowseTaskPayload,
+)
+from .harness.diagnostics import read_jsonl_tail
+from .judge import judge_answer_key, score_from_judge_payload
+
+DEFAULT_SANDBOX_IMAGE = (
+    "team-cmlr3u2er002zhr01tj8f48ts/"
+    "mini-browse-apps:destination-autocomplete-tight-20260528-0027"
+)
+DEFAULT_HUB_ENV_ID = "prime/mini-browse-apps-platform-v1"
+DEFAULT_DATASET_FILENAME = "google_flights_10.jsonl.gz"
+
+APP_PORT = 5173
+CDP_PORT = 18080
+APP_URL = f"http://127.0.0.1:{APP_PORT}"
+BROWSER_API_URL = f"http://127.0.0.1:{CDP_PORT}"
+WORKDIR = "/workspace"
+APP_SEED_PATH = "/task/app_seed.json"
+SERVICE_LOG_DIR = "/logs/services"
+APP_LOG_PATH = f"{SERVICE_LOG_DIR}/app.log"
+CDP_LOG_PATH = f"{SERVICE_LOG_DIR}/cdp.log"
+APP_SERVER = "/opt/mini-browse-services/spa_server.py"
+CDP_SERVER = "/opt/mini-browse-services/local_cdp_service.py"
+APP_ROOT = "/opt/mini-browse-app/dist"
+
+SETUP_TIMEOUT_SECONDS = 600
+FINALIZE_TIMEOUT_SECONDS = 120
+SCORING_TIMEOUT_SECONDS = 180
+DEFAULT_TIMEOUT_SECONDS = 3600.0
+DEFAULT_SANDBOX_CPU = 2
+DEFAULT_SANDBOX_MEMORY_GB = 4
+DEFAULT_SANDBOX_DISK_GB = 10
+TASKSET_SHUFFLE_SEED = "google_flights_kernel_v1_dense_hard_no_bag_1188_order_v1"
+
+
+class MiniBrowseAppTask(vf.Task):
+    """One Mini Browse task backed by a sandboxed local web app."""
+
+    prompt: str  # instruction text sent to the agent and to the judge
+    output_schema: dict[str, Any]  # schema object forwarded in the task payload
+    answer_key: dict[str, Any]  # gold answer / evaluator data given to the judge
+    app_seed_ref: str | None = None  # key used to look up the app seed at setup
+    metadata: dict[str, Any] = Field(default_factory=dict)
+
+
+class MiniBrowseAppsConfig(vf.TasksetConfig):
+    id: str = "mini-browse-apps-platform-v1"
+    dataset_path: str | None = None
+    """Explicit local dataset (JSONL/JSONL.GZ); when set, skips the hub pull."""
+    hub_env_id: str = DEFAULT_HUB_ENV_ID
+    """Prime hub environment the dataset is pulled from when no `dataset_path` is given."""
+    hub_version: str = "latest"
+    dataset_filename: str = DEFAULT_DATASET_FILENAME
+    cache_dir: str | None = None
+    """Where the pulled dataset is cached (default: ~/.cache/verifiers/mini-browse-apps)."""
+    seed_store_path: str | None = None
+    task_indices: list[int] | None = None
+    task_profile: str = "default"
+    shuffle_tasks: bool = True
+    taskset_shuffle_seed: str = TASKSET_SHUFFLE_SEED
+    sandbox_image: str = DEFAULT_SANDBOX_IMAGE
+    timeout_seconds: float = DEFAULT_TIMEOUT_SECONDS
+    judge_model: str = "google/gemini-3.1-pro-preview"
+    judge_base_url: str | None = "https://api.pinference.ai/api/v1"
+    judge_api_key_env: str = "PRIME_API_KEY"
+
+
+class MiniBrowseAppsTaskset(vf.Taskset[MiniBrowseAppTask, MiniBrowseAppsConfig]):
+    """Owns local-app rows, sandbox app startup, and submitted-result judging."""
+
+    NEEDS_CONTAINER = True
+
+    def __init__(self, config: MiniBrowseAppsConfig) -> None:
+        super().__init__(config)
+        self._inline_app_seeds: dict[str, dict[str, Any]] = {}
+
+    def load_tasks(self) -> list[MiniBrowseAppTask]:
+        rows = self.load_rows()
+        rows = self.filter_rows(rows)
+        if self.config.shuffle_tasks:
+            rows = self.stable_shuffle_rows(rows)
+        if self.config.task_indices is not None:
+            rows = self.select_task_indices(rows)
+        if not rows:
+            raise ValueError("No Mini Browse app tasks were loaded")
+        return [self.normalize_row(i, row) for i, row in enumerate(rows)]
+
+    async def setup(self, task: MiniBrowseAppTask, runtime: vf.Runtime) -> None:
+        app_seed = self.app_seed_for_task(task)
+        await ensure_runtime_dirs(runtime)
+        await write_runtime_json(runtime, APP_SEED_PATH, app_seed)
+        public_payload = MiniBrowseTaskPayload(
+            instruction=task.prompt,
+            start_url=APP_URL,
+            output_schema=task.output_schema,
+            browser_api_url=BROWSER_API_URL,
+            source="mini-browse-apps-platform-v1",
+        )
+        await runtime.write(
+            TASK_PAYLOAD_PATH,
+            public_payload.model_dump_json(indent=2).encode("utf-8"),
+        )
+        await start_services(runtime)
+        await wait_for_services(runtime)
+
+    async def finalize(
+        self, task: MiniBrowseAppTask, trace: vf.Trace, runtime: vf.Runtime
+    ) -> None:
+        del task
+        result = await read_runtime_json(runtime, RESULT_PATH)
+        metrics = await read_runtime_json(runtime, METRICS_PATH)
+        trace.info["mini_browse_result"] = result
+        trace.info["mini_browse_metrics"] = metrics
+        trace.info["mini_browse_artifacts"] = {
+            "result_path": RESULT_PATH,
+            "transcript_path": TRANSCRIPT_PATH,
+            "metrics_path": METRICS_PATH,
+            "progress_path": PROGRESS_PATH,
+            "task_payload_path": TASK_PAYLOAD_PATH,
+            "app_seed_path": APP_SEED_PATH,
+            "app_log_path": APP_LOG_PATH,
+            "cdp_log_path": CDP_LOG_PATH,
+        }
+        if isinstance(result, dict):
+            trace.info["submitted_result"] = result.get("submitted_result")
+            if result.get("is_error"):
+                trace.info["mini_browse_progress_tail"] = await read_jsonl_tail(
+                    runtime,
+                    PROGRESS_PATH,
+                )
+
+    @vf.reward(weight=1.0)
+    async def answer_key(self, task: MiniBrowseAppTask, trace: vf.Trace) -> float:
+        result = trace_result(trace)
+        submitted = result.get("submitted_result")
+        if result.get("is_error") or not submitted:
+            trace.info["mini_browse_judge"] = {
+                "verdict": "no",
+                "explanation": result.get("error") or "missing submitted result",
+            }
+            return 0.0
+
+        judge_payload = await judge_answer_key(
+            task_instruction=task.prompt,
+            submitted_result=submitted,
+            answer_key=task.answer_key,
+            output_schema=task.output_schema,
+            model=self.config.judge_model,
+            base_url=self.config.judge_base_url,
+            api_key_env=self.config.judge_api_key_env,
+        )
+        trace.info["mini_browse_judge"] = judge_payload
+        return score_from_judge_payload(judge_payload)
+
+    @vf.metric
+    async def result_present(self, trace: vf.Trace) -> float:
+        return float(bool(trace_result(trace)))
+
+    @vf.metric
+    async def submitted_result_present(self, trace: vf.Trace) -> float:
+        return float(bool(trace_result(trace).get("submitted_result")))
+
+    @vf.metric
+    async def agent_error(self, trace: vf.Trace) -> float:
+        return float(bool(trace_result(trace).get("is_error")))
+
+    @vf.metric
+    async def transcript_image_count(self, trace: vf.Trace) -> float:
+        return metric(trace, "transcript_image_count")
+
+    @vf.metric
+    async def message_count(self, trace: vf.Trace) -> float:
+        return metric(trace, "message_count")
+
+    def load_rows(self) -> list[dict[str, Any]]:
+        """Read every non-blank JSONL line of the dataset, gunzipping ``*.gz``."""
+        path = self.resolved_dataset_path()
+        # A ".jsonl.gz" name also has suffix ".gz", so one check covers both.
+        opener = gzip.open if path.suffix == ".gz" else open
+        with opener(path, "rt", encoding="utf-8") as handle:
+            return [json.loads(line) for line in handle if line.strip()]
+
+    def resolved_dataset_path(self) -> Path:
+        if self.config.dataset_path:
+            path = Path(self.config.dataset_path).expanduser()
+            if not path.exists():
+                raise FileNotFoundError(f"Mini Browse app dataset not found: {path}")
+            return path
+        return self.ensure_cached_dataset()
+
+    def ensure_cached_dataset(self) -> Path:
+        cache_root = (
+            Path(self.config.cache_dir).expanduser()
+            if self.config.cache_dir
+            else Path.home() / ".cache" / "verifiers" / "mini-browse-apps"
+        )
+        cached = cache_root / self.config.hub_version / self.config.dataset_filename
+        if not cached.exists():
+            cached.parent.mkdir(parents=True, exist_ok=True)
+            self.pull_dataset_into(cached)
+        return cached
+
+    def pull_dataset_into(self, dest: Path) -> None:
+        """Pull the env package from the Prime hub into a temp dir and copy the dataset out."""
+        with tempfile.TemporaryDirectory(prefix="mini-browse-hub-") as tmp:
+            result = subprocess.run(
+                [
+                    "prime", "env", "pull", self.config.hub_env_id,
+                    "-v", self.config.hub_version, "-t", tmp, "--plain",
+                ],
+                capture_output=True,
+                text=True,
+            )
+            if result.returncode != 0:
+                detail = (result.stderr or result.stdout).strip()[-1000:]
+                raise RuntimeError(
+                    f"`prime env pull {self.config.hub_env_id}` failed: {detail}"
+                )
+            matches = sorted(Path(tmp).rglob(self.config.dataset_filename))
+            if not matches:
+                raise FileNotFoundError(
+                    f"{self.config.dataset_filename!r} not found in pulled hub env "
+                    f"{self.config.hub_env_id!r}"
+                )
+            staging = dest.with_name(dest.name + ".tmp")
+            shutil.copyfile(matches[0], staging)
+            os.replace(staging, dest)
+
+    def filter_rows(self, rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
+        if self.config.task_profile == "default":
+            return rows
+        return [row for row in rows if source_profile(row) == self.config.task_profile]
+
+    def stable_shuffle_rows(self, rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
+        seed = self.config.taskset_shuffle_seed
+
+        def key(row: dict[str, Any]) -> tuple[str, str]:
+            info = decode_info(row.get("info") or {})
+            task_id = str(row.get("task_id") or info.get("task_name") or "")
+            digest = hashlib.sha256(f"{seed}:{task_id}".encode("utf-8")).hexdigest()
+            return digest, task_id
+
+        return sorted(rows, key=key)
+
+    def select_task_indices(self, rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
+        """Pick ``config.task_indices`` from ``rows`` in the configured order."""
+        wanted = self.config.task_indices or []
+        for index in wanted:
+            if not 0 <= index < len(rows):
+                raise ValueError(
+                    f"task_indices contains out-of-range index {index}; "
+                    f"filtered taskset has {len(rows)} rows"
+                )
+        return [rows[index] for index in wanted]
+
+    def normalize_row(self, index: int, row: dict[str, Any]) -> MiniBrowseAppTask:
+        """Validate one dataset row and build its task (``index`` becomes ``idx``)."""
+        info = decode_info(row.get("info") or {})
+        raw_instruction = info.get("instruction") or row.get("question")
+        if not isinstance(raw_instruction, str) or not raw_instruction.strip():
+            raise ValueError(f"row {index} is missing a task instruction")
+        output_schema = info.get("output_schema")
+        if not isinstance(output_schema, dict):
+            raise ValueError(f"row {index} is missing output_schema")
+        answer_key = info.get("answer_key") or parse_answer(row.get("answer"))
+        if not isinstance(answer_key, dict):
+            raise ValueError(f"row {index} is missing answer_key")
+        task_name = str(row.get("task_id") or info.get("task_name") or index)
+        app_seed, app_seed_ref = info.get("app_seed"), info.get("app_seed_ref")
+        if app_seed is not None and not isinstance(app_seed, dict):
+            raise ValueError(f"row {index} has non-object app_seed")
+        if app_seed is not None:
+            # An inline seed without a ref still needs a key for app_seed_for_task.
+            app_seed_ref = app_seed_ref or f"inline-app-seed-{index}"
+            self._inline_app_seeds[str(app_seed_ref)] = app_seed
+        return MiniBrowseAppTask(
+            idx=index,
+            name=task_name,
+            prompt=raw_instruction.strip(),
+            image=self.config.sandbox_image,
+            workdir=WORKDIR,
+            timeout=vf.TaskTimeout(
+                setup=SETUP_TIMEOUT_SECONDS,
+                harness=self.config.timeout_seconds,
+                finalize=FINALIZE_TIMEOUT_SECONDS,
+                scoring=SCORING_TIMEOUT_SECONDS,
+            ),
+            resources=vf.TaskResources(
+                cpu=DEFAULT_SANDBOX_CPU,
+                memory=DEFAULT_SANDBOX_MEMORY_GB,
+                disk=DEFAULT_SANDBOX_DISK_GB,
+            ),
+            output_schema=output_schema,
+            answer_key=answer_key,
+            app_seed_ref=str(app_seed_ref) if app_seed_ref else None,
+            metadata={
+                "task_name": info.get("task_name"),
+                "task_id": row.get("task_id") or answer_key.get("task_id"),
+                "answer_kind": answer_key.get("answer_kind"),
+                "source_profile": source_profile(row),
+                "source_dataset": info.get("source_dataset"),
+            },
+        )
+
+    def app_seed_for_task(self, task: MiniBrowseAppTask) -> dict[str, Any]:
+        if not task.app_seed_ref:
+            raise ValueError(f"Task {task.name} has neither app_seed nor app_seed_ref")
+        inline_seed = self._inline_app_seeds.get(task.app_seed_ref)
+        if inline_seed is not None:
+            return inline_seed
+        seed_store = self.resolved_seed_store_path()
+        if seed_store is None:
+            raise ValueError(
+                f"Task {task.name} needs seed {task.app_seed_ref}, but no seed store "
+                "was configured"
+            )
+        return load_seed(seed_store, task.app_seed_ref)
+
+    def resolved_seed_store_path(self) -> Path | None:
+        if self.config.seed_store_path:
+            return Path(self.config.seed_store_path).expanduser()
+        if not self.config.dataset_path:
+            return None
+        dataset_path = Path(self.config.dataset_path).expanduser()
+        return seed_store_for_artifact_path(dataset_path)
+
+
+async def ensure_runtime_dirs(runtime: vf.Runtime) -> None:
+    result = await runtime.run(
+        [
+            "bash",
+            "-lc",
+            f"mkdir -p /task {WORKDIR} {SERVICE_LOG_DIR} "
+            f"{shlex.quote(str(Path(TASK_PAYLOAD_PATH).parent))}",
+        ],
+        {},
+    )
+    if result.exit_code != 0:
+        raise TasksetError(
+            f"Mini Browse app setup failed: {combined_output(result)}"
+        )
+
+
+async def start_services(runtime: vf.Runtime) -> None:
+    await runtime.run_background(
+        [
+            "python3",
+            APP_SERVER,
+            "--host",
+            "127.0.0.1",
+            "--port",
+            str(APP_PORT),
+            "--root",
+            APP_ROOT,
+        ],
+        {"TASK_SEED_PATH": APP_SEED_PATH},
+        APP_LOG_PATH,
+    )
+    await runtime.run_background(
+        [
+            "python3",
+            CDP_SERVER,
+            "--host",
+            "127.0.0.1",
+            "--port",
+            str(CDP_PORT),
+            "--chrome",
+            "/usr/bin/chromium",
+            "--headless",
+        ],
+        {},
+        CDP_LOG_PATH,
+    )
+
+
+async def wait_for_services(runtime: vf.Runtime) -> None:
+    script = f"""\
+set -e
+for i in $(seq 1 90); do
+  if curl --noproxy '*' -fsS --max-time 2 {APP_URL} >/dev/null \\
+    && curl --noproxy '*' -fsS --max-time 2 {BROWSER_API_URL}/healthz >/dev/null; then
+    echo "services ready"
+    exit 0
+  fi
+  sleep 1
+done
+echo "service readiness failed"
+echo "--- process list ---"
+ps aux || true
+echo "--- app log ---"
+tail -120 {APP_LOG_PATH} 2>/dev/null || true
+echo "--- cdp log ---"
+tail -120 {CDP_LOG_PATH} 2>/dev/null || true
+exit 1
+"""
+    result = await runtime.run(["bash", "-lc", script], {})
+    if result.exit_code != 0:
+        raise TasksetError(
+            f"Mini Browse app services did not become ready: {combined_output(result)}"
+        )
+
+
+async def write_runtime_json(runtime: vf.Runtime, path: str, value: Any) -> None:
+    data = json.dumps(value, ensure_ascii=False, indent=2).encode("utf-8")
+    await runtime.write(path, data)
+
+
+async def read_runtime_json(runtime: vf.Runtime, path: str) -> Any:
+    try:
+        raw = await runtime.read(path)
+    except Exception as exc:
+        return {"is_error": True, "error": f"missing runtime artifact {path}: {exc}"}
+    text = raw.decode("utf-8", errors="replace").strip()
+    if not text:
+        return {}
+    try:
+        return json.loads(text)
+    except json.JSONDecodeError:
+        return {
+            "is_error": True,
+            "error": f"invalid JSON artifact {path}: {text[:500]}",
+        }
+
+
+def trace_result(trace: vf.Trace) -> dict[str, Any]:
+    result = trace.info.get("mini_browse_result")
+    return result if isinstance(result, dict) else {}
+
+
+def metric(trace: vf.Trace, key: str) -> float:
+    """Look up numeric ``key`` in the harness metrics, then the result; else 0.0."""
+    sources = (trace.info.get("mini_browse_metrics"), trace_result(trace))
+    for source in sources:
+        if not isinstance(source, dict):
+            continue
+        value = source.get(key)
+        if isinstance(value, (int, float)) and not isinstance(value, bool):
+            return float(value)
+    return 0.0
+
+
+def decode_info(info: Any) -> dict[str, Any]:
+    if isinstance(info, str):
+        return json.loads(info)
+    return dict(info or {})
+
+
+def parse_answer(answer: Any) -> Any:
+    if isinstance(answer, str):
+        return json.loads(answer)
+    return answer
+
+
+def source_profile(row: dict[str, Any]) -> str | None:
+    """Return ``info.factory.source_profile`` as a string, or None when absent."""
+    factory = decode_info(row.get("info") or {}).get("factory") or {}
+    if isinstance(factory, dict):
+        profile = factory.get("source_profile")
+        return str(profile) if profile else None
+    return None
+
+
+def seed_store_for_artifact_path(path: Path) -> Path:
+    """Derive the sibling ``*.seeds.sqlite`` path for a dataset artifact path."""
+    filename = path.name
+    for ending in (".tasks.jsonl.gz", ".jsonl.gz"):  # longest match first
+        if filename.endswith(ending):
+            return path.with_name(filename.removesuffix(ending) + ".seeds.sqlite")
+    if path.suffix:
+        return path.with_suffix(".seeds.sqlite")
+    return path.with_name(filename + ".seeds.sqlite")
+
+
+def load_seed(seed_store: Path, seed_ref: str) -> dict[str, Any]:
+    """Load the app seed ``seed_ref`` from the SQLite store at ``seed_store``."""
+    if not seed_store.exists():
+        raise FileNotFoundError(f"Mini Browse app seed store not found: {seed_store}")
+    db = sqlite3.connect(seed_store)
+    try:
+        columns = {
+            row[1] for row in db.execute("PRAGMA table_info(app_seeds)").fetchall()
+        }
+        compressed = "app_seed_zlib" in columns
+        column = "app_seed_zlib" if compressed else "app_seed_json"
+        row = db.execute(
+            f"SELECT {column} FROM app_seeds WHERE seed_id = ?", (seed_ref,)
+        ).fetchone()
+    finally:
+        # sqlite3's own context manager only ends the transaction; close explicitly.
+        db.close()
+    raw = None if row is None else row[0]
+    if raw is None:
+        raise KeyError(f"Seed {seed_ref!r} not found in {seed_store}")
+    seed_json = zlib.decompress(raw).decode("utf-8") if compressed else raw
+    seed = json.loads(seed_json)
+    if not isinstance(seed, dict):
+        raise ValueError(f"Seed {seed_ref!r} in {seed_store} is not an object")
+    return seed
+
+
+def combined_output(result: vf.ProgramResult) -> str:
+    return ((result.stdout or "") + (result.stderr or "")).strip()[-2000:]
+
+
+__all__ = ["MiniBrowseAppTask", "MiniBrowseAppsConfig", "MiniBrowseAppsTaskset"]
diff --git a/environments/mini_browse_apps_platform_v1/pyproject.toml b/environments/mini_browse_apps_platform_v1/pyproject.toml
new file mode 100644
index 000000000..fb8bdcf5f
--- /dev/null
+++ b/environments/mini_browse_apps_platform_v1/pyproject.toml
@@ -0,0 +1,13 @@
+[project]
+name = "mini-browse-apps-platform-v1"
+version = "0.1.0"
+description = "mini-browse-apps-platform-v1 — sandboxed local-app Mini Browse browser tasks (agentic; vision agent; LLM-judge reward)."
+requires-python = ">=3.10"
+dependencies = ["openai", "httpx"]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["mini_browse_apps_platform_v1"]
diff --git a/pyproject.toml b/pyproject.toml
index 9971dbe8a..ae71425b4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -86,6 +86,7 @@ examples = [
     "wordle-v1", "terminal-bench-2-v1", "alphabet-sort-v1",
     "r2e-gym-v1", "scaleswe-v1", "swelego-v1", "scratchpad-v1",
     "general-agent-v1", "swebench-verified-v1",
+    "mini-browse-apps-platform-v1",
 ]
 
 [project.optional-dependencies]
@@ -161,6 +162,7 @@ alphabet-sort-v1 = { path = "environments/alphabet_sort_v1", editable = true }
 scratchpad-v1 = { path = "environments/scratchpad_v1", editable = true }
 general-agent-v1 = { path = "environments/general_agent_v1", editable = true }
 swebench-verified-v1 = { path = "environments/swebench_verified_v1", editable = true }
+mini-browse-apps-platform-v1 = { path = "environments/mini_browse_apps_platform_v1", editable = true }
 
 [tool.uv.exclude-newer-package]
 # PrimeIntellect-published on PyPI (trusted publisher)
diff --git a/uv.lock b/uv.lock
index 06e62ff82..f5091d603 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2400,6 +2400,17 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" },
 ]
 
+[[package]]
+name = "mini-browse-apps-platform-v1"
+version = "0.1.0"
+source = { editable = "environments/mini_browse_apps_platform_v1" }
+dependencies = [
+    { name = "openai" },
+]
+
+[package.metadata]
+requires-dist = [{ name = "openai" }]
+
 [[package]]
 name = "mistral-common"
 version = "1.11.0"
@@ -5836,6 +5847,7 @@ examples = [
     { name = "glossary-v1" },
     { name = "gsm8k-v1" },
     { name = "math-env-v1" },
+    { name = "mini-browse-apps-platform-v1" },
     { name = "r2e-gym-v1" },
     { name = "reverse-text-v1" },
     { name = "scaleswe-v1" },
@@ -5935,6 +5947,7 @@ examples = [
     { name = "glossary-v1", editable = "environments/glossary_v1" },
     { name = "gsm8k-v1", editable = "environments/gsm8k_v1" },
     { name = "math-env-v1", editable = "environments/math_env_v1" },
+    { name = "mini-browse-apps-platform-v1", editable = "environments/mini_browse_apps_platform_v1" },
     { name = "r2e-gym-v1", editable = "environments/r2e_gym_v1" },
     { name = "reverse-text-v1", editable = "environments/reverse_text_v1" },
     { name = "scaleswe-v1", editable = "environments/scaleswe_v1" },

From 7d071005a6b08a87dd9f2895e5a45407e01a995c Mon Sep 17 00:00:00 2001
From: Mika Senghaas <mail@mikasenghaas.de>
Date: Fri, 19 Jun 2026 04:57:11 +0000
Subject: [PATCH 2/7] fix(v1): make the browse-apps judge tolerant of
 unterminated JSON

The judge model can return JSON with the root object left unclosed (it stops
after the final array); the old fallback grabbed a nested `}` and then crashed
the rollout on an unguarded json.loads. Strip code fences, balance open
brackets, and fall back to a default verdict instead of raising.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../mini_browse_apps_platform_v1/judge.py     | 59 ++++++++++++++-----
 1 file changed, 44 insertions(+), 15 deletions(-)

diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py
index b1228336e..4a1620546 100644
--- a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py
+++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py
@@ -108,21 +108,50 @@ def score_from_judge_payload(payload: dict[str, Any]) -> float:
 
 
 def parse_json_object(content: str) -> dict[str, Any]:
-    try:
-        parsed = json.loads(content)
-    except json.JSONDecodeError:
-        start = content.find("{")
-        end = content.rfind("}")
-        if start < 0 or end <= start:
-            return {"score": 0.0, "explanation": content[:500], "verdict": "no"}
-        parsed = json.loads(content[start : end + 1])
-    if not isinstance(parsed, dict):
-        return {
-            "score": 0.0,
-            "explanation": "judge returned non-object",
-            "verdict": "no",
-        }
-    return parsed
+    # Judge models sometimes return JSON wrapped in a code fence or left unterminated (the
+    # reasoning model stops after the last array without closing the root object). Try the raw
+    # text, then the brace span, then a bracket-balanced repair of that span.
+    fenced = content.strip()
+    if fenced.startswith("```"):
+        fenced = fenced.split("```", 2)[1].removeprefix("json").strip()
+    start = fenced.find("{")
+    span = fenced[start:] if start >= 0 else ""
+    for candidate in (content, fenced, span, _balance_json(span)):
+        if not candidate.strip():
+            continue
+        try:
+            parsed = json.loads(candidate)
+        except json.JSONDecodeError:
+            continue
+        if isinstance(parsed, dict):
+            return parsed
+    return {"score": 0.0, "explanation": content[:500], "verdict": "no"}
+
+
+def _balance_json(text: str) -> str:
+    """Close an unterminated JSON object/array: append the missing `}`/`]` for any brackets left
+    open outside of strings, after dropping a dangling trailing comma."""
+    stack: list[str] = []
+    in_string = escaped = False
+    for ch in text:
+        if in_string:
+            if escaped:
+                escaped = False
+            elif ch == "\\":
+                escaped = True
+            elif ch == '"':
+                in_string = False
+            continue
+        if ch == '"':
+            in_string = True
+        elif ch in "{[":
+            stack.append("}" if ch == "{" else "]")
+        elif ch in "}]" and stack:
+            stack.pop()
+    trimmed = text.rstrip()
+    if trimmed.endswith(","):
+        trimmed = trimmed[:-1]
+    return trimmed + "".join(reversed(stack))
 
 
 def prime_team_id() -> str | None:

From e020bbec77638364db14f329e1190f507c6946c0 Mon Sep 17 00:00:00 2001
From: Mika Senghaas <mail@mikasenghaas.de>
Date: Fri, 19 Jun 2026 05:02:18 +0000
Subject: [PATCH 3/7] refactor(v1): simplify prime_team_id in the browse-apps
 judge

Walrus for the env lookup; keep the .exists() check but drop the surrounding
try/except (a malformed or unreadable config now raises instead of returning
None) and the defensive isinstance/str cast (the prime config is always a dict).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../mini_browse_apps_platform_v1/judge.py        | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py
index 4a1620546..2860b55e7 100644
--- a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py
+++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py
@@ -156,19 +156,11 @@ def _balance_json(text: str) -> str:
 
 def prime_team_id() -> str | None:
     for name in ("PRIME_TEAM_ID", "PI_TEAM_ID", "X_PRIME_TEAM_ID"):
-        value = os.environ.get(name)
-        if value:
+        if value := os.environ.get(name):
             return value
-    config_path = Path.home() / ".prime" / "config.json"
-    try:
-        if config_path.exists():
-            config = json.loads(config_path.read_text())
-            if isinstance(config, dict):
-                value = config.get("team_id")
-                if value:
-                    return str(value)
-    except (json.JSONDecodeError, OSError):
-        return None
+    config = Path.home() / ".prime" / "config.json"
+    if config.exists():
+        return json.loads(config.read_text()).get("team_id")
     return None
 
 

From b2ae4665f81480177bb3ac364d684570427a616c Mon Sep 17 00:00:00 2001
From: Mika Senghaas <mail@mikasenghaas.de>
Date: Fri, 19 Jun 2026 05:06:14 +0000
Subject: [PATCH 4/7] refactor(v1): slim browse-apps config + structured-output
 judge

- Judge config is now a JudgeConfig subconfig (model + verifiers BaseClientConfig),
  so the endpoint, team header, and API key auto-resolve to Prime inference; drop
  the bespoke prime_team_id / prime_default_headers and the flat judge_* fields.
- Judge uses a structured-output (json_schema) model (default openai/gpt-4.1-mini)
  so the verdict is always valid JSON.
- Drop config knobs that are framework-internal or belong on the task: per-task
  timeouts, sandbox_image (now set directly on the task), task shuffling,
  task_indices / task_profile, seed_store_path (+ the sqlite seed-store loader
  code, so app seeds must now be inline in the dataset), and the configurable
  cache dir (now a constant).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../mini_browse_apps_platform_v1/README.md    |  23 ++-
 .../mini_browse_apps_platform_v1/judge.py     | 116 +++++++-------
 .../mini_browse_apps_platform_v1/taskset.py   | 147 ++----------------
 3 files changed, 89 insertions(+), 197 deletions(-)

diff --git a/environments/mini_browse_apps_platform_v1/README.md b/environments/mini_browse_apps_platform_v1/README.md
index b769f268f..7e1b441a1 100644
--- a/environments/mini_browse_apps_platform_v1/README.md
+++ b/environments/mini_browse_apps_platform_v1/README.md
@@ -21,15 +21,12 @@ time from a **private GitHub repo** (pinned to a commit), caches it under
 | `agent_token_env` | `MINI_BROWSE_GITHUB_TOKEN` | Env var holding a GitHub token with read access to `agent_repo`. |
 | `agent_path` | _(unset)_ | Local dir containing `<agent_package>/` — skips the fetch (development). |
 
-So set `export MINI_BROWSE_GITHUB_TOKEN=<token>` and `--harness.agent-ref <sha>`, or point
-`--harness.agent-path` at a local checkout.
-
 ## Tasks (pulled dynamically)
 
 Tasks are **pulled from the Prime hub and cached locally** — nothing is bundled in this package.
 `load_tasks` pulls the dataset from `prime/mini-browse-apps-platform-v1` (private; via `prime env
 pull`) into `~/.cache/verifiers/mini-browse-apps/<version>/`. Override with `--taskset.dataset_path
-<file>` or repoint `--taskset.hub_env_id` / `--taskset.hub_version`.
+<file>`, or repoint `--taskset.hub_env_id` / `--taskset.hub_version`.
 
 ## Run
 
@@ -48,7 +45,19 @@ uv run eval mini-browse-apps-platform-v1 \
 
 ## Reward & metrics
 
-`answer_key` (weight 1.0) judges the submitted result against the gold answer key (`judge_model`,
-default `google/gemini-3.1-pro-preview` via pinference); reward 1.0 == judge verdict "yes". Metrics:
-`result_present`, `submitted_result_present`, `agent_error`, `transcript_image_count`,
+`answer_key` (weight 1.0) judges the submitted result against the gold answer key. The judge uses a
+structured-output (`json_schema`) model — default `openai/gpt-4.1-mini` on Prime inference
+(auto-resolved); override with `--taskset.judge.model` / `--taskset.judge.client.*`. Reward 1.0 ==
+all expected fields correct (`verdict: "yes"`); partial credit is `correct_fields / total_fields`.
+Metrics: `result_present`, `submitted_result_present`, `agent_error`, `transcript_image_count`,
 `message_count`.
+
+## Config (`--taskset.*`)
+
+| Field | Default | Meaning |
+| --- | --- | --- |
+| `dataset_path` | `null` | Local dataset override (skips the hub pull). |
+| `hub_env_id` | `prime/mini-browse-apps-platform-v1` | Hub env the dataset is pulled from. |
+| `hub_version` | `latest` | Hub env version to pull. |
+| `judge.model` | `openai/gpt-4.1-mini` | Structured-output judge model. |
+| `judge.client` | Prime inference | OpenAI-compatible endpoint for the judge (auto-resolved). |
diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py
index 2860b55e7..1c749c0c6 100644
--- a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py
+++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/judge.py
@@ -1,20 +1,51 @@
-"""LLM judge support for Mini Browse local-app tasks."""
+"""LLM judge for the browse-apps local-app tasks (structured-output verdict)."""
 
 from __future__ import annotations
 
 import json
 import os
-from pathlib import Path
 from typing import Any
 
 from openai import AsyncOpenAI
+from pydantic import Field
+from pydantic_config import BaseConfig
+
+from verifiers.utils.client_utils import load_prime_config
+from verifiers.v1.clients.config import BaseClientConfig
 
 JUDGE_TEMPERATURE = 0
 
+# Strict structured output: the judge must return exactly these fields, always valid JSON.
+JUDGE_RESPONSE_FORMAT = {
+    "type": "json_schema",
+    "json_schema": {
+        "name": "judge_verdict",
+        "strict": True,
+        "schema": {
+            "type": "object",
+            "additionalProperties": False,
+            "properties": {
+                "correct_fields": {"type": "integer"},
+                "total_fields": {"type": "integer"},
+                "score": {"type": "number"},
+                "verdict": {"type": "string", "enum": ["yes", "partial", "no"]},
+                "explanation": {"type": "string"},
+            },
+            "required": [
+                "correct_fields",
+                "total_fields",
+                "score",
+                "verdict",
+                "explanation",
+            ],
+        },
+    },
+}
+
 JUDGE_PROMPT = """You evaluate a browser automation agent's submitted result for a deterministic local flight-search task.
 
 Use the evaluation contract and gold answer as the source of truth. Score only
-expected_fields where score is true or absent. Ignore verifier metadata, ids,
+expected fields where score is true or absent. Ignore verifier metadata, ids,
 internal keys, hidden seed fields, and non-scoreable diagnostics.
 
 Treat equivalent formatting as correct when the same fact is clearly attached to
@@ -27,33 +58,26 @@
 with critical=true are hard gates: if any scoreable critical field is missing or
 wrong, the verdict must be "no".
 
-Return exactly one JSON object, no prose and no code fence:
-{
-  "correct_fields": <integer>,
-  "total_fields": <integer>,
-  "score": <number from 0 to 1>,
-  "verdict": "yes" | "partial" | "no",
-  "explanation": "<one concise sentence>",
-  "field_verdicts": [
-    {
-      "field_path": "<expected field path>",
-      "verdict": "exact_match" | "semantic_match" | "wrong" | "missing",
-      "reason": "<short reason>"
-    }
-  ]
-}
+Report `correct_fields` / `total_fields` over the scoreable expected fields, a
+`score` from 0 to 1, a `verdict`, and a one-sentence `explanation`.
 """
 
 
+class JudgeConfig(BaseConfig):
+    """The judge model and the OpenAI-compatible endpoint it runs on (Prime auto-resolved)."""
+
+    model: str = "openai/gpt-4.1-mini"
+    """A model that supports strict structured output (`json_schema`)."""
+    client: BaseClientConfig = Field(default_factory=BaseClientConfig)
+
+
 async def judge_answer_key(
     *,
     task_instruction: str,
     submitted_result: Any,
     answer_key: dict[str, Any],
     output_schema: dict[str, Any],
-    model: str,
-    base_url: str | None,
-    api_key_env: str,
+    config: JudgeConfig,
 ) -> dict[str, Any]:
     context = {
         "task_instruction": task_instruction,
@@ -62,8 +86,8 @@ async def judge_answer_key(
         "gold_answer": answer_key.get("gold_answer") or answer_key,
         "output_schema": output_schema,
     }
-    response = await judge_client(base_url, api_key_env).chat.completions.create(
-        model=model,
+    response = await judge_client(config.client).chat.completions.create(
+        model=config.model,
         messages=[
             {"role": "system", "content": JUDGE_PROMPT},
             {
@@ -72,23 +96,23 @@ async def judge_answer_key(
             },
         ],
         temperature=JUDGE_TEMPERATURE,
-        response_format={"type": "json_object"},
+        response_format=JUDGE_RESPONSE_FORMAT,
     )
     content = response.choices[0].message.content or "{}"
     return parse_json_object(content)
 
 
-def judge_client(base_url: str | None, api_key_env: str) -> AsyncOpenAI:
-    api_key = os.environ.get(api_key_env)
-    if not api_key:
-        raise ValueError(f"Missing judge API key env var {api_key_env}")
-    kwargs: dict[str, Any] = {"api_key": api_key}
-    if base_url:
-        kwargs["base_url"] = base_url
-    default_headers = prime_default_headers(base_url)
-    if default_headers:
-        kwargs["default_headers"] = default_headers
-    return AsyncOpenAI(**kwargs)
+def judge_client(config: BaseClientConfig) -> AsyncOpenAI:
+    # base_url + team header are resolved by BaseClientConfig; the key falls back to the Prime
+    # CLI config for pinference (mirrors verifiers' resolve_client).
+    api_key = os.environ.get(config.api_key_var)
+    if not api_key and config.api_key_var == "PRIME_API_KEY":
+        api_key = load_prime_config().get("api_key")
+    return AsyncOpenAI(
+        base_url=config.base_url,
+        api_key=api_key or "EMPTY",
+        default_headers=config.headers or None,
+    )
 
 
 def score_from_judge_payload(payload: dict[str, Any]) -> float:
@@ -108,9 +132,8 @@ def score_from_judge_payload(payload: dict[str, Any]) -> float:
 
 
 def parse_json_object(content: str) -> dict[str, Any]:
-    # Judge models sometimes return JSON wrapped in a code fence or left unterminated (the
-    # reasoning model stops after the last array without closing the root object). Try the raw
-    # text, then the brace span, then a bracket-balanced repair of that span.
+    # Strict structured output is always valid JSON; this stays tolerant (code fences, an
+    # unterminated object) as a backstop for an overridden/non-conforming judge model.
     fenced = content.strip()
     if fenced.startswith("```"):
         fenced = fenced.split("```", 2)[1].removeprefix("json").strip()
@@ -152,20 +175,3 @@ def _balance_json(text: str) -> str:
     if trimmed.endswith(","):
         trimmed = trimmed[:-1]
     return trimmed + "".join(reversed(stack))
-
-
-def prime_team_id() -> str | None:
-    for name in ("PRIME_TEAM_ID", "PI_TEAM_ID", "X_PRIME_TEAM_ID"):
-        if value := os.environ.get(name):
-            return value
-    config = Path.home() / ".prime" / "config.json"
-    if config.exists():
-        return json.loads(config.read_text()).get("team_id")
-    return None
-
-
-def prime_default_headers(base_url: str | None) -> dict[str, str]:
-    if not base_url or "pinference" not in base_url.lower():
-        return {}
-    team_id = prime_team_id()
-    return {"X-Prime-Team-ID": team_id} if team_id else {}
diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/taskset.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/taskset.py
index 84af873c4..abdfc95f7 100644
--- a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/taskset.py
+++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/taskset.py
@@ -1,17 +1,14 @@
-"""mini-browse-apps-platform-v1: local-app Mini Browse tasks pulled from the Prime hub."""
+"""mini-browse-apps-platform-v1: local-app browser tasks pulled from the Prime hub."""
 
 from __future__ import annotations
 
 import gzip
-import hashlib
 import json
 import os
 import shlex
 import shutil
-import sqlite3
 import subprocess
 import tempfile
-import zlib
 from pathlib import Path
 from typing import Any
 
@@ -29,7 +26,7 @@
     MiniBrowseTaskPayload,
 )
 from .harness.diagnostics import read_jsonl_tail
-from .judge import judge_answer_key, score_from_judge_payload
+from .judge import JudgeConfig, judge_answer_key, score_from_judge_payload
 
 DEFAULT_SANDBOX_IMAGE = (
     "team-cmlr3u2er002zhr01tj8f48ts/"
@@ -37,6 +34,7 @@
 )
 DEFAULT_HUB_ENV_ID = "prime/mini-browse-apps-platform-v1"
 DEFAULT_DATASET_FILENAME = "google_flights_10.jsonl.gz"
+DATASET_CACHE_DIR = Path.home() / ".cache" / "verifiers" / "mini-browse-apps"
 
 APP_PORT = 5173
 CDP_PORT = 18080
@@ -51,14 +49,9 @@
 CDP_SERVER = "/opt/mini-browse-services/local_cdp_service.py"
 APP_ROOT = "/opt/mini-browse-app/dist"
 
-SETUP_TIMEOUT_SECONDS = 600
-FINALIZE_TIMEOUT_SECONDS = 120
-SCORING_TIMEOUT_SECONDS = 180
-DEFAULT_TIMEOUT_SECONDS = 3600.0
 DEFAULT_SANDBOX_CPU = 2
 DEFAULT_SANDBOX_MEMORY_GB = 4
 DEFAULT_SANDBOX_DISK_GB = 10
-TASKSET_SHUFFLE_SEED = "google_flights_kernel_v1_dense_hard_no_bag_1188_order_v1"
 
 
 class MiniBrowseAppTask(vf.Task):
@@ -79,18 +72,7 @@ class MiniBrowseAppsConfig(vf.TasksetConfig):
     """Prime hub environment the dataset is pulled from when no `dataset_path` is given."""
     hub_version: str = "latest"
     dataset_filename: str = DEFAULT_DATASET_FILENAME
-    cache_dir: str | None = None
-    """Where the pulled dataset is cached (default: ~/.cache/verifiers/mini-browse-apps)."""
-    seed_store_path: str | None = None
-    task_indices: list[int] | None = None
-    task_profile: str = "default"
-    shuffle_tasks: bool = True
-    taskset_shuffle_seed: str = TASKSET_SHUFFLE_SEED
-    sandbox_image: str = DEFAULT_SANDBOX_IMAGE
-    timeout_seconds: float = DEFAULT_TIMEOUT_SECONDS
-    judge_model: str = "google/gemini-3.1-pro-preview"
-    judge_base_url: str | None = "https://api.pinference.ai/api/v1"
-    judge_api_key_env: str = "PRIME_API_KEY"
+    judge: JudgeConfig = Field(default_factory=JudgeConfig)
 
 
 class MiniBrowseAppsTaskset(vf.Taskset[MiniBrowseAppTask, MiniBrowseAppsConfig]):
@@ -104,11 +86,6 @@ def __init__(self, config: MiniBrowseAppsConfig) -> None:
 
     def load_tasks(self) -> list[MiniBrowseAppTask]:
         rows = self.load_rows()
-        rows = self.filter_rows(rows)
-        if self.config.shuffle_tasks:
-            rows = self.stable_shuffle_rows(rows)
-        if self.config.task_indices is not None:
-            rows = self.select_task_indices(rows)
         if not rows:
             raise ValueError("No Mini Browse app tasks were loaded")
         return [self.normalize_row(i, row) for i, row in enumerate(rows)]
@@ -173,9 +150,7 @@ async def answer_key(self, task: MiniBrowseAppTask, trace: vf.Trace) -> float:
             submitted_result=submitted,
             answer_key=task.answer_key,
             output_schema=task.output_schema,
-            model=self.config.judge_model,
-            base_url=self.config.judge_base_url,
-            api_key_env=self.config.judge_api_key_env,
+            config=self.config.judge,
         )
         trace.info["mini_browse_judge"] = judge_payload
         return score_from_judge_payload(judge_payload)
@@ -217,12 +192,7 @@ def resolved_dataset_path(self) -> Path:
         return self.ensure_cached_dataset()
 
     def ensure_cached_dataset(self) -> Path:
-        cache_root = (
-            Path(self.config.cache_dir).expanduser()
-            if self.config.cache_dir
-            else Path.home() / ".cache" / "verifiers" / "mini-browse-apps"
-        )
-        cached = cache_root / self.config.hub_version / self.config.dataset_filename
+        cached = DATASET_CACHE_DIR / self.config.hub_version / self.config.dataset_filename
         if not cached.exists():
             cached.parent.mkdir(parents=True, exist_ok=True)
             self.pull_dataset_into(cached)
@@ -254,33 +224,6 @@ def pull_dataset_into(self, dest: Path) -> None:
             shutil.copyfile(matches[0], staging)
             os.replace(staging, dest)
 
-    def filter_rows(self, rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
-        if self.config.task_profile == "default":
-            return rows
-        return [row for row in rows if source_profile(row) == self.config.task_profile]
-
-    def stable_shuffle_rows(self, rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
-        seed = self.config.taskset_shuffle_seed
-
-        def key(row: dict[str, Any]) -> tuple[str, str]:
-            info = decode_info(row.get("info") or {})
-            task_id = str(row.get("task_id") or info.get("task_name") or "")
-            digest = hashlib.sha256(f"{seed}:{task_id}".encode("utf-8")).hexdigest()
-            return digest, task_id
-
-        return sorted(rows, key=key)
-
-    def select_task_indices(self, rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
-        selected = []
-        for index in self.config.task_indices or []:
-            if index < 0 or index >= len(rows):
-                raise ValueError(
-                    f"task_indices contains out-of-range index {index}; "
-                    f"filtered taskset has {len(rows)} rows"
-                )
-            selected.append(rows[index])
-        return selected
-
     def normalize_row(self, index: int, row: dict[str, Any]) -> MiniBrowseAppTask:
         info = decode_info(row.get("info") or {})
         raw_instruction = info.get("instruction") or row.get("question")
@@ -305,14 +248,8 @@ def normalize_row(self, index: int, row: dict[str, Any]) -> MiniBrowseAppTask:
             idx=index,
             name=task_name,
             prompt=raw_instruction.strip(),
-            image=self.config.sandbox_image,
+            image=DEFAULT_SANDBOX_IMAGE,
             workdir=WORKDIR,
-            timeout=vf.TaskTimeout(
-                setup=SETUP_TIMEOUT_SECONDS,
-                harness=self.config.timeout_seconds,
-                finalize=FINALIZE_TIMEOUT_SECONDS,
-                scoring=SCORING_TIMEOUT_SECONDS,
-            ),
             resources=vf.TaskResources(
                 cpu=DEFAULT_SANDBOX_CPU,
                 memory=DEFAULT_SANDBOX_MEMORY_GB,
@@ -325,32 +262,19 @@ def normalize_row(self, index: int, row: dict[str, Any]) -> MiniBrowseAppTask:
                 "task_name": info.get("task_name"),
                 "task_id": row.get("task_id") or answer_key.get("task_id"),
                 "answer_kind": answer_key.get("answer_kind"),
-                "source_profile": source_profile(row),
                 "source_dataset": info.get("source_dataset"),
             },
         )
 
     def app_seed_for_task(self, task: MiniBrowseAppTask) -> dict[str, Any]:
         if not task.app_seed_ref:
-            raise ValueError(f"Task {task.name} has neither app_seed nor app_seed_ref")
-        inline_seed = self._inline_app_seeds.get(task.app_seed_ref)
-        if inline_seed is not None:
-            return inline_seed
-        seed_store = self.resolved_seed_store_path()
-        if seed_store is None:
+            raise ValueError(f"Task {task.name} has no app_seed")
+        seed = self._inline_app_seeds.get(task.app_seed_ref)
+        if seed is None:
             raise ValueError(
-                f"Task {task.name} needs seed {task.app_seed_ref}, but no seed store "
-                "was configured"
+                f"Task {task.name} references seed {task.app_seed_ref} not present inline"
             )
-        return load_seed(seed_store, task.app_seed_ref)
-
-    def resolved_seed_store_path(self) -> Path | None:
-        if self.config.seed_store_path:
-            return Path(self.config.seed_store_path).expanduser()
-        if not self.config.dataset_path:
-            return None
-        dataset_path = Path(self.config.dataset_path).expanduser()
-        return seed_store_for_artifact_path(dataset_path)
+        return seed
 
 
 async def ensure_runtime_dirs(runtime: vf.Runtime) -> None:
@@ -479,53 +403,6 @@ def parse_answer(answer: Any) -> Any:
     return answer
 
 
-def source_profile(row: dict[str, Any]) -> str | None:
-    info = decode_info(row.get("info") or {})
-    factory = info.get("factory") or {}
-    if not isinstance(factory, dict):
-        return None
-    profile = factory.get("source_profile")
-    return str(profile) if profile else None
-
-
-def seed_store_for_artifact_path(path: Path) -> Path:
-    name = path.name
-    if name.endswith(".tasks.jsonl.gz"):
-        return path.with_name(name.removesuffix(".tasks.jsonl.gz") + ".seeds.sqlite")
-    if name.endswith(".jsonl.gz"):
-        return path.with_name(name.removesuffix(".jsonl.gz") + ".seeds.sqlite")
-    if path.suffix:
-        return path.with_suffix(".seeds.sqlite")
-    return path.with_name(name + ".seeds.sqlite")
-
-
-def load_seed(seed_store: Path, seed_ref: str) -> dict[str, Any]:
-    if not seed_store.exists():
-        raise FileNotFoundError(f"Mini Browse app seed store not found: {seed_store}")
-    with sqlite3.connect(seed_store) as db:
-        columns = {
-            row[1] for row in db.execute("PRAGMA table_info(app_seeds)").fetchall()
-        }
-        if "app_seed_zlib" in columns:
-            row = db.execute(
-                "SELECT app_seed_zlib FROM app_seeds WHERE seed_id = ?",
-                (seed_ref,),
-            ).fetchone()
-            seed_json = None if row is None else zlib.decompress(row[0]).decode("utf-8")
-        else:
-            row = db.execute(
-                "SELECT app_seed_json FROM app_seeds WHERE seed_id = ?",
-                (seed_ref,),
-            ).fetchone()
-            seed_json = None if row is None else row[0]
-    if seed_json is None:
-        raise KeyError(f"Seed {seed_ref!r} not found in {seed_store}")
-    seed = json.loads(seed_json)
-    if not isinstance(seed, dict):
-        raise ValueError(f"Seed {seed_ref!r} in {seed_store} is not an object")
-    return seed
-
-
 def combined_output(result: vf.ProgramResult) -> str:
     return ((result.stdout or "") + (result.stderr or "")).strip()[-2000:]
 

From ec17662f368a3ea1de83297a317017d84573a236 Mon Sep 17 00:00:00 2001
From: Mika Senghaas <mail@mikasenghaas.de>
Date: Fri, 19 Jun 2026 05:10:52 +0000
Subject: [PATCH 5/7] refactor(v1): browse-agent subconfig + file-based model
 client

- Group the harness agent-source fields under an `agent` subconfig (agent.repo /
  agent.ref / agent.path / ...), dropping the agent_ prefix.
- Pass the model endpoint/key/model to the in-sandbox agent via a JSON file
  (--model-client) instead of OPENAI_* env vars; program.py builds the client and
  passes it to the agent explicitly, keeping the secret out of the process env.
- Drop the redundant harness path config fields (use the shared contract paths
  directly) and inline taskset constants that only fed config defaults.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../mini_browse_apps_platform_v1/README.md    |  10 +-
 .../harness/__init__.py                       | 118 +++++++++---------
 .../harness/program.py                        |  10 +-
 .../mini_browse_apps_platform_v1/taskset.py   |   6 +-
 4 files changed, 78 insertions(+), 66 deletions(-)

diff --git a/environments/mini_browse_apps_platform_v1/README.md b/environments/mini_browse_apps_platform_v1/README.md
index 7e1b441a1..e5fb92280 100644
--- a/environments/mini_browse_apps_platform_v1/README.md
+++ b/environments/mini_browse_apps_platform_v1/README.md
@@ -16,10 +16,10 @@ time from a **private GitHub repo** (pinned to a commit), caches it under
 
 | Field | Default | Meaning |
 | --- | --- | --- |
-| `agent_repo` | `PrimeIntellect-ai/plex-mini-browse` | Private `owner/name` to fetch the agent from. |
-| `agent_ref` | _(unset)_ | **Pinned commit sha to fetch (required unless `agent_path` is set).** |
-| `agent_token_env` | `MINI_BROWSE_GITHUB_TOKEN` | Env var holding a GitHub token with read access to `agent_repo`. |
-| `agent_path` | _(unset)_ | Local dir containing `<agent_package>/` — skips the fetch (development). |
+| `agent.repo` | `PrimeIntellect-ai/plex-mini-browse` | Private `owner/name` to fetch the agent from. |
+| `agent.ref` | _(unset)_ | **Pinned commit sha to fetch (required unless `agent.path` is set).** |
+| `agent.token_env` | `MINI_BROWSE_GITHUB_TOKEN` | Env var holding a GitHub token with read access to `agent.repo`. |
+| `agent.path` | _(unset)_ | Local dir containing the agent package — skips the fetch (development). |
 
 ## Tasks (pulled dynamically)
 
@@ -38,7 +38,7 @@ export MINI_BROWSE_GITHUB_TOKEN=<token>
 uv run eval mini-browse-apps-platform-v1 \
   --harness.id mini-browse-apps-platform-v1 \
   --harness.runtime.type prime \
-  --harness.agent-ref <agent-commit-sha> \
+  --harness.agent.ref <agent-commit-sha> \
   -m <multimodal-model> \
   -n 1 -r 1 -c 1
 ```
diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py
index 8ded780df..1dc043c1d 100644
--- a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py
+++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py
@@ -3,12 +3,13 @@
 The browser agent is proprietary and is NOT vendored in this repo. It is fetched at run time
 from a private, auth-gated GitHub repo (pinned to a commit), cached locally, then tarred and
 staged into the sandbox, where `program.py` (a uv script) imports and runs it. For local
-development, point `agent_path` at a checkout instead of fetching.
+development, point `agent.path` at a checkout instead of fetching.
 """
 
 from __future__ import annotations
 
 import io
+import json
 import os
 import shlex
 import shutil
@@ -18,6 +19,8 @@
 from typing import Literal
 
 import httpx
+from pydantic import Field
+from pydantic_config import BaseConfig
 from verifiers.v1.clients import RolloutContext
 from verifiers.v1.errors import HarnessError
 from verifiers.v1.harness import Harness, HarnessConfig
@@ -39,29 +42,35 @@
 
 AGENT_RUNTIME = "/opt/browse-agent-runtime"
 AGENT_TARBALL = "/tmp/vf-browse-agent-runtime.tgz"
+AGENT_CACHE_DIR = Path.home() / ".cache" / "verifiers" / "browse-agent"
+MODEL_CLIENT_PATH = "/tmp/vf-browse-model-client.json"
 
 CoordinateMode = Literal["relative_1000", "absolute", "auto"]
 
 
+class AgentConfig(BaseConfig):
+    """The proprietary browser agent — fetched at run time from a private repo, not vendored."""
+
+    repo: str = "PrimeIntellect-ai/plex-mini-browse"
+    """Private GitHub repo (owner/name) the agent is fetched from."""
+    ref: str = ""
+    """Pinned commit sha to fetch (required unless `path` is set)."""
+    package: str = "mini_browse"
+    """Importable package dir within the repo (and on the sandbox PYTHONPATH) to stage."""
+    token_env: str = "MINI_BROWSE_GITHUB_TOKEN"
+    """Env var holding a GitHub token with read access to `repo`."""
+    path: str | None = None
+    """Local dir containing `<package>/` — when set, skips the GitHub fetch (development)."""
+    cache_dir: str | None = None
+    """Where fetched revisions are cached (default: ~/.cache/verifiers/browse-agent)."""
+
+
 class MiniBrowseHarnessConfig(HarnessConfig):
     """Reusable browser harness; fetches its proprietary agent from a private repo."""
 
     id: str = "mini-browse-apps-platform-v1"
     runtime: RuntimeConfig = DockerConfig(image="python:3.12-slim-bookworm")
-    # --- proprietary agent source (not vendored; fetched at run time) ---
-    agent_repo: str = "PrimeIntellect-ai/plex-mini-browse"
-    """Private GitHub repo (owner/name) the agent is fetched from."""
-    agent_ref: str = ""
-    """Pinned commit sha to fetch (required unless `agent_path` is set)."""
-    agent_package: str = "mini_browse"
-    """Importable package dir within the repo (and on the sandbox PYTHONPATH) to stage."""
-    agent_token_env: str = "MINI_BROWSE_GITHUB_TOKEN"
-    """Env var holding a GitHub token with read access to `agent_repo`."""
-    agent_path: str | None = None
-    """Local dir containing `<agent_package>/` — when set, skips the GitHub fetch (dev)."""
-    agent_cache_dir: str | None = None
-    """Where fetched agent revisions are cached (default: ~/.cache/verifiers/browse-agent)."""
-    # --- agent behavior ---
+    agent: AgentConfig = Field(default_factory=AgentConfig)
     max_steps: int = 75
     coordinate_mode: CoordinateMode = "relative_1000"
     keep_last_images: int = 3
@@ -71,12 +80,6 @@ class MiniBrowseHarnessConfig(HarnessConfig):
     browser_start_jitter_seconds: float = 0.0
     browser_start_max_in_flight: int = 0
     record_frames: bool = False
-    task_payload_path: str = TASK_PAYLOAD_PATH
-    result_path: str = RESULT_PATH
-    transcript_path: str = TRANSCRIPT_PATH
-    metrics_path: str = METRICS_PATH
-    progress_path: str = PROGRESS_PATH
-    workspace_root: str = WORKSPACE_ROOT
 
 
 class MiniBrowseHarness(Harness[MiniBrowseHarnessConfig]):
@@ -108,11 +111,14 @@ async def launch(
             raise ValueError("Browser harness requires a string task prompt")
 
         await self._stage_agent(runtime)
+        await runtime.write(
+            MODEL_CLIENT_PATH,
+            json.dumps(
+                {"base_url": endpoint, "api_key": secret, "model": ctx.model}
+            ).encode("utf-8"),
+        )
         env = {
             **self.config.env,
-            "OPENAI_BASE_URL": endpoint,
-            "OPENAI_API_KEY": secret,
-            "OPENAI_MODEL": ctx.model,
             "PYTHONPATH": self._pythonpath(),
             "MINI_BROWSE_COORDINATE_MODE": self.config.coordinate_mode,
             "MINI_BROWSE_KEEP_LAST_IMAGES": str(self.config.keep_last_images),
@@ -131,26 +137,28 @@ async def launch(
             "MINI_BROWSE_BROWSER_START_MAX_IN_FLIGHT": str(
                 self.config.browser_start_max_in_flight
             ),
-            "MINI_BROWSE_PROGRESS_PATH": self.config.progress_path,
+            "MINI_BROWSE_PROGRESS_PATH": PROGRESS_PATH,
         }
         if self.config.record_frames:
             env["MINI_BROWSE_RECORD_FRAMES_DIR"] = "/logs/mini_browse/frames"
 
         args = [
             "--task",
-            self.config.task_payload_path,
+            TASK_PAYLOAD_PATH,
+            "--model-client",
+            MODEL_CLIENT_PATH,
             "--result",
-            self.config.result_path,
+            RESULT_PATH,
             "--transcript",
-            self.config.transcript_path,
+            TRANSCRIPT_PATH,
             "--metrics",
-            self.config.metrics_path,
+            METRICS_PATH,
             "--progress",
-            self.config.progress_path,
+            PROGRESS_PATH,
             "--max-steps",
             str(self.config.max_steps),
             "--workspace-root",
-            self.config.workspace_root,
+            WORKSPACE_ROOT,
         ]
         return await runtime.run_uv_script(PROGRAM_SOURCE, args=args, env=env)
 
@@ -168,43 +176,42 @@ async def _stage_agent(self, runtime: Runtime) -> None:
             )
 
     def _agent_tarball(self) -> bytes:
-        package = self._ensure_agent() / self.config.agent_package
+        package = self._ensure_agent() / self.config.agent.package
         if not package.is_dir():
             raise HarnessError(
-                f"agent package {self.config.agent_package!r} not found under {package.parent}"
+                f"agent package {self.config.agent.package!r} not found under {package.parent}"
             )
         buffer = io.BytesIO()
         with tarfile.open(fileobj=buffer, mode="w:gz") as archive:
-            archive.add(package, arcname=self.config.agent_package)
+            archive.add(package, arcname=self.config.agent.package)
         return buffer.getvalue()
 
     def _ensure_agent(self) -> Path:
-        """Return a dir that contains `<agent_package>/` — a local checkout or the fetch cache."""
-        if self.config.agent_path:
-            return Path(self.config.agent_path).expanduser()
-        if not self.config.agent_ref:
+        """Return a dir that contains `<package>/` — a local checkout or the fetch cache."""
+        agent = self.config.agent
+        if agent.path:
+            return Path(agent.path).expanduser()
+        if not agent.ref:
             raise HarnessError(
-                "set --harness.agent-ref to a pinned commit sha "
-                "(or --harness.agent-path to a local checkout for development)"
+                "set --harness.agent.ref to a pinned commit sha "
+                "(or --harness.agent.path to a local checkout for development)"
             )
         cache_root = (
-            Path(self.config.agent_cache_dir).expanduser()
-            if self.config.agent_cache_dir
-            else Path.home() / ".cache" / "verifiers" / "browse-agent"
+            Path(agent.cache_dir).expanduser() if agent.cache_dir else AGENT_CACHE_DIR
         )
-        dest = cache_root / self.config.agent_ref
-        if not (dest / self.config.agent_package).exists():
+        dest = cache_root / agent.ref
+        if not (dest / agent.package).exists():
             self._download_agent(dest)
         return dest
 
     def _download_agent(self, dest: Path) -> None:
-        token = os.environ.get(self.config.agent_token_env)
+        agent = self.config.agent
+        token = os.environ.get(agent.token_env)
         if not token:
             raise HarnessError(
-                f"missing ${self.config.agent_token_env} to fetch the private agent repo "
-                f"{self.config.agent_repo!r}"
+                f"missing ${agent.token_env} to fetch the private agent repo {agent.repo!r}"
             )
-        url = f"https://api.github.com/repos/{self.config.agent_repo}/tarball/{self.config.agent_ref}"
+        url = f"https://api.github.com/repos/{agent.repo}/tarball/{agent.ref}"
         headers = {
             "Authorization": f"Bearer {token}",
             "Accept": "application/vnd.github+json",
@@ -219,8 +226,7 @@ def _download_agent(self, dest: Path) -> None:
                 if resp.status_code != 200:
                     resp.read()
                     raise HarnessError(
-                        f"fetching {self.config.agent_repo}@{self.config.agent_ref} failed: "
-                        f"HTTP {resp.status_code}"
+                        f"fetching {agent.repo}@{agent.ref} failed: HTTP {resp.status_code}"
                     )
                 with open(archive, "wb") as handle:
                     for chunk in resp.iter_bytes():
@@ -229,18 +235,17 @@ def _download_agent(self, dest: Path) -> None:
             extract.mkdir()
             with tarfile.open(archive) as tar:
                 tar.extractall(extract, filter="data")
-            matches = sorted(extract.glob(f"*/{self.config.agent_package}"))
+            matches = sorted(extract.glob(f"*/{agent.package}"))
             if not matches:
                 raise HarnessError(
-                    f"{self.config.agent_package!r} not found in "
-                    f"{self.config.agent_repo}@{self.config.agent_ref}"
+                    f"{agent.package!r} not found in {agent.repo}@{agent.ref}"
                 )
             dest.mkdir(parents=True, exist_ok=True)
-            staging = dest / (self.config.agent_package + ".tmp")
+            staging = dest / (agent.package + ".tmp")
             if staging.exists():
                 shutil.rmtree(staging)
             shutil.copytree(matches[0], staging)
-            os.replace(staging, dest / self.config.agent_package)
+            os.replace(staging, dest / agent.package)
 
     def _pythonpath(self) -> str:
         existing = self.config.env.get("PYTHONPATH", "")
@@ -255,6 +260,7 @@ def load_harness(config: MiniBrowseHarnessConfig) -> MiniBrowseHarness:
 
 
 __all__ = [
+    "AgentConfig",
     "MiniBrowseHarness",
     "MiniBrowseHarnessConfig",
     "MiniBrowseTaskPayload",
diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/program.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/program.py
index c0d6ef1c2..09ee45d3d 100644
--- a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/program.py
+++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/program.py
@@ -396,7 +396,13 @@ async def _run(args: argparse.Namespace) -> int:
     )
     if conversation is not None and not isinstance(conversation, list):
         raise ValueError("Mini Browse conversation payload must be a list")
-    model = os.environ.get("OPENAI_MODEL", "intercepted/model")
+    from openai import AsyncOpenAI
+
+    model_client = _read_json(Path(args.model_client))
+    model = model_client["model"]
+    client = AsyncOpenAI(
+        base_url=model_client["base_url"], api_key=model_client["api_key"]
+    )
     coordinate_mode = os.environ.get("MINI_BROWSE_COORDINATE_MODE", "relative_1000")
 
     payload: dict[str, Any]
@@ -415,6 +421,7 @@ async def _run(args: argparse.Namespace) -> int:
             url=start_url,
             output_schema=output_schema,
             model=model,
+            client=client,
             max_steps=int(args.max_steps),
             workspace_root=workspace_root,
             include_builtin_tools=_env_bool("MINI_BROWSE_INCLUDE_BUILTIN_TOOLS"),
@@ -542,6 +549,7 @@ async def _run(args: argparse.Namespace) -> int:
 def _parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(description="Run the Mini Browse harness.")
     parser.add_argument("--task", required=True)
+    parser.add_argument("--model-client", required=True)
     parser.add_argument("--result", required=True)
     parser.add_argument("--transcript", required=True)
     parser.add_argument("--metrics", required=True)
diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/taskset.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/taskset.py
index abdfc95f7..14255bdb7 100644
--- a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/taskset.py
+++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/taskset.py
@@ -32,8 +32,6 @@
     "team-cmlr3u2er002zhr01tj8f48ts/"
     "mini-browse-apps:destination-autocomplete-tight-20260528-0027"
 )
-DEFAULT_HUB_ENV_ID = "prime/mini-browse-apps-platform-v1"
-DEFAULT_DATASET_FILENAME = "google_flights_10.jsonl.gz"
 DATASET_CACHE_DIR = Path.home() / ".cache" / "verifiers" / "mini-browse-apps"
 
 APP_PORT = 5173
@@ -68,10 +66,10 @@ class MiniBrowseAppsConfig(vf.TasksetConfig):
     id: str = "mini-browse-apps-platform-v1"
     dataset_path: str | None = None
     """Explicit local dataset (JSONL/JSONL.GZ); when set, skips the hub pull."""
-    hub_env_id: str = DEFAULT_HUB_ENV_ID
+    hub_env_id: str = "prime/mini-browse-apps-platform-v1"
     """Prime hub environment the dataset is pulled from when no `dataset_path` is given."""
     hub_version: str = "latest"
-    dataset_filename: str = DEFAULT_DATASET_FILENAME
+    dataset_filename: str = "google_flights_10.jsonl.gz"
     judge: JudgeConfig = Field(default_factory=JudgeConfig)
 
 

From bd8dc823873400fd5991936079160af78905955c Mon Sep 17 00:00:00 2001
From: Mika Senghaas <mail@mikasenghaas.de>
Date: Fri, 19 Jun 2026 05:25:58 +0000
Subject: [PATCH 6/7] refactor(v1): drop harness max_steps; cap rollouts via
 --max-turns

The framework enforces --max-turns at the interception layer for any harness, so
the harness-specific max_steps knob is redundant. Remove it, along with the
--max-steps argument the harness passed to program.py; program.py keeps its own
default step backstop, and rollouts are capped with --max-turns.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../mini_browse_apps_platform_v1/harness/__init__.py           | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py
index 1dc043c1d..cb7073d7e 100644
--- a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py
+++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py
@@ -71,7 +71,6 @@ class MiniBrowseHarnessConfig(HarnessConfig):
     id: str = "mini-browse-apps-platform-v1"
     runtime: RuntimeConfig = DockerConfig(image="python:3.12-slim-bookworm")
     agent: AgentConfig = Field(default_factory=AgentConfig)
-    max_steps: int = 75
     coordinate_mode: CoordinateMode = "relative_1000"
     keep_last_images: int = 3
     image_compaction_at_tokens: int = 45_000
@@ -155,8 +154,6 @@ async def launch(
             METRICS_PATH,
             "--progress",
             PROGRESS_PATH,
-            "--max-steps",
-            str(self.config.max_steps),
             "--workspace-root",
             WORKSPACE_ROOT,
         ]

From b7d15e609922c0cac484d848c9a1fccb46ae87eb Mon Sep 17 00:00:00 2001
From: Mika Senghaas <mail@mikasenghaas.de>
Date: Fri, 19 Jun 2026 05:39:00 +0000
Subject: [PATCH 7/7] chore(v1): point harness at the private agent repo; scrub
 naming

- Default the agent source to PrimeIntellect-ai/mini-browse pinned at 157b449
  (the private browser-agent repo), so the env fetches it out of the box.
- Rename the proxy env var to MINI_BROWSE_HTTP_PROXY and say "private" rather
  than "proprietary" in the harness/README.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 environments/mini_browse_apps_platform_v1/README.md  |  8 ++++----
 .../mini_browse_apps_platform_v1/harness/__init__.py | 12 ++++++------
 .../mini_browse_apps_platform_v1/harness/program.py  |  2 +-
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/environments/mini_browse_apps_platform_v1/README.md b/environments/mini_browse_apps_platform_v1/README.md
index e5fb92280..fcd40bdb4 100644
--- a/environments/mini_browse_apps_platform_v1/README.md
+++ b/environments/mini_browse_apps_platform_v1/README.md
@@ -7,17 +7,17 @@ LLM judge scores the submission against a deterministic answer key.
 
 The model must be **multimodal** (the agent's only input is screenshots).
 
-## Proprietary agent (fetched at run time)
+## Private agent (fetched at run time)
 
-The browser agent is **proprietary and not vendored in this repo**. The harness fetches it at run
+The browser agent is **private and not vendored in this repo**. The harness fetches it at run
 time from a **private GitHub repo** (pinned to a commit), caches it under
 `~/.cache/verifiers/browse-agent/<sha>/`, then stages it into the sandbox. Configure via
 `--harness.*`:
 
 | Field | Default | Meaning |
 | --- | --- | --- |
-| `agent.repo` | `PrimeIntellect-ai/plex-mini-browse` | Private `owner/name` to fetch the agent from. |
-| `agent.ref` | _(unset)_ | **Pinned commit sha to fetch (required unless `agent.path` is set).** |
+| `agent.repo` | `PrimeIntellect-ai/mini-browse` | Private `owner/name` to fetch the agent from. |
+| `agent.ref` | `157b449` | Pinned commit sha to fetch (`agent.path` skips the fetch). |
 | `agent.token_env` | `MINI_BROWSE_GITHUB_TOKEN` | Env var holding a GitHub token with read access to `agent.repo`. |
 | `agent.path` | _(unset)_ | Local dir containing the agent package — skips the fetch (development). |
 
diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py
index cb7073d7e..78f67a779 100644
--- a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py
+++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/__init__.py
@@ -1,6 +1,6 @@
 """Browser-app harness: stages a privately-distributed browser agent into the sandbox.
 
-The browser agent is proprietary and is NOT vendored in this repo. It is fetched at run time
+The browser agent is private and is NOT vendored in this repo. It is fetched at run time
 from a private, auth-gated GitHub repo (pinned to a commit), cached locally, then tarred and
 staged into the sandbox, where `program.py` (a uv script) imports and runs it. For local
 development, point `agent.path` at a checkout instead of fetching.
@@ -49,12 +49,12 @@
 
 
 class AgentConfig(BaseConfig):
-    """The proprietary browser agent — fetched at run time from a private repo, not vendored."""
+    """The private browser agent — fetched at run time from a private repo, not vendored."""
 
-    repo: str = "PrimeIntellect-ai/plex-mini-browse"
+    repo: str = "PrimeIntellect-ai/mini-browse"
     """Private GitHub repo (owner/name) the agent is fetched from."""
-    ref: str = ""
-    """Pinned commit sha to fetch (required unless `path` is set)."""
+    ref: str = "157b449"
+    """Pinned commit sha to fetch (`path` skips the fetch for local development)."""
     package: str = "mini_browse"
     """Importable package dir within the repo (and on the sandbox PYTHONPATH) to stage."""
     token_env: str = "MINI_BROWSE_GITHUB_TOKEN"
@@ -66,7 +66,7 @@ class AgentConfig(BaseConfig):
 
 
 class MiniBrowseHarnessConfig(HarnessConfig):
-    """Reusable browser harness; fetches its proprietary agent from a private repo."""
+    """Reusable browser harness; fetches its agent from a private GitHub repo."""
 
     id: str = "mini-browse-apps-platform-v1"
     runtime: RuntimeConfig = DockerConfig(image="python:3.12-slim-bookworm")
diff --git a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/program.py b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/program.py
index 09ee45d3d..d4e4bae93 100644
--- a/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/program.py
+++ b/environments/mini_browse_apps_platform_v1/mini_browse_apps_platform_v1/harness/program.py
@@ -384,7 +384,7 @@ async def _run(args: argparse.Namespace) -> int:
         os.environ["MINI_BROWSE_BROWSER_API_URL"] = browser_api_url
     http_proxy = str(task_payload.get("http_proxy") or "").strip()
     if http_proxy:
-        os.environ["PERPLEXITY_TAILSCALE_HTTP_PROXY"] = http_proxy
+        os.environ["MINI_BROWSE_HTTP_PROXY"] = http_proxy
     source = str(task_payload.get("source") or "verifiers-mini-browse")
     task_preamble = str(
         task_payload.get("task_preamble")