diff --git a/hindsight-tools/mission-sandbox/.gitignore b/hindsight-tools/mission-sandbox/.gitignore new file mode 100644 index 000000000..3baed7658 --- /dev/null +++ b/hindsight-tools/mission-sandbox/.gitignore @@ -0,0 +1,8 @@ +node_modules +dist +.next +.next-* +standalone +next-env.d.ts +*.tsbuildinfo +projects diff --git a/hindsight-tools/mission-sandbox/README.md b/hindsight-tools/mission-sandbox/README.md new file mode 100644 index 000000000..5f1e659a8 --- /dev/null +++ b/hindsight-tools/mission-sandbox/README.md @@ -0,0 +1,130 @@ +# @vectorize-io/hindsight-mission-sandbox + +Tune Hindsight's **retain (extraction)** and **observation (consolidation)** missions against your +own task, then verify with an **external validator** (a benchmark like LOCOMO, or your app's eval). + +The tool is deliberately small and opinionated: + +- **You bring** the documents and a way to score success (the validator). The tool does **not** + measure accuracy or label facts — task success is decoupled from the tool. +- **You refine a mission with feedback.** After looking at validator results, you hand the tool + _feedback_ (and optional failing examples); it rewrites the current mission. No good/bad labeling. +- **Retain iterates across versioned banks** (`-v1`, `-v2`, …) so you can point the + validator at any version and compare. **Observations iterate in place** (clear + re-consolidate), + since they're re-derived from the same facts. + +## The loop + +``` +init (bind docs) + └─ retain mission (feedback + examples → refine retain mission) + └─ retain apply (ingest docs into a NEW bank -vN) + └─ VALIDATE EXTERNALLY against -vN ─┐ + ┌──────────────────────────────────────────────────┘ failures become the next feedback + ▼ + retain mission (feedback) → retain apply → validate → … + +observe mission (feedback → refine obs mission) → observe apply (clear obs + re-consolidate on current bank) → validate +``` + +The validator is never inside the tool. 
A typical round: run your eval against `-vN`, +read what failed, then `retain mission --feedback "<...>" --example "<...>"` +→ `retain apply` (new version) → re-validate. + +## Commands + +```bash +# bind a project to its documents (no ingest yet) +mission-sandbox init --documents <path> [--api-url URL] + +# RETAIN loop — iterates across versioned banks +mission-sandbox retain mission --feedback "<...>" [--example "<...>" ...] +mission-sandbox retain apply # ingest docs → new bank -vN, prints the bank id + +# OBSERVE loop — iterates in place on the current bank +mission-sandbox observe mission --feedback "<...>" [--example "<...>" ...] +mission-sandbox observe apply # clear observations on current bank + re-consolidate + +mission-sandbox status # bound docs, current missions, versions (+ bank ids) +mission-sandbox ui # minimal UI: project status + versions +``` + +- `retain mission` / `observe mission` refine the **current** mission from your feedback (+ examples); + the first call (no prior mission) treats the feedback as the initial spec. The LLM sees the current + mission + feedback + examples — nothing else, no labels. +- `retain apply` always creates the **next** version bank and ingests into it. Point your validator + (e.g. LOCOMO `--template`/the bank id) at that version. +- `--model` overrides the Gemini model used for mission refinement (default `gemini-2.5-flash`, or + `HINDSIGHT_API_LLM_MODEL`). Mission refinement is the **only** LLM call the tool makes; ingestion + + consolidation run on the Hindsight deployment. + +## Verifying with LOCOMO (example external validator) + +The LOCOMO runner is unchanged and is the **only** thing that measures accuracy. 
Build a template +from a version's missions and point the runner at it (default mode — **no `--use-reflect`**): + +```bash +# representative subset: trim the runner's input to N per category (data only — restore after) +cd hindsight-dev/benchmarks/locomo/datasets && cp locomo10.json locomo10.full.json +N=5 # widen to 10+ once a mission looks good, to confirm it generalises and surface weak categories +python3 - "$N" <<'PY' +import json, sys +n=int(sys.argv[1]); d=json.load(open("locomo10.json")) +for s in d: + if s["sample_id"]!="<conv-id>": continue + s["qa"]=[q for c in (1,2,3,4) for q in [x for x in s["qa"] if x.get("category")==c and x.get("answer")][:n]] +json.dump(d,open("locomo10.json","w")) +PY +cd - >/dev/null # back to the repo root: every path below is repo-relative +# verify a version's missions +python3 -c "import json;p=json.load(open('<project-dir>/project.json'));v=p['versions'][-1]; \ + json.dump({'version':'1','bank':{'retain_mission':v['retainMission'],'observations_mission':v.get('observeMission')}}, \ + open('<project-dir>/template.json','w'))" +set -a; source hindsight-api-slim/.env; set +a; export HINDSIGHT_API_LLM_MODEL=gemini-2.5-flash +uv run --project hindsight-dev python hindsight-dev/benchmarks/locomo/locomo_benchmark.py \ + --conversation <conv-id> --wait-consolidation --template <project-dir>/template.json +# results: hindsight-dev/benchmarks/locomo/results/benchmark_results.json (by-category is_correct) +mv hindsight-dev/benchmarks/locomo/datasets/locomo10.full.json hindsight-dev/benchmarks/locomo/datasets/locomo10.json +``` + +Read accuracy **by category**; a weak category is your next `--feedback`. Notes from real runs: +single-question swings between runs are **recall variance** (each apply re-ingests) — watch +category trends; and verify a "failure" against the transcript before chasing it (some benchmark +golds are wrong). 
+ +## Project model (`project.json`) + +```jsonc +{ + "documents": "/path/to/docs", // bound at init + "apiUrl": "http://localhost:8888", + "retain": { "mission": "…", "feedback": ["…"] }, + "observe": { "mission": "…", "feedback": ["…"] }, + "versions": [ + { + "n": 1, + "bank": "-v1", + "retainMission": "…", + "observeMission": "…", + "createdAt": "…", + }, + ], + "currentVersion": 1, +} +``` + +## Setup + +```bash +npm install +npm run build --workspace @vectorize-io/hindsight-mission-sandbox +export GEMINI_API_KEY=... # or GOOGLE_API_KEY, or a Gemini HINDSIGHT_API_LLM_* in your .env +``` + +## Development + +```bash +npm run test # vitest unit tests for core +npm run typecheck # tsc for the lib + the Next app +npm run build # build the core lib (dist) + the minimal Next UI +``` diff --git a/hindsight-tools/mission-sandbox/bin/cli.js b/hindsight-tools/mission-sandbox/bin/cli.js new file mode 100755 index 000000000..27e7cb818 --- /dev/null +++ b/hindsight-tools/mission-sandbox/bin/cli.js @@ -0,0 +1,13 @@ +#!/usr/bin/env node +// Thin launcher: delegate to the compiled CLI. Run `npm run build:lib` (or `npm run build`) +// to produce dist/. For development without a build, use `npm run cli -- ` (tsx). +import("../dist/cli/index.js").catch((err) => { + if (err && err.code === "ERR_MODULE_NOT_FOUND") { + console.error( + "mission-sandbox: build output missing. Run `npm run build` in the package first." + ); + } else { + console.error(err); + } + process.exit(1); +}); diff --git a/hindsight-tools/mission-sandbox/next.config.ts b/hindsight-tools/mission-sandbox/next.config.ts new file mode 100644 index 000000000..b1e2befbb --- /dev/null +++ b/hindsight-tools/mission-sandbox/next.config.ts @@ -0,0 +1,9 @@ +import type { NextConfig } from "next"; + +const nextConfig: NextConfig = { + output: "standalone", + // core is consumed as a built package (dist); its heavy runtime deps stay external. 
+ serverExternalPackages: ["@google/genai", "@vectorize-io/hindsight-client"], +}; + +export default nextConfig; diff --git a/hindsight-tools/mission-sandbox/package.json b/hindsight-tools/mission-sandbox/package.json new file mode 100644 index 000000000..851a7fa85 --- /dev/null +++ b/hindsight-tools/mission-sandbox/package.json @@ -0,0 +1,74 @@ +{ + "name": "@vectorize-io/hindsight-mission-sandbox", + "version": "0.1.0", + "description": "Iterate on Hindsight observation missions with a fast feedback loop — CLI + Next.js UI", + "type": "module", + "main": "./dist/core/index.js", + "types": "./dist/core/index.d.ts", + "exports": { + ".": { + "types": "./dist/core/index.d.ts", + "import": "./dist/core/index.js" + }, + "./core": { + "types": "./dist/core/index.d.ts", + "import": "./dist/core/index.js" + } + }, + "bin": { + "mission-sandbox": "bin/cli.js" + }, + "files": [ + "bin", + "dist", + "standalone", + "public" + ], + "scripts": { + "build": "npm run build:lib && npm run build:ui", + "build:lib": "tsc -p tsconfig.lib.json", + "build:ui": "NODE_ENV=production next build && npm run build:standalone", + "build:standalone": "rm -rf standalone && SERVER_JS=$(find .next/standalone -path '*/node_modules' -prune -o -name 'server.js' -print | head -1) && test -n \"$SERVER_JS\" || (echo 'Error: server.js not found in .next/standalone - standalone build failed' && exit 1) && STANDALONE_ROOT=$(dirname \"$SERVER_JS\") && cp -r \"$STANDALONE_ROOT\" standalone && cp -r .next/standalone/node_modules standalone/node_modules && mkdir -p standalone/.next && cp -r .next/static standalone/.next/static && mkdir -p standalone/public && (cp -r public/* standalone/public/ 2>/dev/null || true)", + "dev": "npm run build:lib && next dev -p ${PORT:-7777}", + "cli": "tsx src/cli/index.ts", + "start": "next start -p ${PORT:-7777}", + "lint": "next lint", + "typecheck": "tsc -p tsconfig.lib.json --noEmit && tsc --noEmit", + "test": "vitest run", + "test:watch": "vitest", + "prepublishOnly": 
"npm run build" + }, + "keywords": [ + "hindsight", + "memory", + "observations", + "mission", + "prompt-optimization" + ], + "license": "MIT", + "repository": { + "type": "git", + "url": "https://github.com/vectorize-io/hindsight.git", + "directory": "hindsight-tools/mission-sandbox" + }, + "dependencies": { + "@google/genai": "^2.7.0", + "@vectorize-io/hindsight-client": "^0.7.0", + "commander": "^14.0.0", + "next": "^16.2.6", + "react": "^19.2.0", + "react-dom": "^19.2.0" + }, + "devDependencies": { + "@tailwindcss/postcss": "^4.1.17", + "@types/node": "^24.10.0", + "@types/react": "^19.2.2", + "@types/react-dom": "^19.2.2", + "eslint": "^9.39.1", + "eslint-config-next": "^16.0.1", + "tailwindcss": "^4.1.17", + "tsx": "^4.19.2", + "typescript": "^5.9.3", + "vitest": "^4.1.2" + } +} diff --git a/hindsight-tools/mission-sandbox/postcss.config.mjs b/hindsight-tools/mission-sandbox/postcss.config.mjs new file mode 100644 index 000000000..c7bcb4b1e --- /dev/null +++ b/hindsight-tools/mission-sandbox/postcss.config.mjs @@ -0,0 +1,5 @@ +const config = { + plugins: ["@tailwindcss/postcss"], +}; + +export default config; diff --git a/hindsight-tools/mission-sandbox/src/app/api/extract/route.ts b/hindsight-tools/mission-sandbox/src/app/api/extract/route.ts new file mode 100644 index 000000000..b778a4198 --- /dev/null +++ b/hindsight-tools/mission-sandbox/src/app/api/extract/route.ts @@ -0,0 +1,28 @@ +import { runExtractPreview } from "@vectorize-io/hindsight-mission-sandbox/core"; + +import { projectDir } from "@/app/lib/project-context"; + +export const runtime = "nodejs"; +export const dynamic = "force-dynamic"; + +/** Dry-run extraction preview: what does this mission extract from the given text? 
(no ingest) */ +export async function POST(req: Request) { + const body = ((await req.json().catch(() => ({}))) ?? {}) as { // `?? {}`: a JSON body of `null` parses fine but would throw on property access + project?: string; + content?: string; + retainMission?: string | null; + }; // the shape is asserted, not validated, hence the typeof checks below + if (typeof body.project !== "string" || typeof body.content !== "string" || !body.project || !body.content) { + return Response.json({ error: "project and content are required" }, { status: 400 }); + } + try { + const facts = await runExtractPreview({ + projectDir: projectDir(body.project), + content: body.content, + retainMission: body.retainMission, + }); + return Response.json({ facts }); + } catch (e) { // any failure while resolving the project or running the preview is reported as a 500 + return Response.json({ error: e instanceof Error ? e.message : String(e) }, { status: 500 }); + } +} diff --git a/hindsight-tools/mission-sandbox/src/app/components/ExtractPanel.tsx b/hindsight-tools/mission-sandbox/src/app/components/ExtractPanel.tsx new file mode 100644 index 000000000..bb505d57c --- /dev/null +++ b/hindsight-tools/mission-sandbox/src/app/components/ExtractPanel.tsx @@ -0,0 +1,106 @@ +"use client"; + +import { useState } from "react"; + +interface PreviewFact { + text: string; + factType: string; + occurredStart: string | null; + occurredEnd: string | null; + entities: string[]; +} + +/** + * Dry-run extraction preview: paste text + an optional mission, see what the retain step would + * extract — with no ingestion, no persistence. Backed by the /memories/extract API. + */ +export function ExtractPanel({ + project, + defaultMission, +}: { + project: string; + defaultMission: string | null; +}) { + const [content, setContent] = useState(""); + const [mission, setMission] = useState(defaultMission ?? 
""); + const [facts, setFacts] = useState(null); + const [loading, setLoading] = useState(false); + const [error, setError] = useState(null); + + async function run() { + setLoading(true); + setError(null); + setFacts(null); + try { + const res = await fetch("/api/extract", { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ project, content, retainMission: mission || null }), + }); + const data = await res.json(); + if (!res.ok) throw new Error(data.error || `HTTP ${res.status}`); + setFacts(data.facts as PreviewFact[]); + } catch (e) { + setError(e instanceof Error ? e.message : String(e)); + } finally { + setLoading(false); + } + } + + return ( +
+ + Dry-run extraction (preview a mission, no ingest) + + + +