engkimo · engkimo · May 21, 2026 · May 19, 2026 · May 19, 2026 · May 19, 2026
diff --git a/benchmarks/planner_quality_ab.py b/benchmarks/planner_quality_ab.py
@@ -45,20 +45,32 @@
 from pathlib import Path
 
 from domain.entities.fractal_engine import CandidateNode, ExecutionPlan, PlanNode
+from domain.services.planner_model_router import PlannerModelRouter
+from domain.value_objects.planner_model import PlannerModel
+from infrastructure.events.in_memory_event_bus import InMemoryEventBus
 from infrastructure.fractal.llm_plan_evaluator import LLMPlanEvaluator
 from infrastructure.fractal.llm_planner import LLMPlanner
 from infrastructure.llm.cost_tracker import CostTracker
 from infrastructure.llm.litellm_gateway import LiteLLMGateway
 from infrastructure.llm.ollama_manager import OllamaManager
+from infrastructure.metrics.router_metrics import RouterMetrics
+from infrastructure.observability.router_observer import RouterObservingEventBus
 from infrastructure.persistence.in_memory import InMemoryCostRepository
+from infrastructure.routing.llm_goal_classifier import LLMGoalClassifier
 from shared.config import Settings
 
 logging.basicConfig(level=logging.WARNING, format="%(asctime)s %(levelname)s %(message)s")
 logger = logging.getLogger("planner_quality_ab")
 
 SONNET = "claude-sonnet-4-6"
 HAIKU = "claude-haiku-4-5-20251001"
-JUDGE = SONNET  # consistent judge across both arms — eliminates self-grading bias
+ROUTER = "router"  # virtual arm: PlannerModelRouter picks Haiku or Sonnet per goal
+JUDGE = SONNET  # consistent judge across all arms — eliminates self-grading bias
+
+_PLANNER_MODEL_TO_GATEWAY: dict[PlannerModel, str] = {  # router verdict enum → model id string handed to LLMPlanner(model=...)
+    PlannerModel.SONNET: SONNET,
+    PlannerModel.HAIKU: HAIKU,
+}
 
 # 10 goals chosen to span: simple/complex, EN/JA, text/file output, technical/everyday.
 GOALS: list[str] = [
@@ -142,14 +154,16 @@ def _candidates_to_plan(candidates: list[CandidateNode], goal: str) -> Execution
 @dataclass
 class TrialResult:
     goal: str
-    model: str
+    model: str  # arm label (SONNET, HAIKU, or ROUTER); on ROUTER rows the routed model is in chosen_model
     trial: int
     parse_success: bool
     schema_valid: bool
     entity_preserved: float
     plan_eval: float
     candidate_count: int
     cost_usd: float
+    chosen_model: str | None = None  # set on ROUTER-arm rows to the routed PlannerModel's value; defaults to None
+    classifier_cost_usd: float = 0.0  # set on ROUTER-arm rows to the goal's classifier cost, which is also added into cost_usd
     plan_descriptions: list[str] = field(default_factory=list)
 
 
@@ -277,6 +291,81 @@ def line(name: str, s: float, h: float, *, pct: bool) -> tuple[float, bool]:
     return all_ok
 
 
+async def _classify_goals(
+    *,
+    classifier: LLMGoalClassifier,  # NOTE(review): not referenced in this body; all routing goes through `router`
+    router: PlannerModelRouter,
+    goals: list[str],
+) -> dict[str, tuple[PlannerModel, float]]:
+    """Run the router once per goal; return ``{goal: (chosen_model, classifier_cost)}``."""
+    out: dict[str, tuple[PlannerModel, float]] = {}
+    for goal in goals:  # goals are awaited one at a time, in list order
+        chosen, classification = await router.select_for(goal)
+        cost = classification.cost_usd if classification is not None else 0.0  # no classification object → record zero classifier cost
+        out[goal] = (chosen, cost)  # keyed by goal text, so a repeated goal keeps only its last verdict
+    return out
+
+
+def _print_router_summary(
+    *,
+    sonnet: ModelSummary,  # baseline arm every Router metric is compared against
+    haiku: ModelSummary,  # only avg_cost_usd is read from this arm
+    router: ModelSummary,
+    threshold_pt: float,  # allowed drop, in percentage points, for the three pct rows
+    plan_eval_threshold: float,  # allowed absolute drop in plan_eval
+    captured_saving_threshold: float,  # minimum captured-saving ratio required to pass
+    chosen_models: dict[str, str],  # values are tallied for the routing breakdown line
+) -> bool:
+    print("\n=== Router-gated arm summary (per AD-4 acceptance) ===")
+    print(f"{'metric':<20}  {'Sonnet (base)':>14}  {'Router':>10}  "
+          f"{'Δ (Router−Sonnet)':>22}")
+    print("-" * 74)
+
+    def line(name: str, base: float, r: float, *, pct: bool, threshold: float) -> bool:  # prints one comparison row, returns whether it passed
+        delta = r - base  # negative when Router scores below Sonnet
+        b_str = f"{base * 100:>12.1f}%" if pct else f"{base:>14.3f}"
+        r_str = f"{r * 100:>8.1f}%" if pct else f"{r:>10.3f}"
+        d_str = f"{delta * 100:>+19.1f}pt" if pct else f"{delta:>+22.3f}"
+        ok = delta >= -threshold  # passes when Router is no more than `threshold` below Sonnet
+        marker = "✓" if ok else "✗"
+        print(f"{name:<20}  {b_str}  {r_str}  {d_str}  {marker}")
+        return ok
+
+    ok_parse = line("parse_success", sonnet.parse_success, router.parse_success,
+                    pct=True, threshold=threshold_pt / 100)  # points → fraction: pct rows are compared unscaled, ×100 only for display
+    ok_schema = line("schema_valid", sonnet.schema_valid, router.schema_valid,
+                     pct=True, threshold=threshold_pt / 100)
+    ok_entity = line("entity_preserved", sonnet.entity_preserved, router.entity_preserved,
+                     pct=True, threshold=threshold_pt / 100)
+    ok_eval = line("plan_eval", sonnet.plan_eval, router.plan_eval,
+                   pct=False, threshold=plan_eval_threshold)
+
+    print()
+    print(f"avg cost/call: Sonnet ${sonnet.avg_cost_usd:.5f}  "
+          f"Haiku ${haiku.avg_cost_usd:.5f}  Router ${router.avg_cost_usd:.5f}")
+    captured = 0.0  # kept when Sonnet is not costlier than Haiku
+    if sonnet.avg_cost_usd > haiku.avg_cost_usd:  # guard also rules out a zero or negative denominator below
+        captured = (
+            (sonnet.avg_cost_usd - router.avg_cost_usd)  # saving the Router arm achieved vs Sonnet
+            / (sonnet.avg_cost_usd - haiku.avg_cost_usd)  # saving a Haiku-only run would achieve
+        )
+        print(f"captured-saving (Router) vs theoretical max (Haiku-only): "
+              f"{captured * 100:.1f}%")
+    ok_capture = captured >= captured_saving_threshold  # with captured left at 0.0 this fails for any positive threshold
+
+    counts: dict[str, int] = {}  # chosen-model value → number of goals routed to it
+    for v in chosen_models.values():
+        counts[v] = counts.get(v, 0) + 1
+    print(f"router routing breakdown: {counts}")
+
+    all_ok = ok_parse and ok_schema and ok_entity and ok_eval and ok_capture  # all four quality rows and the saving bar must pass
+    verdict = ("PASS — Router meets AD-4 quality + captured-saving thresholds"
+               if all_ok
+               else "FAIL — Router violates at least one AD-4 acceptance bar")
+    print(f"\nRouter verdict: {verdict}")
+    return all_ok
+
+
 async def _main(args: argparse.Namespace) -> int:
     settings = Settings()
     if not settings.has_anthropic:
@@ -289,61 +378,135 @@ async def _main(args: argparse.Namespace) -> int:
 
     evaluator = LLMPlanEvaluator(gateway, models=[JUDGE])
 
-    print("=== LLMPlanner quality A/B: Sonnet 4.6 vs Haiku 4.5 ===")
+    arms = (SONNET, HAIKU, ROUTER) if args.router else (SONNET, HAIKU)
+    title = ("Sonnet 4.6 vs Haiku 4.5 vs Router"
+             if args.router
+             else "Sonnet 4.6 vs Haiku 4.5")
+    print(f"=== LLMPlanner quality A/B: {title} ===")
     print(f"goals: {len(GOALS)}  trials/model: {args.trials}  judge: {JUDGE}")
     print(f"cost cap: ${args.cost_cap_usd:.2f}\n")
 
+    chosen_models: dict[str, str] = {}
+    classifier_cost_total = 0.0
+    if args.router:
+        classifier = LLMGoalClassifier(gateway=gateway)
+        metrics = RouterMetrics()
+        bus = RouterObservingEventBus(inner=InMemoryEventBus(), metrics=metrics)
+        router = PlannerModelRouter(
+            classifier=classifier,
+            event_bus=bus,
+            enabled=True,
+            haiku_confidence_threshold=0.7,
+            classifier_timeout_ms=5_000,
+        )
+        print("  [router] classifying 10 goals...", flush=True)
+        verdicts = await _classify_goals(
+            classifier=classifier, router=router, goals=GOALS
+        )
+        for g, (m, c) in verdicts.items():
+            chosen_models[g] = m.value
+            classifier_cost_total += c
+        print(f"  [router] classifier cost: ${classifier_cost_total:.5f}  "
+              f"breakdown: {chosen_models}\n", flush=True)
+
     rows: list[TrialResult] = []
-    for model in (SONNET, HAIKU):
-        planner = LLMPlanner(gateway, candidates_per_node=3, max_depth=3, model=model)
-        for goal in GOALS:
-            for trial in range(1, args.trials + 1):
-                running = sum(r.cost_usd for r in cost_repo.records)
-                if running > args.cost_cap_usd:
-                    print(f"\n!! cost cap ${args.cost_cap_usd:.2f} exceeded "
-                          f"(spent ${running:.4f}) — aborting", file=sys.stderr)
-                    _print_detail(rows)
-                    return 2
-                print(f"  {model} | trial {trial} | {goal[:60]}", flush=True)
-                row = await _run_one(
-                    planner=planner,
-                    evaluator=evaluator,
-                    cost_repo=cost_repo,
-                    goal=goal,
-                    model=model,
-                    trial=trial,
+    for arm in arms:
+        if arm == ROUTER:
+            for goal in GOALS:
+                pm, cls_cost = verdicts[goal]
+                planner_model = _PLANNER_MODEL_TO_GATEWAY[pm]
+                planner = LLMPlanner(
+                    gateway, candidates_per_node=3, max_depth=3, model=planner_model
                 )
-                rows.append(row)
+                for trial in range(1, args.trials + 1):
+                    running = sum(r.cost_usd for r in cost_repo.records)
+                    if running > args.cost_cap_usd:
+                        print(f"\n!! cost cap ${args.cost_cap_usd:.2f} exceeded "
+                              f"(spent ${running:.4f}) — aborting", file=sys.stderr)
+                        _print_detail(rows)
+                        return 2
+                    print(f"  router→{pm.value} | trial {trial} | {goal[:50]}",
+                          flush=True)
+                    row = await _run_one(
+                        planner=planner,
+                        evaluator=evaluator,
+                        cost_repo=cost_repo,
+                        goal=goal,
+                        model=ROUTER,
+                        trial=trial,
+                    )
+                    row.chosen_model = pm.value
+                    row.classifier_cost_usd = cls_cost
+                    # Roll the per-goal classifier overhead into the router cost.
+                    row.cost_usd = round(row.cost_usd + cls_cost, 6)
+                    rows.append(row)
+        else:
+            planner = LLMPlanner(gateway, candidates_per_node=3, max_depth=3, model=arm)
+            for goal in GOALS:
+                for trial in range(1, args.trials + 1):
+                    running = sum(r.cost_usd for r in cost_repo.records)
+                    if running > args.cost_cap_usd:
+                        print(f"\n!! cost cap ${args.cost_cap_usd:.2f} exceeded "
+                              f"(spent ${running:.4f}) — aborting", file=sys.stderr)
+                        _print_detail(rows)
+                        return 2
+                    print(f"  {arm} | trial {trial} | {goal[:60]}", flush=True)
+                    row = await _run_one(
+                        planner=planner,
+                        evaluator=evaluator,
+                        cost_repo=cost_repo,
+                        goal=goal,
+                        model=arm,
+                        trial=trial,
+                    )
+                    rows.append(row)
 
     _print_detail(rows)
 
     sonnet_sum = _summarize(rows, SONNET)
     haiku_sum = _summarize(rows, HAIKU)
     passed = _print_summary(sonnet_sum, haiku_sum, args.threshold_pt)
 
+    router_passed = True
+    if args.router:
+        router_sum = _summarize(rows, ROUTER)
+        router_passed = _print_router_summary(
+            sonnet=sonnet_sum,
+            haiku=haiku_sum,
+            router=router_sum,
+            threshold_pt=args.threshold_pt,
+            plan_eval_threshold=args.plan_eval_threshold,
+            captured_saving_threshold=args.captured_saving_threshold,
+            chosen_models=chosen_models,
+        )
+
     total_cost = sum(r.cost_usd for r in cost_repo.records)
     print(f"\nTotal benchmark cost: ${total_cost:.4f} ({len(cost_repo.records)} LLM calls)")
+    if args.router:
+        print(f"  (router classifier overhead: ${classifier_cost_total:.5f})")
 
     if args.dump:
+        dump_payload: dict[str, object] = {
+            "judge": JUDGE,
+            "trials": args.trials,
+            "router_mode": args.router,
+            "rows": [r.__dict__ for r in rows],
+            "summary": {
+                "sonnet": sonnet_sum.__dict__,
+                "haiku": haiku_sum.__dict__,
+            },
+            "total_cost_usd": round(total_cost, 6),
+        }
+        if args.router:
+            dump_payload["summary"]["router"] = _summarize(rows, ROUTER).__dict__  # type: ignore[index]
+            dump_payload["router_chosen_models"] = chosen_models
+            dump_payload["router_classifier_cost_usd"] = round(classifier_cost_total, 6)
         Path(args.dump).write_text(
-            json.dumps(
-                {
-                    "judge": JUDGE,
-                    "trials": args.trials,
-                    "rows": [r.__dict__ for r in rows],
-                    "summary": {
-                        "sonnet": sonnet_sum.__dict__,
-                        "haiku": haiku_sum.__dict__,
-                    },
-                    "total_cost_usd": round(total_cost, 6),
-                },
-                indent=2,
-                ensure_ascii=False,
-            )
+            json.dumps(dump_payload, indent=2, ensure_ascii=False)
         )
         print(f"Raw results dumped to {args.dump}")
 
-    return 0 if passed else 1
+    return 0 if (passed and router_passed) else 1
 
 
 def _parse() -> argparse.Namespace:
@@ -356,6 +519,12 @@ def _parse() -> argparse.Namespace:
                    help="Pass if Haiku is within this many points of Sonnet on every axis.")
     p.add_argument("--dump", type=str, default=None,
                    help="Optional path to dump raw JSON results.")
+    p.add_argument("--router", action="store_true",
+                   help="Enable router-gated 3rd arm (AD-4 per-goal routing).")
+    p.add_argument("--plan-eval-threshold", type=float, default=0.030,
+                   help="Router arm passes plan_eval if Δ >= -this (default 0.030).")
+    p.add_argument("--captured-saving-threshold", type=float, default=0.30,
+                   help="Router arm passes captured-saving if >= this (default 0.30).")
     return p.parse_args()
 
 

diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
@@ -1,5 +1,11 @@
 # CLAUDE.md Changelog
 
+## Unreleased
+
+- **[FEAT/TD-195]** Goal Classifier Router for planner model selection — `domain/ports/goal_classifier.py` (ABC) + `infrastructure/routing/{llm,local}_goal_classifier.py` (LLM + Ollama impls) + `domain/services/planner_model_router.py`. Per-goal routing of `LLMPlanner` between Sonnet 4.6 and Haiku 4.5, gated by confidence threshold and `MORPHIC_PLANNER_ROUTER` flag (default disabled). Targets the −11.4pt entity_preserved regression from `haiku_planner_ab_2026_05_19` while retaining ≥30% of Haiku's 47.6% cost saving on the eligible slice. See `specs/goal-classifier-router/`.
+
+---
+
 ## v0.6.1 → v0.6.2 (2026-05-15) — **Council Pilot full merge + TD-189 per-task cache_hit_rate + TD-192 fractal-entry latency cut + Haiku 4.5 threshold pinned**
 
 - **[PERF/TD-192]** `OutputRequirementClassifier.classify()` を `FractalBypassClassifier.should_bypass()` 内に折り畳み、fractal-entry の LLM 呼出を **2 → 1** に削減。`BypassDecision` を `(bypass, complexity, output_requirement, reason)` に拡張、`FractalTaskEngine` 側の二重呼出を撤廃。Round 22 live regression (`test_round22_td192_latency.py`, real qwen3:8b) で実測: 2 ゴール × 1 call = 2 total (baseline 4)、artifact ゴール 7.80s, text ゴール 1.08s。TD-191 architectural guard は完全維持

diff --git a/docs/CONTINUATION.md b/docs/CONTINUATION.md
@@ -1,7 +1,45 @@
 # Morphic-Agent — Continuation State
 
-> Last updated: 2026-04-13
-> Last commit: `fix: hard time-based timeout for fractal engine + Round 19 E2E verification (TD-181)`
+> Last updated: 2026-05-20
+> Last commit: `feat(router): Goal Classifier Router for planner model selection (TD-195)`
+> Branch: `feature/goal-classifier-router` (HEAD `e49499c`)
+
+---
+
+## What Was Just Done (2026-05-20)
+
+### Sprint 91 (TD-195) — Goal Classifier Router
+
+**TD-195: Per-goal routing of `LLMPlanner` between Sonnet 4.6 and Haiku 4.5**
+
+Spec-driven (`specs/goal-classifier-router/{spec,plan,tasks}.md`), full
+TDD on `feature/goal-classifier-router`. Implements:
+
+- `GoalClassifierPort` (domain ABC) + `GoalClassification` VO + AD-3
+  6-bucket `ReasonCategory` Literal.
+- `PlannerModelRouter.select_for(goal) → (PlannerModel, GoalClassification | None)`
+  — confidence-gated, fail-safe to Sonnet on timeout / parse error.
+- `LLMGoalClassifier` (Haiku 4.5 via LiteLLM) + `LocalGoalClassifier`
+  (qwen3:8b via Ollama) — share byte-identical `SYSTEM_PROMPT` per TD-190.
+- `EventBusPort` + `InMemoryEventBus` + `RouterObservingEventBus`
+  decorator (metrics + structured logs). `sha256(goal)[:16]` only —
+  raw goal **never** serialized.
+- `MORPHIC_PLANNER_ROUTER` env flag (default `disabled`, opt-in
+  `remote` / `local`).
+
+**Live A/B verdict** (3 arms × 10 goals × 3 trials, $0.97 total):
+
+- entity_preserved: 83.8% (Sonnet) → 81.3% (Router) = **−2.5pt** (≤5pt ✓)
+- plan_eval:        0.898 → 0.884 = **−0.014** (≤0.030 ✓)
+- avg cost / call:  $0.01351 → $0.01218 = **−9.85%**
+- Routing: 4/10 Haiku, 6/10 Sonnet (entity-stressed benchmark)
+- Captured-saving: 20.9% (misses the 30% paper bar — structural to the workload mix)
+
+Memo: `memory/planner_router_ab_2026_05_20.md`. Ship recommendation
+documented.
+
+### Sprint 90 (TD-194) — Council Pilot full merge
+(See `docs/CHANGELOG.md` for the v0.6.1 → v0.6.2 detail.)
 
 ---
 

diff --git a/docs/ENV_VARS.md b/docs/ENV_VARS.md
@@ -47,6 +47,11 @@ LAEE_BROWSER_HEADLESS=true
 LAEE_GUI_ENABLED=true
 LAEE_CRON_ENABLED=true
 
+# ── Planner Model Router (v0.6.3, TD-195) ──
+MORPHIC_PLANNER_ROUTER=disabled                          # disabled | enabled (enabled時はANTHROPIC_API_KEY有無でremote/localを自動選択)
+MORPHIC_PLANNER_ROUTER_HAIKU_CONFIDENCE_THRESHOLD=0.7    # 0.0–1.0; Haikuを選ぶ最小信頼度
+MORPHIC_PLANNER_ROUTER_CLASSIFIER_TIMEOUT_MS=1500        # classifier hard timeout (ms) → Sonnet fallback
+
 # ── Morphic Settings ──
 MORPHIC_ENV=development
 AUTO_TOOL_INSTALL=false       # true: 自動, false: 承認制