Skip to content
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
31505c9
docs(goal-classifier-router): scaffold spec + CHANGELOG scope
engkimo May 19, 2026
0316f75
feat(domain): T010-T015 PlannerModel, GoalClassification, GoalClassif…
engkimo May 19, 2026
9e30401
feat(domain): T020-T021 GoalClassifierPort ABC
engkimo May 19, 2026
679c292
test(_fakes): T030 InMemoryGoalClassifier port-compliant fake
engkimo May 19, 2026
5ff44e9
refactor(domain): align ReasonCategory Literal with plan.md AD-3 taxo…
engkimo May 19, 2026
4a2be8d
feat(domain): T040-T041 PlannerModelRouter with AD-2 gating + AD-3 no…
engkimo May 19, 2026
ac496a4
feat(infrastructure): T050-T051 shared classifier prompt + parser
engkimo May 19, 2026
405d0d3
feat(infrastructure): T060-T071 LLMGoalClassifier (remote) + LocalGoa…
engkimo May 19, 2026
cbf3b41
feat(infrastructure): T080-T081 wire PlannerModelRouter into LLMPlanner
engkimo May 19, 2026
840b3d3
feat(config): T090 add planner router settings (MORPHIC_PLANNER_ROUTER)
engkimo May 19, 2026
f6a6ce7
feat(interface): T091 wire PlannerModelRouter into AppContainer
engkimo May 19, 2026
dcdec38
feat(observability): T100-T101 RouterMetrics + RouterObservingEventBus
engkimo May 19, 2026
58df4a1
feat(routing): T110-T111 live integration tests + RouterObservingEven…
engkimo May 19, 2026
00df10a
bench(planner): T120 — add --router mode for AD-4 per-goal routing
engkimo May 20, 2026
e49499c
bench(planner): T121 — live 3-arm A/B dump (TD-195 router)
engkimo May 20, 2026
e1428fc
docs: T130-T132 — TD-195 ADR + ENV_VARS + CONTINUATION update
engkimo May 20, 2026
53bc64d
style(routing): T140 — ruff cleanup for router test files
engkimo May 20, 2026
ba605c1
docs: fix TD-195 env-var values flagged in self-review
engkimo May 20, 2026
4b2c75e
fix(td-195): address CodeRabbit Major findings before merge
engkimo May 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
243 changes: 206 additions & 37 deletions benchmarks/planner_quality_ab.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,20 +45,32 @@
from pathlib import Path

from domain.entities.fractal_engine import CandidateNode, ExecutionPlan, PlanNode
from domain.services.planner_model_router import PlannerModelRouter
from domain.value_objects.planner_model import PlannerModel
from infrastructure.events.in_memory_event_bus import InMemoryEventBus
from infrastructure.fractal.llm_plan_evaluator import LLMPlanEvaluator
from infrastructure.fractal.llm_planner import LLMPlanner
from infrastructure.llm.cost_tracker import CostTracker
from infrastructure.llm.litellm_gateway import LiteLLMGateway
from infrastructure.llm.ollama_manager import OllamaManager
from infrastructure.metrics.router_metrics import RouterMetrics
from infrastructure.observability.router_observer import RouterObservingEventBus
from infrastructure.persistence.in_memory import InMemoryCostRepository
from infrastructure.routing.llm_goal_classifier import LLMGoalClassifier
from shared.config import Settings

logging.basicConfig(level=logging.WARNING, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger("planner_quality_ab")

# Gateway model identifiers for the two real planner arms.
SONNET = "claude-sonnet-4-6"
HAIKU = "claude-haiku-4-5-20251001"
ROUTER = "router"  # virtual arm: PlannerModelRouter picks Haiku or Sonnet per goal
JUDGE = SONNET  # consistent judge across all arms — eliminates self-grading bias

# Translates a PlannerModel value into the gateway model identifier string
# defined above.
_PLANNER_MODEL_TO_GATEWAY: dict[PlannerModel, str] = {
    PlannerModel.SONNET: SONNET,
    PlannerModel.HAIKU: HAIKU,
}

# 10 goals chosen to span: simple/complex, EN/JA, text/file output, technical/everyday.
GOALS: list[str] = [
Expand Down Expand Up @@ -142,14 +154,16 @@ def _candidates_to_plan(candidates: list[CandidateNode], goal: str) -> Execution
@dataclass
class TrialResult:
    """Metrics recorded for a single benchmark run of one goal under one arm.

    Instances are plain (non-slotted) dataclasses so that ``__dict__`` can be
    used to serialize a row.
    """

    goal: str
    model: str  # arm label: SONNET, HAIKU, or ROUTER
    trial: int
    parse_success: bool
    schema_valid: bool
    entity_preserved: float
    plan_eval: float
    candidate_count: int
    cost_usd: float
    chosen_model: str | None = None  # for ROUTER arm — actual planner model used
    classifier_cost_usd: float = 0.0  # for ROUTER arm — extra classifier overhead
    plan_descriptions: list[str] = field(default_factory=list)


Expand Down Expand Up @@ -277,6 +291,81 @@ def line(name: str, s: float, h: float, *, pct: bool) -> tuple[float, bool]:
return all_ok


async def _classify_goals(
    *,
    classifier: LLMGoalClassifier,
    router: PlannerModelRouter,
    goals: list[str],
) -> dict[str, tuple[PlannerModel, float]]:
    """Run the router once per goal; return ``{goal: (chosen_model, classifier_cost)}``.

    Goals are awaited one at a time, in the order given. Because the result
    is keyed by the goal string, a goal that appears more than once is routed
    again and its later verdict replaces the earlier one.

    Args:
        classifier: Not referenced in this function; only
            ``router.select_for`` is called. Kept so existing keyword
            call sites continue to work.
        router: Object whose ``select_for(goal)`` coroutine returns a
            ``(chosen_model, classification)`` pair.
        goals: Goal strings to route.

    Returns:
        Mapping from each goal to ``(chosen_model, cost)``, where ``cost`` is
        ``classification.cost_usd`` or ``0.0`` when the router returned no
        classification.
    """
    out: dict[str, tuple[PlannerModel, float]] = {}
    for goal in goals:
        chosen, classification = await router.select_for(goal)
        # A missing classification is charged as zero classifier cost.
        cost = classification.cost_usd if classification is not None else 0.0
        out[goal] = (chosen, cost)
    return out


def _print_router_summary(
    *,
    sonnet: ModelSummary,
    haiku: ModelSummary,
    router: ModelSummary,
    threshold_pt: float,
    plan_eval_threshold: float,
    captured_saving_threshold: float,
    chosen_models: dict[str, str],
) -> bool:
    """Print the Router-vs-Sonnet comparison table and return the verdict.

    Args:
        sonnet: Baseline summary. Reads ``parse_success``, ``schema_valid``,
            ``entity_preserved``, ``plan_eval`` and ``avg_cost_usd``.
        haiku: Haiku summary; only ``avg_cost_usd`` is read.
        router: Router-arm summary; same attributes as ``sonnet``.
        threshold_pt: Allowed drop, in percentage points, on the three
            ratio metrics (divided by 100 before comparing).
        plan_eval_threshold: Allowed absolute drop on ``plan_eval``.
        captured_saving_threshold: Minimum fraction of the Sonnet→Haiku cost
            gap that the router must capture.
        chosen_models: ``{goal: model_label}``; only the values are counted
            for the routing breakdown line.

    Returns:
        ``True`` only if all four quality metrics and the captured-saving
        check pass.
    """
    # Absolute slack for the threshold comparisons. Without it a delta that
    # is exactly on the bar can fail purely from binary rounding, e.g.
    # 0.85 - 0.9 == -0.050000000000000044 < -0.05.
    tolerance = 1e-9

    print("\n=== Router-gated arm summary (per AD-4 acceptance) ===")
    print(f"{'metric':<20} {'Sonnet (base)':>14} {'Router':>10} "
          f"{'Δ (Router−Sonnet)':>22}")
    print("-" * 74)

    def line(name: str, base: float, r: float, *, pct: bool, threshold: float) -> bool:
        """Print one metric row; return whether the router is within ``threshold``."""
        delta = r - base
        b_str = f"{base * 100:>12.1f}%" if pct else f"{base:>14.3f}"
        r_str = f"{r * 100:>8.1f}%" if pct else f"{r:>10.3f}"
        d_str = f"{delta * 100:>+19.1f}pt" if pct else f"{delta:>+22.3f}"
        ok = delta >= -threshold - tolerance
        marker = "✓" if ok else "✗"
        print(f"{name:<20} {b_str} {r_str} {d_str} {marker}")
        return ok

    ok_parse = line("parse_success", sonnet.parse_success, router.parse_success,
                    pct=True, threshold=threshold_pt / 100)
    ok_schema = line("schema_valid", sonnet.schema_valid, router.schema_valid,
                     pct=True, threshold=threshold_pt / 100)
    ok_entity = line("entity_preserved", sonnet.entity_preserved, router.entity_preserved,
                     pct=True, threshold=threshold_pt / 100)
    ok_eval = line("plan_eval", sonnet.plan_eval, router.plan_eval,
                   pct=False, threshold=plan_eval_threshold)

    print()
    print(f"avg cost/call: Sonnet ${sonnet.avg_cost_usd:.5f} "
          f"Haiku ${haiku.avg_cost_usd:.5f} Router ${router.avg_cost_usd:.5f}")
    # Captured saving is the share of the Sonnet→Haiku cost gap the router
    # realizes. It stays 0.0 when Haiku is not cheaper than Sonnet, which
    # also avoids dividing by zero.
    captured = 0.0
    if sonnet.avg_cost_usd > haiku.avg_cost_usd:
        captured = (
            (sonnet.avg_cost_usd - router.avg_cost_usd)
            / (sonnet.avg_cost_usd - haiku.avg_cost_usd)
        )
    print(f"captured-saving (Router) vs theoretical max (Haiku-only): "
          f"{captured * 100:.1f}%")
    ok_capture = captured >= captured_saving_threshold - tolerance

    # Count how many goals were routed to each model label.
    counts: dict[str, int] = {}
    for v in chosen_models.values():
        counts[v] = counts.get(v, 0) + 1
    print(f"router routing breakdown: {counts}")

    all_ok = ok_parse and ok_schema and ok_entity and ok_eval and ok_capture
    verdict = ("PASS — Router meets AD-4 quality + captured-saving thresholds"
               if all_ok
               else "FAIL — Router violates at least one AD-4 acceptance bar")
    print(f"\nRouter verdict: {verdict}")
    return all_ok


async def _main(args: argparse.Namespace) -> int:
settings = Settings()
if not settings.has_anthropic:
Expand All @@ -289,61 +378,135 @@ async def _main(args: argparse.Namespace) -> int:

evaluator = LLMPlanEvaluator(gateway, models=[JUDGE])

print("=== LLMPlanner quality A/B: Sonnet 4.6 vs Haiku 4.5 ===")
arms = (SONNET, HAIKU, ROUTER) if args.router else (SONNET, HAIKU)
title = ("Sonnet 4.6 vs Haiku 4.5 vs Router"
if args.router
else "Sonnet 4.6 vs Haiku 4.5")
print(f"=== LLMPlanner quality A/B: {title} ===")
print(f"goals: {len(GOALS)} trials/model: {args.trials} judge: {JUDGE}")
print(f"cost cap: ${args.cost_cap_usd:.2f}\n")

chosen_models: dict[str, str] = {}
classifier_cost_total = 0.0
if args.router:
classifier = LLMGoalClassifier(gateway=gateway)
metrics = RouterMetrics()
bus = RouterObservingEventBus(inner=InMemoryEventBus(), metrics=metrics)
router = PlannerModelRouter(
classifier=classifier,
event_bus=bus,
enabled=True,
haiku_confidence_threshold=0.7,
classifier_timeout_ms=5_000,
)
print(" [router] classifying 10 goals...", flush=True)
verdicts = await _classify_goals(
classifier=classifier, router=router, goals=GOALS
)
for g, (m, c) in verdicts.items():
chosen_models[g] = m.value
classifier_cost_total += c
print(f" [router] classifier cost: ${classifier_cost_total:.5f} "
f"breakdown: {chosen_models}\n", flush=True)

rows: list[TrialResult] = []
for model in (SONNET, HAIKU):
planner = LLMPlanner(gateway, candidates_per_node=3, max_depth=3, model=model)
for goal in GOALS:
for trial in range(1, args.trials + 1):
running = sum(r.cost_usd for r in cost_repo.records)
if running > args.cost_cap_usd:
print(f"\n!! cost cap ${args.cost_cap_usd:.2f} exceeded "
f"(spent ${running:.4f}) — aborting", file=sys.stderr)
_print_detail(rows)
return 2
print(f" {model} | trial {trial} | {goal[:60]}", flush=True)
row = await _run_one(
planner=planner,
evaluator=evaluator,
cost_repo=cost_repo,
goal=goal,
model=model,
trial=trial,
for arm in arms:
if arm == ROUTER:
for goal in GOALS:
pm, cls_cost = verdicts[goal]
planner_model = _PLANNER_MODEL_TO_GATEWAY[pm]
planner = LLMPlanner(
gateway, candidates_per_node=3, max_depth=3, model=planner_model
)
rows.append(row)
for trial in range(1, args.trials + 1):
running = sum(r.cost_usd for r in cost_repo.records)
if running > args.cost_cap_usd:
print(f"\n!! cost cap ${args.cost_cap_usd:.2f} exceeded "
f"(spent ${running:.4f}) — aborting", file=sys.stderr)
_print_detail(rows)
return 2
print(f" router→{pm.value} | trial {trial} | {goal[:50]}",
flush=True)
row = await _run_one(
planner=planner,
evaluator=evaluator,
cost_repo=cost_repo,
goal=goal,
model=ROUTER,
trial=trial,
)
row.chosen_model = pm.value
row.classifier_cost_usd = cls_cost
# Roll the per-goal classifier overhead into the router cost.
row.cost_usd = round(row.cost_usd + cls_cost, 6)
rows.append(row)
else:
planner = LLMPlanner(gateway, candidates_per_node=3, max_depth=3, model=arm)
for goal in GOALS:
for trial in range(1, args.trials + 1):
running = sum(r.cost_usd for r in cost_repo.records)
if running > args.cost_cap_usd:
print(f"\n!! cost cap ${args.cost_cap_usd:.2f} exceeded "
f"(spent ${running:.4f}) — aborting", file=sys.stderr)
_print_detail(rows)
return 2
print(f" {arm} | trial {trial} | {goal[:60]}", flush=True)
row = await _run_one(
planner=planner,
evaluator=evaluator,
cost_repo=cost_repo,
goal=goal,
model=arm,
trial=trial,
)
rows.append(row)

_print_detail(rows)

sonnet_sum = _summarize(rows, SONNET)
haiku_sum = _summarize(rows, HAIKU)
passed = _print_summary(sonnet_sum, haiku_sum, args.threshold_pt)

router_passed = True
if args.router:
router_sum = _summarize(rows, ROUTER)
router_passed = _print_router_summary(
sonnet=sonnet_sum,
haiku=haiku_sum,
router=router_sum,
threshold_pt=args.threshold_pt,
plan_eval_threshold=args.plan_eval_threshold,
captured_saving_threshold=args.captured_saving_threshold,
chosen_models=chosen_models,
)

total_cost = sum(r.cost_usd for r in cost_repo.records)
print(f"\nTotal benchmark cost: ${total_cost:.4f} ({len(cost_repo.records)} LLM calls)")
if args.router:
print(f" (router classifier overhead: ${classifier_cost_total:.5f})")

if args.dump:
dump_payload: dict[str, object] = {
"judge": JUDGE,
"trials": args.trials,
"router_mode": args.router,
"rows": [r.__dict__ for r in rows],
"summary": {
"sonnet": sonnet_sum.__dict__,
"haiku": haiku_sum.__dict__,
},
"total_cost_usd": round(total_cost, 6),
}
if args.router:
dump_payload["summary"]["router"] = _summarize(rows, ROUTER).__dict__ # type: ignore[index]
dump_payload["router_chosen_models"] = chosen_models
dump_payload["router_classifier_cost_usd"] = round(classifier_cost_total, 6)
Path(args.dump).write_text(
json.dumps(
{
"judge": JUDGE,
"trials": args.trials,
"rows": [r.__dict__ for r in rows],
"summary": {
"sonnet": sonnet_sum.__dict__,
"haiku": haiku_sum.__dict__,
},
"total_cost_usd": round(total_cost, 6),
},
indent=2,
ensure_ascii=False,
)
json.dumps(dump_payload, indent=2, ensure_ascii=False)
)
Comment on lines 504 to 506

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

Add deterministic key ordering to benchmark JSON dumps.

At Lines 504-506, serialization omits sort_keys=True, which makes dump ordering unstable across runs and creates avoidable diff noise.

Proposed patch
         Path(args.dump).write_text(
-            json.dumps(dump_payload, indent=2, ensure_ascii=False)
+            json.dumps(dump_payload, indent=2, ensure_ascii=False, sort_keys=True)
         )

As per coding guidelines, JSON/YAML serialization must use sort_keys=True for deterministic output.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@benchmarks/planner_quality_ab.py` around lines 504 - 506, The JSON dump call
using Path(args.dump).write_text(json.dumps(dump_payload, indent=2,
ensure_ascii=False)) produces non-deterministic key ordering; update the call to
include sort_keys=True so json.dumps(dump_payload, indent=2, ensure_ascii=False,
sort_keys=True) to produce deterministic, stable output for dump_payload and
reduce diff noise.

print(f"Raw results dumped to {args.dump}")

return 0 if passed else 1
return 0 if (passed and router_passed) else 1


def _parse() -> argparse.Namespace:
Expand All @@ -356,6 +519,12 @@ def _parse() -> argparse.Namespace:
help="Pass if Haiku is within this many points of Sonnet on every axis.")
p.add_argument("--dump", type=str, default=None,
help="Optional path to dump raw JSON results.")
p.add_argument("--router", action="store_true",
help="Enable router-gated 3rd arm (AD-4 per-goal routing).")
p.add_argument("--plan-eval-threshold", type=float, default=0.030,
help="Router arm passes plan_eval if Δ >= -this (default 0.030).")
p.add_argument("--captured-saving-threshold", type=float, default=0.30,
help="Router arm passes captured-saving if >= this (default 0.30).")
return p.parse_args()


Expand Down
6 changes: 6 additions & 0 deletions docs/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# CLAUDE.md Changelog

## Unreleased

- **[FEAT/TD-195]** Goal Classifier Router for planner model selection — `domain/ports/goal_classifier.py` (ABC) + `infrastructure/routing/{llm,local}_goal_classifier.py` (LLM + Ollama impls) + `domain/services/planner_model_router.py`. Per-goal routing of `LLMPlanner` between Sonnet 4.6 and Haiku 4.5, gated by confidence threshold and `MORPHIC_PLANNER_ROUTER` flag (default disabled). Targets the −11.4pt entity_preserved regression from `haiku_planner_ab_2026_05_19` while retaining ≥30% of Haiku's 47.6% cost saving on the eligible slice. See `specs/goal-classifier-router/`.

---

## v0.6.1 → v0.6.2 (2026-05-15) — **Council Pilot full merge + TD-189 per-task cache_hit_rate + TD-192 fractal-entry latency cut + Haiku 4.5 threshold pinned**

- **[PERF/TD-192]** `OutputRequirementClassifier.classify()` を `FractalBypassClassifier.should_bypass()` 内に折り畳み、fractal-entry の LLM 呼出を **2 → 1** に削減。`BypassDecision` を `(bypass, complexity, output_requirement, reason)` に拡張、`FractalTaskEngine` 側の二重呼出を撤廃。Round 22 live regression (`test_round22_td192_latency.py`, real qwen3:8b) で実測: 2 ゴール × 1 call = 2 total (baseline 4)、artifact ゴール 7.80s, text ゴール 1.08s。TD-191 architectural guard は完全維持
Expand Down
42 changes: 40 additions & 2 deletions docs/CONTINUATION.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,45 @@
# Morphic-Agent — Continuation State

> Last updated: 2026-04-13
> Last commit: `fix: hard time-based timeout for fractal engine + Round 19 E2E verification (TD-181)`
> Last updated: 2026-05-20
> Last commit: `feat(router): Goal Classifier Router for planner model selection (TD-195)`
> Branch: `feature/goal-classifier-router` (HEAD `e49499c`)

---

## What Was Just Done (2026-05-20)

### Sprint 91 (TD-195) — Goal Classifier Router

**TD-195: Per-goal routing of `LLMPlanner` between Sonnet 4.6 and Haiku 4.5**

Spec-driven (`specs/goal-classifier-router/{spec,plan,tasks}.md`), full
TDD on `feature/goal-classifier-router`. Implements:

- `GoalClassifierPort` (domain ABC) + `GoalClassification` VO + AD-3
6-bucket `ReasonCategory` Literal.
- `PlannerModelRouter.select_for(goal) → (PlannerModel, GoalClassification | None)`
— confidence-gated, fail-safe to Sonnet on timeout / parse error.
- `LLMGoalClassifier` (Haiku 4.5 via LiteLLM) + `LocalGoalClassifier`
(qwen3:8b via Ollama) — share byte-identical `SYSTEM_PROMPT` per TD-190.
- `EventBusPort` + `InMemoryEventBus` + `RouterObservingEventBus`
decorator (metrics + structured logs). `sha256(goal)[:16]` only —
raw goal **never** serialized.
- `MORPHIC_PLANNER_ROUTER` env flag (default `disabled`, opt-in
`remote` / `local`).
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated

**Live A/B verdict** (3 arms × 10 goals × 3 trials, $0.97 total):

- entity_preserved: 83.8% (Sonnet) → 81.3% (Router) = **−2.5pt** (≤5pt ✓)
- plan_eval: 0.898 → 0.884 = **−0.014** (≤0.030 ✓)
- avg cost / call: $0.01351 → $0.01218 = **−9.85%**
- Routing: 4/10 Haiku, 6/10 Sonnet (entity-stressed benchmark)
- Captured-saving: 20.9% (misses the 30% target bar — a structural effect of this benchmark's workload mix)

Memo: `memory/planner_router_ab_2026_05_20.md`. Ship recommendation
documented.

### Sprint 90 (TD-194) — Council Pilot full merge
(See `docs/CHANGELOG.md` for the v0.6.1 → v0.6.2 detail.)

---

Expand Down
5 changes: 5 additions & 0 deletions docs/ENV_VARS.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ LAEE_BROWSER_HEADLESS=true
LAEE_GUI_ENABLED=true
LAEE_CRON_ENABLED=true

# ── Planner Model Router (v0.6.3, TD-195) ──
MORPHIC_PLANNER_ROUTER=disabled # disabled | enabled (enabled時はANTHROPIC_API_KEY有無でremote/localを自動選択)
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated
MORPHIC_PLANNER_ROUTER_HAIKU_CONFIDENCE_THRESHOLD=0.7 # 0.0–1.0; Haikuを選ぶ最小信頼度
MORPHIC_PLANNER_ROUTER_CLASSIFIER_TIMEOUT_MS=1500 # classifier hard timeout (ms) → Sonnet fallback

# ── Morphic Settings ──
MORPHIC_ENV=development
AUTO_TOOL_INSTALL=false # true: 自動, false: 承認制
Expand Down
Loading
Loading