diff --git a/.gitignore b/.gitignore index ffbbe1f..c22ff5b 100644 --- a/.gitignore +++ b/.gitignore @@ -51,3 +51,5 @@ plugin/scripts/ benchmarks/datasets/longmemeval/longmemeval_s benchmarks/datasets/longmemeval/longmemeval_oracle benchmarks/datasets/longmemeval/.cache/ +.venv-golden/ +.coverage diff --git a/CLAUDE.md b/CLAUDE.md index 16c9791..db66a4d 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -87,10 +87,10 @@ CTX = **Claude Code의 자동 context 주입 시스템**. - `docs/research/CTX_NEMOTRON_COMPARISON_REPORT.docx` ### Phase 3: CTX 약점 분석 + 대안 조사 (expert-research) -**CTX 3대 약점**: -1. 외부 코드베이스 R@5=0.152 (heuristic 과적합) -2. keyword 쿼리 R@3=0.379 < BM25=0.667 -3. 교차 파일 추론 불가 (multi-hop) +**CTX 3대 약점** (2026-03-27 시점 진단): +1. ~~외부 코드베이스 R@5=0.152 (heuristic 과적합)~~ — **갱신: iter11 재측정 (`benchmarks/results/reeval_external_iter11.json`) 결과 Mean R@5=0.595** (Flask 0.6462 / FastAPI 0.3870 / Requests 0.7526). 0.152 는 pre-fix baseline 으로 stale. +2. keyword 쿼리 R@3=0.379 < BM25=0.667 — Phase 5 에서 **0.724 달성** (해소) +3. 교차 파일 추론 불가 (multi-hop) — 잔존 약점 **즉시 실행 가능 개선**: TF-IDF → BM25 교체 (ROI 최고) - 결과 문서: `FromScratch/docs/research/20260327-ctx-alternatives-research.md` @@ -194,7 +194,7 @@ def rank_ctx_doc(query, docs, bm25_index=None): 3. **G2 real codebase Δ+0.200 개선**: instruction parsing → CTX query 변환 레이어 추가 ### 중기 (1-2주) -3. **외부 코드베이스 R@5=0.152 개선**: AST 파서 기반 심볼 추출 (heuristic 제거) +3. **외부 코드베이스 R@5 추가 개선**: 현재 Mean R@5=0.595 (iter11), FastAPI 0.387 가 최약점. AST 파서 기반 심볼 추출 (heuristic 제거) 검토 - `src/retrieval/adaptive_trigger.py`의 `_index_symbols()` 개선 4. 
**교차 파일 추론**: Import graph BFS 확장 (현재 2-hop 한계) diff --git a/LICENSE b/LICENSE index ecaaf77..0e5afec 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,7 @@ MIT License -Copyright (c) 2026 jaytoone +Copyright (c) 2026 hang-in (tunaCtx fork — production-level refactor) +Copyright (c) 2026 jaytoone (original CTX — https://github.com/jaytoone/CTX) Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 81db1ce..37e8c8e 100644 --- a/README.md +++ b/README.md @@ -1,391 +1,208 @@ -# CTX: Trigger-Driven Dynamic Context Loading for Code-Aware LLM Agents - -[![PyPI version](https://img.shields.io/pypi/v/ctx-retriever)](https://pypi.org/project/ctx-retriever/) -[![PyPI downloads](https://img.shields.io/pypi/dm/ctx-retriever)](https://pypi.org/project/ctx-retriever/) -[![Python](https://img.shields.io/badge/python-3.9%2B-blue)](https://pypi.org/project/ctx-retriever/) -[![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE) -[![HuggingFace Demo](https://img.shields.io/badge/HuggingFace-Demo-orange)](https://huggingface.co/spaces/Be2Jay/ctx-demo) -[![Publish to PyPI](https://github.com/jaytoone/CTX/actions/workflows/publish.yml/badge.svg)](https://github.com/jaytoone/CTX/actions/workflows/publish.yml) - -![CTX Knowledge Graph — decisions, docs, and prompts in real time](docs/media/ctx-cover-graph.png) - -CTX classifies developer queries into four trigger types and routes each to a specialized retrieval pipeline. For dependency-sensitive queries, CTX traverses the codebase import graph to resolve transitive relationships that keyword and embedding methods miss. It achieves **1.9x higher Token-Efficiency Score** than BM25 while using only **5.2% of tokens**, and **outperforms BM25 on held-out external codebases** (Flask, FastAPI, Requests — mean R@5 +0.163). 
- -> **Key insight**: code import graphs encode structural dependency information that text-based RAG cannot capture. CTX achieves Recall@5 = 1.0 on implicit dependency queries vs 0.4 for BM25. - -**[▶ Dashboard demo (39s)](https://drive.google.com/file/d/1b4ZvbRYkXKTepKDx8N7gLfim-zLDiCGo/view?usp=sharing)** - -## Install - -```bash -pip install ctx-retriever -``` - -Or from source: - -```bash -git clone https://github.com/jaytoone/CTX -cd CTX -pip install -e . -``` - -## Quick Start - -```python -from ctx_retriever.retrieval.adaptive_trigger import AdaptiveTriggerRetriever - -# Point at any codebase directory -retriever = AdaptiveTriggerRetriever("/path/to/your/project") - -# Retrieve relevant files for any natural-language query -result = retriever.retrieve( - query_id="my_query", - query_text="how does authentication work?", - k=5 -) - -for filepath in result.retrieved_files: - print(filepath, result.scores[filepath]) -``` - -## Claude Code Hook (Recommended) - -CTX runs as a set of Claude Code hooks that inject relevant past decisions, docs, and code into every prompt. Install is one command: - -```bash -pip install ctx-retriever -ctx-install # register CTX hooks in ~/.claude/settings.json -``` - -**That's it.** Restart Claude Code and hooks fire on every prompt. - -### Optional: enable cross-encoder reranking (BGE) - -By default, CTX uses BM25 + vec-daemon (multilingual-e5-small, ~120MB) for semantic search. -For higher-quality reranking, enable the BGE cross-encoder (BAAI/bge-reranker-v2-m3, ~2GB): - -```bash -# Add to ~/.claude/settings.json env block: -CTX_BGE_ENABLE=1 -``` - -When enabled, bge-daemon starts automatically on session open and reranks retrieved results. -**Not recommended for machines with less than 4GB RAM or slow internet** (model downloads on first run). - -### What ctx-install does (atomic, backup-first) - -1. Verifies the 4 CTX hook files exist at `~/.claude/hooks/` (chat-memory, bm25-memory, memory-keyword-trigger, g2-fallback) -2. 
Reads `~/.claude/settings.json`, takes a timestamped backup (`settings.json.bak.`) -3. Merges the CTX hook registrations into the existing `hooks` dict **without overwriting your other hooks** (dedupes by command string — safe to re-run) -4. Atomically writes the new settings.json (temp-file-then-rename — never leaves partial state on disk) -5. Smoke-tests by firing `bm25-memory.py` once with a dummy prompt and confirming `last-injection.json` gets written - -### Other subcommands - -```bash -ctx-install --dry-run # show what would change, touch nothing -ctx-install status # verify hook file presence + settings.json registration + last fire -ctx-install --uninstall # remove CTX hook registrations (hook files left in place) -``` - -### Manual install (legacy — only needed if `ctx-install` fails) - -```bash -# 1. Copy hook files to ~/.claude/hooks/ -# 2. Register each in ~/.claude/settings.json under the appropriate event key -``` - -Example settings block (what ctx-install writes for you): - -```json -{ - "hooks": { - "UserPromptSubmit": [ - { "hooks": [{ "type": "command", "command": "python3 $HOME/.claude/hooks/chat-memory.py" }] }, - { "hooks": [{ "type": "command", "command": "python3 $HOME/.claude/hooks/bm25-memory.py --rich" }] }, - { "hooks": [{ "type": "command", "command": "python3 $HOME/.claude/hooks/memory-keyword-trigger.py" }] } - ], - "PostToolUse": [ - { "matcher": "Grep", - "hooks": [{ "type": "command", "command": "python3 $HOME/.claude/hooks/g2-fallback.py" }] } - ] - } -} -``` - -**What you get in each prompt:** -``` -[CTX] Trigger: EXPLICIT_SYMBOL | Query: AuthService | Confidence: 0.70 | Intent: judge from prompt -Code files (3/847 total): -• src/auth/service.py [score=1.000] -• src/auth/middleware.py [score=0.823] -• tests/test_auth.py [score=0.741] -(Use the prompt intent to decide how to treat this context.) 
-``` - -## Validate on your own transcripts - -Before installing, you can measure what CTX *would* give you on your own Claude Code transcripts — no install, no signup, no upload: - -```bash -python3 benchmarks/ctx_validate.py --days 7 -``` - -stdlib-only; reads `~/.claude/projects/*/.jsonl` locally and emits a Wilson-95-CI markdown report: - -``` -- Text match rate: 26.9% [23.2%, 31.1%] ±4.0pp (n=201) -- Tool-use match: 11.1% [8.6%, 14.2%] ±2.8pp -- Union (either): 32.8% [28.7%, 37.1%] ±4.2pp -Per response-type: - prose: 51.2% ±10.3pp (n=86) - tool_heavy: 26.2% ±8.2pp (n=107) - mixed: 25.0% ±26.0pp (n=8) -``` - -**What this measures** — distinctive terms from each user prompt, substring-matched against the assistant's response text AND tool_use parameters (file_path/command/pattern). On turns where CTX's hooks would surface related context, this rate approximates the *ceiling* of plausible utility. It is NOT a direct CTX measurement — install CTX and compare against live `utility_measured` telemetry for the actual delta. Use it to decide "is this signal worth pursuing?" before committing to install. - -Live dashboard (after install): - -![CTX Telemetry Dashboard](docs/media/ctx-cover.png) - -The dashboard visualizes utility in four stacked views — pooled rate with 95% CI, per-block breakdown (g1/g2_docs/g2_prefetch), by response type (prose/mixed/tool_heavy), and by item age (0-7d / 7-30d / 30d+). The knowledge graph below it lights up decisions in coral when Claude actually used them in the last 7 days; dead-weight decisions (no recent references) appear muted — pruning candidates. 
- -## Hook Performance - -CTX adds no LLM calls — latency is purely algorithmic (BM25 + BFS indexing): - -| Project | Language | Files | Hook Latency | -|---------|----------|-------|-------------| -| Small project | Python | ~88 | ~40ms | -| Medium project | Python | ~215 | ~165ms | -| Large project | TypeScript | ~651 | ~270ms | -| Very large | any | >2000 | skipped (auto-excluded) | - -The hook is skipped for prompts <15 chars, slash commands, `[noctx]` tags, and codebases with <3 files. - -**Control tags** you can add to any prompt: - -| Tag | Effect | -|-----|--------| -| `[noctx]` | Disable CTX for this prompt | -| `[fix]` | Fix/Replace mode — adds anti-anchoring reminder so Claude doesn't copy the existing (potentially wrong) implementation | - -`[fix]` is also auto-triggered when the prompt starts with `fix:`, `bug:`, `refactor:`, or `replace:`. - -## Trigger Types - -| Trigger | When Used | Mechanism | -|---------|-----------|-----------| -| `EXPLICIT_SYMBOL` | Query names a class/function | Symbol index lookup | -| `SEMANTIC_CONCEPT` | Query describes a concept | BM25 keyword scoring | -| `IMPLICIT_CONTEXT` | Dependency queries ("what uses X") | BFS import graph traversal | -| `TEMPORAL_HISTORY` | Recent changes / history | Session file tracker | - -## Results - -### Synthetic Benchmark (50 files, 166 queries) - -| Strategy | Recall@5 | Token Usage | TES | -|----------|----------|-------------|-----| -| Full Context | 0.075 | 100.0% | 0.019 | -| BM25 | 0.982 | 18.7% | 0.410 | -| Dense TF-IDF | 0.973 | 21.0% | 0.406 | -| GraphRAG-lite | 0.523 | 24.0% | 0.218 | -| LlamaIndex | 0.972 | 20.1% | 0.405 | -| Chroma Dense | 0.829 | 19.3% | 0.346 | -| Hybrid Dense+CTX | 0.725 | 23.6% | 0.303 | -| **CTX (Ours)** | **0.874** | **5.2%** | **0.776** | - -**TES** = Recall@5 / ln(1 + files_loaded). Higher = better token efficiency. 
- -### External Codebase Benchmark (Flask, FastAPI, Requests) - -CTX outperforms BM25 on all three held-out external codebases in code-to-code structural retrieval: - -| Codebase | Files | CTX R@5 | BM25 R@5 | Δ | -|----------|-------|---------|----------|---| -| Flask | 79 | **0.545** | 0.347 | **+0.198** | -| FastAPI | 928 | **0.328** | 0.174 | **+0.154** | -| Requests | 35 | **0.626** | 0.489 | **+0.137** | -| **Mean** | — | **0.500** | 0.337 | **+0.163** | - -*Bootstrap 95% CI: external mean [0.441, 0.550]* - -### COIR External Benchmark (CodeSearchNet Python) - -| Strategy | Recall@1 | Recall@5 | MRR | -|----------|----------|----------|-----| -| Dense Embedding (MiniLM) | 0.960 | 1.000 | 0.978 | -| Hybrid Dense+CTX | 0.930 | 0.950 | 0.940 | -| BM25 | 0.920 | 0.980 | 0.946 | -| CTX Adaptive Trigger | 0.720 | 0.740 | 0.728 | - -### Downstream LLM Evaluation - -CTX context injected into developer prompts improves LLM task quality across two models: - -| Scenario | WITH CTX | WITHOUT CTX | Δ | -|----------|----------|-------------|---| -| G1 (session memory recall) | 1.000 | 0.110 | **+0.890** | -| G2 (CTX-specific knowledge) | 0.688 | 0.000 | **+0.688** | - -G1: CTX persistent memory enables perfect cross-session recall (vs 11% without). G2: CTX context eliminates hallucination on CTX-specific API queries. 
- -### Key Findings - -- CTX achieves **1.9x higher TES** than BM25 with only 5.2% token usage -- CTX achieves **perfect Recall@5 (1.0)** on IMPLICIT_CONTEXT dependency queries -- CTX **outperforms BM25 on all 3 external codebases** in code-to-code retrieval (mean +0.163 R@5) -- CTX context improves downstream LLM task quality: **G1 +0.890**, **G2 +0.688** -- Trigger classifier achieves **100% accuracy** (all 4 types F1=1.00) on synthetic benchmark -- CTX Adaptive Trigger achieves **R@5=0.740 on COIR** (improved from 0.380 via BM25 hybrid + CamelCase fix) -- Hybrid Dense+CTX achieves R@5=0.950 on COIR — best of both worlds -- No single strategy dominates all dimensions — workload determines optimal choice - -## When to Use CTX - -**CTX excels when:** -- You need dependency-aware retrieval: `IMPLICIT_CONTEXT` queries (e.g., "what uses AuthService?") achieve perfect Recall@5 (1.0) via BFS import graph traversal -- Working with a **known codebase** with established symbol/import structure — code-to-code retrieval outperforms BM25 on real projects (Flask: +0.198, FastAPI: +0.154, Requests: +0.137) -- Token budget is critical — CTX uses only **5.2% of tokens** vs 18.7% for BM25 (TES: 1.9x higher) -- Queries name **explicit symbols** (class names, function names) — EXPLICIT_SYMBOL trigger routes directly to symbol index - -**CTX is not designed for:** -- **Text-to-code semantic search** (COIR-style): finding code from natural-language descriptions. 
CTX R@5=0.740 vs BM25=0.980 on CodeSearchNet Python — still a gap; for best results use Dense Embedding or Hybrid Dense+CTX instead -- **Large unseen codebases** (>500 files, no prior indexing): heuristic symbol extraction degrades at scale; consider AST-based indexers -- **Natural-language concept queries** without code keywords: SEMANTIC_CONCEPT trigger falls back to BM25, losing CTX's structural advantage - -## Running Experiments - -```bash -# Synthetic benchmark -python run_experiment.py --dataset-size small --strategy all - -# Real codebase -python run_experiment.py --dataset-source real --project-path /path/to/project --strategy all - -# COIR external benchmark -python run_coir_eval.py --n-queries 100 - -# Ablation study -python run_experiment.py --dataset-size small --mode ablation -``` - -Results are written to `benchmarks/results/`. - -## Project Structure - -``` -CTX/ - src/ - retrieval/ # Retrieval strategies (8 total) - adaptive_trigger.py # CTX core: trigger-driven retrieval - hybrid_dense_ctx.py # Hybrid: dense seed + graph expansion - bm25_retriever.py # BM25 sparse retrieval - dense_retriever.py # TF-IDF dense retrieval - chroma_retriever.py # ChromaDB + sentence-transformers - graph_rag.py # GraphRAG-lite baseline - llamaindex_retriever.py # LlamaIndex AST-aware chunking - full_context.py # Full context baseline - trigger/ # Trigger classifier (4 types) - evaluator/ # Benchmark runner, metrics, COIR - data/ # Dataset generation, real codebase loader - hooks/ - ctx_real_loader.py # Claude Code UserPromptSubmit hook - ctx_session_tracker.py # PostToolUse session tracker - benchmarks/ - results/ # Experiment results and reports - docs/ - claude_code_integration.md # Claude Code setup guide - paper/ # Paper draft (markdown + LaTeX) -``` - -## Telemetry (opt-in, local-only) - -CTX can log retrieval quality metrics locally to help you understand how well the context injection is working. 
- -**Opt in:** -```bash -export CTX_TELEMETRY=1 # enable for this shell -# or: touch ~/.claude/ctx-telemetry.enabled # persist across shells -``` - -**View your data:** -```bash -ctx-telemetry # summary + flywheel health verdict (causal r, upgrade hint) -ctx-telemetry last # last 10 session turns -ctx-telemetry calibrate # citation bias + causal r-analysis (v1.5) -ctx-telemetry tune # compute auto-tune params → ctx-auto-tune.json -ctx-telemetry cluster [-p DIR] # detect tech stack → project_type_hint in ctx-auto-tune.json -ctx-telemetry consent # Stage 2 upload consent status -ctx-telemetry upload # Stage 2 dry-run preview -ctx-telemetry clear # delete all local telemetry logs -``` - -Sample `ctx-telemetry` output: -``` -CTX Retrieval Telemetry — 42 session-turn records (schema v1.6) -... -Flywheel health [n=42]: causal-r=+0.35 | upgrade=✓ HYBRID | kw=43% -``` - -**Auto-tune (flywheel):** After `ctx-telemetry tune` runs with ≥15 records, CTX automatically adjusts retrieval parameters based on your usage patterns (e.g., top_k reduction for query types with lower citation rates). The active tuning state is shown in CTX's context header: `> **CTX auto-tune** [n=42, hybrid✓]`. - -With ≥10 v1.5 records, `tune` also computes a causal signal: Pearson r between BM25 top retrieval score and citation rate. High r (>0.30) means quality-driven citations — HYBRID upgrade is worthwhile. Low r (<0.10) suggests position bias may be dominant — validate before upgrading. This is stored as `hybrid_upgrade_hint` in `ctx-auto-tune.json`. - -**Project cluster detection (Stage 3 prerequisite):** `ctx-telemetry cluster` scans your project's source files, matches term frequencies against tech-stack signature profiles (python_ml, python_backend, nextjs_react, rust_systems, go_backend), and writes `project_type_hint` to `ctx-auto-tune.json`. This is a local-first proxy for the Stage 3 `project_type_id` cluster — enabling cold-start pre-warming without requiring cross-user data. 
Example output: -``` -python_ml ██████████████████████████████ 80.0% (18 keywords matched) -python_backend ███████ 19.0% (13 keywords matched) -Project type: python_ml (confidence: HIGH) -``` - -### What is collected (schema v1.6) - -All data stays on your machine at `~/.claude/ctx-retrieval-events.jsonl`. Nothing is uploaded. - -| Field | Type | Description | -|-------|------|-------------| -| `user_id` | string(16) | SHA256(machine-id + install-month)[:16] — anonymous, changes on reinstall | -| `session_id_hash` | string(16) | SHA256(session_id)[:16] — non-reversible | -| `ts_unix_hour` | int | Unix timestamp truncated to hour | -| `hook_source` | enum | G1 / G2_DOCS / G2_CODE / CM | -| `query_type` | enum | KEYWORD / SEMANTIC / TEMPORAL | -| `retrieval_method` | enum | HYBRID / BM25 / UNKNOWN | -| `candidates_returned` | int | Number of candidates before ranking | -| `total_injected` | int | Items injected into context | -| `total_cited` | int | Items referenced by the AI response | -| `utility_rate` | float | cited / injected — retrieval precision proxy | -| `session_turn_index` | int | Turn index within the current session | -| `vec_daemon_up` | bool | Whether semantic layer was active | -| `bge_daemon_up` | bool | Whether cross-encoder reranker was active | -| `duration_ms` | int | Per-block retrieval latency | -| `top_score_bm25` | float\|null | Max BM25 score — causal calibration signal (v1.5) | -| `top_score_dense` | float\|null | Max cosine similarity score (v1.5) | - -### What is NOT collected - -- ❌ No query text, response text, or code content -- ❌ No file names, commit messages, or project paths -- ❌ No email, device name, or personally identifiable information -- ❌ No network requests — Stage 1 is local-only - -### Privacy design - -- `user_id` = SHA256(machine-id + month-boundary) — not linkable to email or name; changes on reinstall -- Timestamps truncated to **hour** (not minute) -- All content stripped — only counts, rates, method names, and 
latency -- Follows [Sourcegraph's numeric-only telemetry](https://sourcegraph.com/docs/admin/telemetry) pattern - -**Stage 2 (not yet implemented):** opt-in upload of k-anonymized `session_aggregate` rows via `ctx-telemetry consent`. Rows with fewer than 5 users per (date × project_type) window are suppressed before any upload. - -## Paper - -- Paper draft: [`docs/paper/CTX_paper_draft.md`](docs/paper/CTX_paper_draft.md) -- arXiv: TBD -- EMNLP 2026 submission: TBD - -## License - -MIT +# tunaCtx + +원본 [jaytoone/CTX](https://github.com/jaytoone/CTX) 를 production-level 로 리팩토링/보강한 fork. +retrieval 알고리즘은 원본 그대로 유지. Claude Code hook 구현이 실제 사용 환경에서 안전하게 운영되도록 모듈 분해, 패키징/설치 정합성, 회귀 가드, 텔레메트리만 손봤음. + +## 어디에 어떻게 쓰는가 + +- **환경**: Claude Code (CLI / IDE / web). +- **트리거**: 사용자가 프롬프트 보낼 때마다 실행되는 `UserPromptSubmit` hook. +- **역할**: 프롬프트에 관련된 과거 의사결정(G1) + 관련 docs/코드(G2) 를 자동으로 context 에 주입. +- **검색 stack**: 단일 알고리즘이 아니라 layer 조합 — + - **G1 (시간축 기억)**: cross-session decision memory (git HEAD 키 캐시) — 세션을 넘어 과거 결정과 그 이유 회상 + - **G2-DOCS / G2-CODE (공간축 검색)**: BM25 + (옵션) BGE-reranker-v2-m3 **cross-encoder rerank** + - **chat-memory**: vault.db FTS5 + **vec0 dense (multilingual-e5-small 384-dim) 하이브리드** (α=0.5 cosine + 0.5 bm25) +- **외부 의존성**: LLM API 호출 없음. 로컬 BM25 + (옵션) vec-daemon (multilingual-e5-small) + (옵션) BGE cross-encoder. +- **외부 codebase 측정값 (참고)**: upstream 의 `benchmarks/results/reeval_external_iter11.json` 기준 Mean R@5 = **0.595** (Flask 0.6462 / FastAPI 0.3870 / Requests 0.7526). 본 fork 는 retrieval 알고리즘을 변경하지 않으므로 동일 수치 적용. upstream docs 일부에 잔존하는 `R@5=0.152` 는 pre-fix baseline 으로 stale ([upstream issue #2](https://github.com/jaytoone/CTX/issues/2)). + +### 설치 + +```bash +git clone https://github.com/hang-in/tunaCtx +cd tunaCtx +pip install -e . +ctx-install +``` + +`ctx-install` 동작: +1. `~/.claude/hooks/` 에 hook 파일 + `_bm25/` sub-package 복사 (atomic write + 타임스탬프 backup) +2. `~/.claude/settings.json` 에 hook command 등록 (기존 다른 도구 hook 보존, 멱등 — 재실행해도 중복 추가 없음) +3. 
기존 hook 파일이 있으면 hash 비교 → 다르면 자동 update + `.backup_<timestamp>.py` 생성 + +플래그: + +| 플래그 | 동작 | +|---|---| +| `--dry-run` | 실제 변경 없이 미리보기 | +| `--force-hooks` | hash 비교 없이 강제 덮어쓰기 | +| `--no-update-hooks` | 기존 파일 무조건 skip (사용자 수정 보존) | +| `--uninstall` | settings.json 의 hook 등록 제거 | +| `status` (positional) | 설치 상태 점검 | + +### 텔레메트리 (opt-in, 로컬) + +```bash +export CTX_TELEMETRY=1 # 현재 셸만 +# 또는: touch ~/.claude/ctx-telemetry.enabled # 영구 +``` + +활성화 시 `~/.claude/ctx-telemetry.jsonl` 에 retrieval event 기록 (network upload 없음). +비활성 시 zero-cost early return — orchestrator 모듈 로드 시 gate 1 회 평가 + 호출당 bool 체크 (≈ 0.01µs). + +이벤트 종류: `hook_complete`, `prompt_received`, `g1_done`, `g2_docs_done`, `g2_code_done`, `g2_hooks_done`, `hook_invoked`. 스키마 명세는 `docs/refactor/TELEMETRY_SCHEMA.md`. + +### 제어 태그 + +| 태그 | 효과 | +|---|---| +| `[noctx]` | 해당 프롬프트에서 CTX context 주입 disable | +| `[fix]` | anti-anchoring 모드 — 기존 구현을 그대로 베끼지 않도록 reminder 추가 | + +`[fix]` 는 prompt 가 `fix:` / `bug:` / `refactor:` / `replace:` 로 시작할 때도 자동 트리거. + +## 이 fork 에서 한 작업 + +원본 CTX 의 retrieval 알고리즘 변경 없음. production readiness 만 보강. + +### 모듈 분해 + +`src/hooks/bm25-memory.py` (1837 줄 단일 파일) → orchestrator 300 줄 + `src/hooks/_bm25/` 11 개 모듈: + +``` +_bm25/ + tokenizer.py # 한국어 조사 strip + Porter stemmer + stopword + rerank.py # vec-daemon bi-encoder + BGE cross-encoder + autotune.py # ctx-auto-tune.json 파라미터 reader + corpus.py # G1 decision corpus (git HEAD 키 캐시) + ranker.py # BM25 ranking primitives (canonical) + docs_search.py # G2-DOCS BM25 + hybrid + code_search.py # G2 code 파일 검색 + grep fallback + hooks_search.py # ~/.claude/hooks/*.py BM25 검색 + session.py # 세션 로컬 상태 헬퍼 + injection.py # P1 utility tracking + output.py # stdout/stderr 헤더 emit +``` + +각 모듈 ≤ 400 줄. stdin JSON 스키마, stdout 출력 포맷, 캐시 파일 경로(`.omc/decision_corpus.json`, `.omc/docs_corpus_emb.json`), 환경 변수 이름 모두 원본과 동일. 
+ +### eval ↔ production BM25 통합 + +- 단일 토크나이저: `_bm25/tokenizer.tokenize` +- 단일 BM25 ranking primitive: `_bm25/ranker.score_corpus_bm25` +- 통합된 caller: + - `src/retrieval/adaptive_trigger.py` + - `benchmarks/eval/doc_retrieval_eval_v2.py` + - `src/retrieval/bm25_retriever.py` + - `src/evaluator/coir_evaluator.py` +- archival 성격의 benchmark 스크립트(11+ 개)는 의도적으로 자체 구현 유지 — 과거 A/B 비교의 의미 보존 + +### 패키징 / 설치 정합성 + +- `_bm25/` sub-package 가 wheel 에 정상 포함되도록 `pyproject.toml` 의 `[tool.setuptools] packages` + `package-data` 수정 +- `ctx-install` 이 `_bm25/` 디렉토리도 재귀 복사하도록 변경 +- 기존 hook 파일에 대한 hash 기반 자동 update 정책 + `--force-hooks` / `--no-update-hooks` 플래그 +- `_save_atomic`: temp 파일 + `os.replace` + 타임스탬프 backup. 신규 파일 생성 시 backup 반환값 정확성 보장. + +### 안전성 보강 + +- `chat-memory.py:import sqlite_vec` 무방어 → `try/except ImportError` + graceful fallback (sqlite_vec 부재 환경에서도 hook 죽지 않음) +- `_bm25/code_search.py` G2-GREP 정렬: 동점 score 의 비결정성 제거 (`count` → `(-count, path)`) +- Telemetry path: 활성/비활성 gate 캐싱 + lazy import (비활성 시 hook latency 영향 없음) + +## Empirical eval — Context Mode 와 같이 쓸 때 (2026-05-05) + +CTX 와 [`mksglu/context-mode`](https://github.com/mksglu/context-mode) plugin 을 동시 활성화한 환경에서 5 시나리오 × 4 상태 (CTX+CM / CM only / CTX only / 둘다 off) 매트릭스 측정. `claude -p --model opus` headless 28 측정, 총 $10.58. Gemini-as-judge 로 응답 품질 비교. + +요점: +- **코드 분석 / 한국어 검색**: 둘다 활성이 1위 — CTX 의 G1/G2 retrieval + Context Mode 의 도구 압축 시너지 +- **Compaction / 결정 근거 정리**: CTX only 가 1위 — Context Mode 가 노이즈 추가 +- **Headless `claude -p` + 도구 호출 많은 작업**: Context Mode 의 `ctx_batch_execute` 가 권한 prompt 응답 불가로 abort. `--dangerously-skip-permissions` 또는 `skipDangerousModePermissionPrompt: true` 면 정상 + +상세 데이터 / 표 / 한계: [`docs/refactor/EVAL_RESULTS.md`](docs/refactor/EVAL_RESULTS.md). 한국어 블로그 포스트: [`docs/community/BLOG_POST_eval_ko.md`](docs/community/BLOG_POST_eval_ko.md). 
+ +## 테스트 + +### 회귀 가드 (deterministic hook output) + +```bash +python3 tests/golden/run_golden.py +# → 26/26 fixtures passed +``` + +26 개 픽스처는 deterministic 모드(`CTX_DISABLE_SEMANTIC_RERANK=1`, `CTX_CROSS_ENCODER=0`)로 캡처된 hook stdout 의 byte-level 비교. G2-GREP 블록은 file list drift 방지를 위해 normalize 비교 (헤더 / count / "Start with" 형식만 검증). + +카테고리 (각 카테고리는 fallback 경로 + BM25 경로 양쪽 캡처): +- keyword_single (3+3) +- korean_paraphrase (2+2) +- english_code (2+2) +- avoidance / `[noctx]` / `fix:` (2+2) +- empty / 매우 짧은 prompt (3+2) +- hooks_keyword (2+2) + +### 단위 테스트 + +```bash +.venv-golden/bin/python -m pytest tests/unit -q +# → 82 passed in <2s +``` + +| 파일 | 테스트 수 | 영역 | +|---|---|---| +| `test_settings_patcher.py` | 22 | atomic write, backup, idempotency, dry-run, unpatch, corrupted JSON | +| `test_install_cli.py` | 32 | hook 복사 / hash update / force flag / no-update flag, settings merge | +| `test_chat_memory_fallback.py` | 9 | vault.db 없음, vec-daemon down, sqlite_vec 부재, invalid stdin | +| `test_bm25_memory_cache.py` | 7 | HEAD-keyed cache invalidation, corrupted cache 복구 | +| `test_bm25_memory_telemetry.py` | 6 | 활성/비활성 latency, fallback reason capture, exception 시 graceful | +| `test_code_search_sort.py` | 6 | `(-count, path)` deterministic sort | + +커버리지: `settings_patcher.py` 93%, `install.py` 73%. + +### BM25 통합 검증 + +```bash +.venv-golden/bin/python scripts/verify_bm25_unified.py +# → ALL CHECKS PASSED +``` + +`tokenize` import / `score_corpus_bm25` 동작 / `AdaptiveTriggerRetriever` corpus build 가 모두 통합된 `_bm25/` 경로를 통과하는지 검증. 
+ +## 알려진 후속 항목 + +- `tests/golden/run_golden.py` 가 stderr 비교 미실시 — `emit_output()` 의 stderr regression 가드 없음 (다음 사이클) +- `ctx-install --uninstall` 이 hook 파일 자체와 `_bm25/` 디렉토리 cleanup 안 함 (settings.json 등록만 제거) +- `_bm25/__init__.py` 의 public re-export 문서와 실제 구현 mismatch (현재 callers 가 submodule 직접 import 라 동작 문제는 아님) + +## 디렉토리 구조 + +``` +tunaCtx/ + src/ + hooks/ + bm25-memory.py # orchestrator (300 lines) + _bm25/ # 분해된 11 개 sub-module (canonical) + chat-memory.py + memory-keyword-trigger.py + g2-fallback.py + _ctx_telemetry.py + retrieval/ # 원본 retrieval strategy (8 종) + cli/ + install.py # ctx-install + settings_patcher.py # atomic settings.json patcher + telemetry.py # ctx-telemetry + tests/ + golden/ # hook output 회귀 가드 (26 fixtures) + unit/ # 단위 테스트 (82) + benchmarks/ + eval/ + results/ + docs/ + refactor/ + PRODUCTION_REFACTOR_PLAN.md # 본 사이클 plan 문서 + TELEMETRY_SCHEMA.md # 텔레메트리 이벤트 스키마 + scripts/ + verify_bm25_unified.py # BM25 통합 sanity check +``` + +## 라이선스 + +MIT. 원본 [jaytoone/CTX](https://github.com/jaytoone/CTX) (MIT) 의 copyright 와 함께 명시. `LICENSE` 참조. 
diff --git a/benchmarks/eval/doc_retrieval_eval_v2.py b/benchmarks/eval/doc_retrieval_eval_v2.py index 5031748..b53ee34 100644 --- a/benchmarks/eval/doc_retrieval_eval_v2.py +++ b/benchmarks/eval/doc_retrieval_eval_v2.py @@ -19,8 +19,13 @@ from pathlib import Path from typing import Dict, List, Optional, Tuple +import sys +import os as _os +sys.path.insert(0, _os.path.join(_os.path.dirname(_os.path.dirname(_os.path.dirname( + _os.path.abspath(__file__)))), 'src', 'hooks')) + import numpy as np -from rank_bm25 import BM25Okapi +from _bm25.ranker import score_corpus_bm25 from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity @@ -270,11 +275,13 @@ def rank_tfidf(query: str, docs: List[DocFile], def rank_ctx_doc( query: "str | DocQuery", docs: List[DocFile], - bm25_index: "BM25Okapi | None" = None, + bm25_index=None, # unused — kept for backward compat; doc_tokens used instead doc_tokens: "List[List[str]] | None" = None, ) -> List[Tuple[str, float]]: """CTX-doc: heading match + BM25 (query_type-aware blending). + BM25 scoring via _bm25/ranker.score_corpus_bm25 (canonical single source). 
+ keyword queries: BM25 dominant (heading overlap weight halved, bm25 norm unpenalized) other queries: heading dominant (original weights) """ @@ -311,10 +318,12 @@ def rank_ctx_doc( if score > 0: scored[doc.rel_path] = score - # Stage 2: BM25 augmentation - if bm25_index is not None: + # Stage 2: BM25 augmentation via _bm25/ranker.score_corpus_bm25 (canonical) + if doc_tokens is not None: q_tokens = re.findall(r'\b[a-z]{2,}\b', query_lower) - bm25_scores = bm25_index.get_scores(q_tokens) + bm25_scores = score_corpus_bm25(doc_tokens, q_tokens) + if bm25_scores is None: + bm25_scores = np.zeros(len(docs)) max_bm25 = float(np.max(bm25_scores)) if bm25_scores.max() > 0 else 1.0 for i, bm25_s in enumerate(bm25_scores): fpath = docs[i].rel_path @@ -445,9 +454,9 @@ def main() -> None: ) tfidf_matrix = vectorizer.fit_transform([d.content for d in docs]) - # Build BM25 index for CTX-doc augmentation (enriched: stem+heading for heading queries) + # Build enriched token lists for CTX-doc BM25 augmentation (stem+heading for heading queries) + # score_corpus_bm25 (_bm25/ranker.py) is the single canonical BM25 primitive — no BM25Okapi here doc_token_lists_enriched = [_doc_tokens_with_stem(d) for d in docs] - bm25_idx = BM25Okapi(doc_token_lists_enriched) print("Running evaluations...") @@ -455,12 +464,12 @@ def main() -> None: # Strategy 1: CTX-doc (query_type-aware routing) # keyword queries: TF-only BM25 (rank_bm25) — matches/beats 0.724 baseline - # heading queries: heading match + BM25Okapi augmentation (rank_ctx_doc) + # heading queries: heading match + score_corpus_bm25 augmentation (rank_ctx_doc) ctx_result = evaluate_strategy( "CTX-doc (heading+BM25)", valid_queries, lambda q: (rank_bm25(q.text, docs) if q.query_type == "keyword" - else rank_ctx_doc(q, docs, bm25_index=bm25_idx)), + else rank_ctx_doc(q, docs, doc_tokens=doc_token_lists_enriched)), ) results.append(ctx_result) diff --git a/benchmarks/eval/g1_docs_bm25_eval.py b/benchmarks/eval/g1_docs_bm25_eval.py index 
aae7605..51e0366 100644 --- a/benchmarks/eval/g1_docs_bm25_eval.py +++ b/benchmarks/eval/g1_docs_bm25_eval.py @@ -22,6 +22,9 @@ import anthropic from rank_bm25 import BM25Okapi +sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "src" / "hooks")) +from _bm25.tokenizer import tokenize # noqa: E402 canonical (PR-1) + # ────────────────────────────────────────────────────────────────────────────── # QA Pairs (same as g1_docs_memory_eval.py) @@ -75,12 +78,6 @@ # Step 1: Build BM25 index over doc chunks # ────────────────────────────────────────────────────────────────────────────── -def tokenize(text: str) -> List[str]: - """Lowercase; preserve decimal numbers (0.724) and numeric ranges (7-30).""" - tokens = re.findall(r'\d+[-\u2013]\d+|\d+\.\d+|\w+', text.lower()) - return [t for t in tokens if t] - - def chunk_document(filename: str, content: str) -> List[str]: """Split a document by ## section headers. Each chunk = filename § header\ncontent.""" chunks = [] diff --git a/benchmarks/eval/g1_longterm_baseline_eval.py b/benchmarks/eval/g1_longterm_baseline_eval.py index 1d6474a..10b1754 100644 --- a/benchmarks/eval/g1_longterm_baseline_eval.py +++ b/benchmarks/eval/g1_longterm_baseline_eval.py @@ -20,6 +20,9 @@ from pathlib import Path from typing import Dict, List, Optional, Tuple +sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "src" / "hooks")) +from _bm25.tokenizer import tokenize as _canonical_tokenize # noqa: E402 canonical (PR-1) + # ── LLM client ─────────────────────────────────────────────────────────────── def get_llm_client(): @@ -264,9 +267,7 @@ def get_bm25_context(query: str, commit_corpus: List[Dict], top_k: int = 7) -> T if not commit_corpus: return "[Empty corpus]", 0 - def tokenize(text: str) -> List[str]: - return re.findall(r'\b\w+\b', text.lower()) - + tokenize = _canonical_tokenize # PR-1: was local re.findall(r'\b\w+\b'); now canonical _bm25 tokenize subjects = [c.get('subject', '') for c in commit_corpus] tokenized = 
[tokenize(s) for s in subjects] bm25 = BM25Okapi(tokenized) diff --git a/benchmarks/eval/g2_docs_paraphrase_eval.py b/benchmarks/eval/g2_docs_paraphrase_eval.py index b391b58..bfc4c65 100644 --- a/benchmarks/eval/g2_docs_paraphrase_eval.py +++ b/benchmarks/eval/g2_docs_paraphrase_eval.py @@ -18,12 +18,16 @@ import json import re +import sys import time from pathlib import Path from typing import List, Tuple from rank_bm25 import BM25Okapi +sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "src" / "hooks")) +from _bm25.tokenizer import tokenize # noqa: E402 canonical (PR-1) + # ────────────────────────────────────────────────────────────────────────────── # 30 Paraphrase QA Pairs @@ -317,23 +321,6 @@ # BM25 index construction # ────────────────────────────────────────────────────────────────────────────── -_KO_PARTICLES = re.compile( - r'(와|과|이|가|은|는|을|를|의|에서|으로|에게|부터|까지|처럼|같이|보다|이나|며|에|로|도|만|나|고)$' -) - - -def tokenize(text: str) -> List[str]: - """Preserve decimal numbers and numeric ranges. 
Strip Korean particles from mixed tokens.""" - raw = re.findall(r'\d+[-\u2013]\d+|\d+\.\d+|\w+', text.lower()) - result = [] - for tok in raw: - cleaned = _KO_PARTICLES.sub('', tok) - if cleaned and cleaned != tok: - result.append(cleaned) - result.append(tok) - return list(dict.fromkeys(result)) - - def chunk_document(filename: str, content: str) -> List[str]: """Split by ## section headers.""" chunks = [] diff --git a/benchmarks/results/doc_retrieval_eval_v2.md b/benchmarks/results/doc_retrieval_eval_v2.md index fd587fd..973247e 100644 --- a/benchmarks/results/doc_retrieval_eval_v2.md +++ b/benchmarks/results/doc_retrieval_eval_v2.md @@ -1,7 +1,7 @@ # CTX Document Retrieval Evaluation v2 -**Date**: 2026-04-03 09:58 -**Corpus**: 62 .md files from docs/ +**Date**: 2026-05-05 05:15 +**Corpus**: 119 .md files from docs/ **Queries**: 100 (heading_exact + heading_paraphrase + keyword) **Metrics**: Recall@3, Recall@5, NDCG@5, MRR @@ -9,58 +9,58 @@ | Strategy | Recall@3 | Recall@5 | NDCG@5 | MRR | |----------|----------|----------|--------|-----| -| CTX-doc (heading+BM25) | **0.870** | **0.940** | 0.815 | 0.782 | -| BM25 | **0.590** | **0.760** | 0.594 | 0.562 | -| Dense TF-IDF | **0.560** | **0.670** | 0.546 | 0.537 | +| CTX-doc (heading+BM25) | **0.740** | **0.790** | 0.680 | 0.662 | +| BM25 | **0.490** | **0.590** | 0.443 | 0.424 | +| Dense TF-IDF | **0.490** | **0.610** | 0.472 | 0.452 | ## Per-Strategy Analysis ### CTX-doc (heading+BM25) -- Hits@3: 87/100 (87.0%) -- Hits@5: 94/100 (94.0%) -- NDCG@5: 0.815 -- MRR: 0.782 +- Hits@3: 74/100 (74.0%) +- Hits@5: 79/100 (79.0%) +- NDCG@5: 0.680 +- MRR: 0.662 **Misses (top 5)**: -- [keyword] `show information about minimax without` → expected `research/20260328-ctx-downstream-eval-complete.md` -- [keyword] `find docs related to memory cross` → expected `research/20260325-long-session-context-management.md` -- [keyword] `which document covers trigger retrieval` → expected `paper_draft_outline.md` -- [keyword] `nemotron 
research documentation` → expected `research/20260329-ctx-paper-gap-analysis.md` -- [keyword] `find docs related to locagent source` → expected `research/20260327-ctx-alternatives-research.md` +- [keyword] `find docs related to dense import` → expected `research/20260326-ctx-methodology-comparison.md` +- [keyword] `which document covers graph retrieval` → expected `paper_draft_outline.md` +- [heading_exact] `original question` → expected `research/20260330-ctx-academic-critique-web-grounded.md` +- [keyword] `find docs related to beir locagent` → expected `research/20260327-ctx-alternatives-research.md` +- [keyword] `notes about evaluation quality` → expected `research/20260402-g2-evaluation-methods-research-summary.md` ### BM25 -- Hits@3: 59/100 (59.0%) -- Hits@5: 76/100 (76.0%) -- NDCG@5: 0.594 -- MRR: 0.562 +- Hits@3: 49/100 (49.0%) +- Hits@5: 59/100 (59.0%) +- NDCG@5: 0.443 +- MRR: 0.424 **Misses (top 5)**: -- [heading_paraphrase] `where is ctx — document index documented` → expected `DOC_INDEX.md` -- [heading_exact] `즉시 실행 순서` → expected `marketing/active_outreach_playbook.md` -- [heading_exact] `실험 설계` → expected `research/20260327-ctx-downstream-eval.md` -- [heading_exact] `[expert-research-v2] ctx 약점 보완 대안 기술 분석` → expected `research/20260327-ctx-alternatives-research.md` -- [heading_exact] `ctx architecture` → expected `ARCHITECTURE.md` +- [heading_paraphrase] `I need info on [expert-research-v2] ctx 실험 방식 상위 티어 논문 기준 평론` → expected `research/20260324-ctx-methodology-critique-top-tier.md` +- [heading_exact] `5개 실제 시나리오` → expected `research/20260328-ctx-real-codebase-g2-eval.md` +- [heading_paraphrase] `find documentation about [expert-research-v2] ctx 성과 평론 — 상위` → expected `research/20260326-ctx-results-review.md` +- [heading_exact] `g1: cross-session memory recall` → expected `research/20260327-ctx-downstream-eval.md` +- [keyword] `find docs related to dense import` → expected `research/20260326-ctx-methodology-comparison.md` ### Dense TF-IDF -- Hits@3: 
56/100 (56.0%) -- Hits@5: 67/100 (67.0%) -- NDCG@5: 0.546 -- MRR: 0.537 +- Hits@3: 49/100 (49.0%) +- Hits@5: 61/100 (61.0%) +- NDCG@5: 0.472 +- MRR: 0.452 **Misses (top 5)**: -- [heading_paraphrase] `where is ctx — document index documented` → expected `DOC_INDEX.md` -- [keyword] `which document covers memory codebase` → expected `research/20260402-production-context-retrieval-research.md` -- [heading_exact] `즉시 실행 순서` → expected `marketing/active_outreach_playbook.md` -- [heading_exact] `실험 설계` → expected `research/20260327-ctx-downstream-eval.md` -- [heading_exact] `[expert-research-v2] ctx 약점 보완 대안 기술 분석` → expected `research/20260327-ctx-alternatives-research.md` +- [heading_paraphrase] `documentation for ctx: trigger-driven dynamic context loadin` → expected `paper/CTX_paper_draft.md` +- [heading_paraphrase] `I need info on [expert-research-v2] ctx 실험 방식 상위 티어 논문 기준 평론` → expected `research/20260324-ctx-methodology-critique-top-tier.md` +- [heading_exact] `5개 실제 시나리오` → expected `research/20260328-ctx-real-codebase-g2-eval.md` +- [heading_paraphrase] `find documentation about [expert-research-v2] ctx 성과 평론 — 상위` → expected `research/20260326-ctx-results-review.md` +- [heading_exact] `g1: cross-session memory recall` → expected `research/20260327-ctx-downstream-eval.md` ## Per-Query-Type Breakdown | Type | N | CTX R@3 | BM25 R@3 | Dense R@3 | |------|---|---------|----------|-----------| -| heading_exact | 37 | 0.973 | 0.595 | 0.514 | -| heading_paraphrase | 31 | 1.000 | 0.548 | 0.613 | -| keyword | 32 | 0.625 | 0.625 | 0.562 | +| heading_exact | 32 | 0.812 | 0.531 | 0.469 | +| heading_paraphrase | 34 | 1.000 | 0.529 | 0.588 | +| keyword | 34 | 0.412 | 0.412 | 0.412 | ## Method Description @@ -72,6 +72,6 @@ | Stat | Value | |------|-------| -| Total docs | 62 | -| Average headings/doc | 14.5 | -| Average keywords/doc | 14.8 | +| Total docs | 119 | +| Average headings/doc | 19.0 | +| Average keywords/doc | 15.0 | diff --git a/docs/community/BLOG_POST_eval_ko.md 
b/docs/community/BLOG_POST_eval_ko.md new file mode 100644 index 0000000..d7bc8b0 --- /dev/null +++ b/docs/community/BLOG_POST_eval_ko.md @@ -0,0 +1,136 @@ +# CTX × Context Mode 같이 쓸 때 무엇이 일어나는가 — 5 시나리오 × 4 상태 실측 + +> 2026-05-05 / Korean dev env / claude-opus-4-7 / 총 28 측정 $10.58 + +## 배경 + +Claude Code 에 hook 시스템을 두 개 동시에 깔아둔 상태로 며칠 작업하다 보니 — [`jaytoone/CTX`](https://github.com/jaytoone/CTX) (이 fork: [`hang-in/tunaCtx`](https://github.com/hang-in/tunaCtx)) 의 retrieval 기반 메모리 hook 과, [`mksglu/context-mode`](https://github.com/mksglu/context-mode) 의 sandbox/도구 압축 plugin — 한 가지 의문이 생겼다. + +> 둘이 같이 켜져 있으면 시너지인가, 충돌인가? + +추측하지 말고 측정하자고 결정했다. 본 글은 그 결과 공유. + +## 환경 + +- **모델**: `claude-opus-4-7` +- **호출**: `claude -p --output-format=stream-json --no-session-persistence --setting-sources ""` (headless) +- **상태 4종**: + - A — CTX + Context Mode 둘다 활성 + - B — CTX off, Context Mode only + - C — CTX only, Context Mode off + - D — 둘다 off (baseline) +- **검증 repo**: `seCall`, `tunaFlow`, `tunaCtx` (한국어 주석 + 영어 코드 혼재) +- **Judge**: `gemini -p` (Gemini 0.40.1) 로 응답 품질 ranking + +각 상태는 `~/.claude/settings.json` 에서 hooks 와 `enabledPlugins.context-mode@context-mode` 를 차등 제거한 stand-alone settings 파일로 격리. + +## 시나리오 5개 + +| # | 작업 | repo | +|---|---|---| +| 1 | 핵심 함수 분석 + 호출 흐름 + 테스트 | seCall | +| 2 | 최근 30 commit 변화 패턴 | seCall | +| 3 | 한국어 docstring 의 `Roundtable` / `RT` 검색 | tunaFlow | +| 4 | 최근 5 commit 의 진화 방향 정리 | tunaCtx | +| 5 | `.py` 파일 전체 TODO 주석 grep | tunaCtx | + +## 측정 결과 — Judge 1위 + +| 시나리오 | 1위 | 비고 | +|---|---|---| +| 1. 코드 검색 | **A (둘다)** | 라인번호(L133, L138) 정확 + Upstream/Downstream 그래프 | +| 2. 30 commit 분석 | D (baseline) | A 가 권한 거부로 abort — 후술 | +| 3. 한국어 docstring | **A (둘다)** | Rust 코드 + 버그 히스토리까지 깊이 분석 | +| 4. compaction | **C (CTX only)** | Context Mode 의 도구 호출이 노이즈 | +| 5. TODO grep | D (baseline) | A 가 권한 거부로 4위 — 후술 | + +처음에 표를 봤을 때 결론을 단순화하고 싶었다 — "Context Mode 는 충돌, CTX only 가 낫다". 그러나 그건 틀렸다. 시나리오 2, 5 의 패턴을 자세히 보니 다른 그림이 나왔다. 
+ +## 시나리오 2/5 의 "충돌" 은 권한 artifact 였다 + +A 상태 (둘다 활성) 의 응답 본문에 이런 문구가 박혀 있었다. + +``` +"Permission needed. Asking the user to grant ctx_batch_execute..." +"ctx_batch_execute 권한 거부됨. Grep tool로 진행" +``` + +`claude -p` headless 환경에서는 권한 prompt 가 응답 가능한 stdin 이 없다. Context Mode 의 `ctx_batch_execute` 도구를 호출하려고 했으나 권한 거부 → 작업 abort. **Context Mode 자체 결함이 아니라 headless 환경에서 권한 정책이 차단된 것**. + +검증을 위해 같은 8 측정을 `--dangerously-skip-permissions` 추가로 재실행했다 ($2.57 추가). + +| 시나리오 2 A 응답 head | | +|---|---| +| default permissions | `Permission needed. Asking the user to grant...` (abort) | +| `--dangerously-skip-permissions` | `## seCall 최근 30개 commit 분석... feat: 9건, fix: 6건, Merge PR: 9건...` (정상) | + +| 시나리오 5 A 응답 head | | +|---|---| +| default permissions | `ctx_batch_execute 권한 거부됨. Grep tool로 진행` (부분 fallback) | +| `--dangerously-skip-permissions` | `프로젝트 .py 파일 203개 중 # TODO/FIXME/XXX/HACK 주석은 0건` (정확 + .venv-golden 노이즈 명시 필터) | + +비용: skip-perm 시 +13~21%. Context Mode 의 도구 호출이 정상 진행되어 발생한 cost. quality 회복으로 정당화. + +즉 **interactive Claude Code 또는 `skipDangerousModePermissionPrompt: true` 설정 환경에서는 시나리오 2/5 의 "충돌" 은 발생하지 않는다**. headless `claude -p` 자동화에서만 명시적 권한 정책 필요. + +## 시나리오 4 의 "CTX only 가 1위" 는 권한과 무관 + +이건 다른 이야기. compaction (최근 5 commit 의 진화 방향 정리) 같은 작업에서 Context Mode 의 도구 압축 / batch 실행은 본질적으로 노이즈다 — 정리할 게 단순한 git log 1번이면 충분한데 ctx_batch_execute 까지 동원하면 응답 길이가 늘어나고 핵심 통찰이 흐려진다. 이 finding 은 권한과 별개로 신뢰성이 있다. + +| 시나리오 4 (compaction) | A (둘다) | C (CTX only) | +|---|---|---| +| Judge 순위 | 2위 | **1위** | +| 비용 | $0.334 | $0.158 | +| 응답 길이 | 2003 tokens | 1829 tokens | + +같은 정보를 절반 비용으로 더 충실하게 정리. + +## 비용 매트릭스 + +5 시나리오 × 4 상태 합계 (1차 측정, 권한 default): + +| 상태 | 합계 | 비고 | +|---|---:|---| +| C (CTX only) | $1.23 | 가장 저렴. 
단 시나리오 3-C 는 한국어 + Grep×27 으로 timeout | +| D (baseline) | $1.89 | | +| A (둘다) | $2.30 | ctx_batch_execute 호출 비용 | +| B (CM only) | $2.59 | 가장 비쌈 — Context Mode 가 retrieval 부재를 도구 호출로 보충 | + +## 추천 사용 패턴 + +| 환경 | 작업 | 권장 | +|---|---|---| +| Interactive Claude Code (또는 `skipDangerousModePermissionPrompt: true`) | 코드 분석 / 한국어 검색 | **둘다 ON** (시너지) | +| 동일 | Compaction / 결정 정리 | **CTX only** (Context Mode 노이즈) | +| Headless `claude -p` | 도구 집약 작업 | `--dangerously-skip-permissions` 또는 Context Mode OFF | +| 단순 채팅 | — | 둘다 OFF (overhead 회피) | + +**일반 권장**: CTX 는 항상 ON. Context Mode 는 환경 + 작업 종류에 따라 토글. + +## 한계 — 정직히 + +- **Sample size**: 시나리오마다 1 prompt × 4 상태. 응답 분산 측정 안 함. 실 production 에서는 분산이 클 수 있음. +- **Judge 1회**: Gemini -p 1번 평가. LLM-as-judge 의 일반적 편향 (선호도, 길이 편향) 가능. +- **Repo 의존성**: 한국어 + 영어 혼재 환경. 영어 전용 repo 는 다른 패턴 가능. +- **Headless ↔ interactive 동작 차이**: `claude -p` 의 sandbox 권한 동작이 interactive 와 정확히 일치한다는 보장 없음. interactive 직접 검증 미실시. +- **시나리오 3-C timeout**: 180s 한도 도달. CTX only + 한국어 광범위 grep (27회) 으로 인한 — CTX 의 G2-GREP fallback 이 한국어에서 너무 적극적일 가능성 있음. + +## 데이터 + +- 측정 raw: `tests/golden/raw/scenario*-{A,B,C,D}.{out,err}` (gitignored, 본 fork 의 `/tmp/eval-results/` 위치에 보존) +- 종합 보고서: [`docs/refactor/EVAL_RESULTS.md`](https://github.com/hang-in/tunaCtx/blob/master/docs/refactor/EVAL_RESULTS.md) +- 본 fork: [`hang-in/tunaCtx`](https://github.com/hang-in/tunaCtx) +- 원본: [`jaytoone/CTX`](https://github.com/jaytoone/CTX) + +## 마무리 + +처음에 데이터를 보고 단순화하고 싶었다 — "Context Mode 는 충돌이고 CTX only 가 항상 낫다". 그러나 8 측정 추가로 검증해 보니 시나리오 2/5 의 "충돌" 은 headless 환경 한정 권한 artifact 였고, interactive 환경에서는 시너지 가능성이 높다. + +세 줄 요약: + +1. **CTX 는 모든 환경 always-on safe**. 1위 또는 2위 안정. +2. **Context Mode 는 환경 의존**. interactive 는 시너지, headless 는 권한 정책 명시. +3. **Compaction 작업에서는 CTX only 가 항상 우월** (권한과 무관, $0.158 vs $0.334). 
+ +`claude -p` 자동화 / CI / batch 처리 환경이라면 settings.json 에 `skipDangerousModePermissionPrompt: true` 추가하거나 `--dangerously-skip-permissions` 플래그를 명시하는 게 두 hook 시너지를 살리는 가장 빠른 길이다. + diff --git a/docs/refactor/EVAL_RESULTS.md b/docs/refactor/EVAL_RESULTS.md new file mode 100644 index 0000000..fda11b6 --- /dev/null +++ b/docs/refactor/EVAL_RESULTS.md @@ -0,0 +1,168 @@ +# CTX × Context Mode 실측 검증 + +| 항목 | 값 | +|---|---| +| 측정 일시 | 2026-05-05 (07:24-07:50) | +| 모델 | claude-opus-4-7 (`--model opus`) | +| 호출 방식 | `claude -p --output-format=stream-json --no-session-persistence --setting-sources ""` | +| Judge | `gemini -p` (gemini 0.40.1) | +| 측정 수 | 5 시나리오 × 4 상태 = **20 측정** + 시나리오 2/5 × 4 상태 skip-perm 재측정 = **8 추가** | +| 총 비용 | **$10.58** (Opus claude -p; 1차 $8.01 + skip-perm $2.57), Gemini judge 추가 | +| 한 측정 평균 | 60s wall clock, 2.5K output tokens | + +## 환경 + +- **CTX**: tunaCtx fork (`hang-in/tunaCtx`, master @ `ab499c5`), `_bm25/` 11 모듈, vec-daemon + bge-daemon 활성, pipx ctx-retriever 0.3.13 +- **Context Mode**: `mksglu/context-mode` 1.0.107 (plugin) +- **검증 repo**: + - `seCall` (시나리오 1, 2) + - `tunaFlow` (시나리오 3) + - `tunaCtx` (시나리오 4, 5) + +## 4 상태 매트릭스 + +| 코드 | 상태 | +|---|---| +| **A** | CTX + Context Mode 둘다 활성 | +| **B** | CTX off, Context Mode only | +| **C** | CTX only, Context Mode off | +| **D** | 둘다 off (baseline) | + +각 상태는 `/tmp/eval-settings/settings-{A,B,C,D}.json` 에 stand-alone 으로 정의 — `~/.claude/settings.json` 에서 `hooks` (CTX 부분) + `enabledPlugins.context-mode@context-mode` 를 차등 제거. 
+ +--- + +## 측정 데이터 (정량) + +### 시나리오별 토큰 / 비용 / 시간 / 도구 + +| ID | duration | cost | input | output | cache_r | tools | +|---|---:|---:|---:|---:|---:|---| +| 1-A | 80.0s | $0.442 | 8 | 2007 | 82,307 | Agent×1, ctx_batch_execute×1, Read×10, Bash×13 | +| 1-B | 121.2s | $0.632 | 18 | 3462 | 285,263 | Agent×1, Bash×18, Glob×1, Grep×3, Read×12, ctx_batch_execute×1 | +| 1-C | 85.3s | $0.714 | 25 | 5657 | 512,443 | Glob×7, Grep×7, Read×4 | +| 1-D | 140.9s | $0.871 | 20 | 4488 | 617,369 | Agent×1, Bash×18, Glob×1, Grep×5, Read×16 | +| 2-A | 44.4s | $0.393 | 20 | 3615 | 138,429 | Bash×2, ToolSearch×1, ctx_batch_execute×2 | +| 2-B | 59.2s | $0.420 | 20 | 4365 | 137,093 | Bash×2, ToolSearch×1, ctx_batch_execute×2 | +| 2-C | 32.8s | $0.168 | 7 | 1793 | 44,924 | Bash×2 | +| 2-D | 44.2s | $0.191 | 7 | 2699 | 44,550 | Bash×2 | +| 3-A | 112.7s | $0.760 | 32 | 8591 | 257,468 | Grep×12, Read×9, ToolSearch×1, ctx_batch_execute×1 | +| 3-B | 121.7s | $0.857 | 29 | 8953 | 370,012 | Bash×1, Grep×4, ToolSearch×1, ctx_execute×2 | +| 3-C | ⚠ TIMEOUT | n/a | 1* | 44* | 47,272 | Agent×1, Bash×5, Grep×27, Read×5 | +| 3-D | 100.8s | $0.471 | 10 | 7303 | 177,393 | Grep×10 | +| 4-A | 35.8s | $0.334 | 20 | 2003 | 133,588 | Bash×1, ToolSearch×1, ctx_batch_execute×2 | +| 4-B | 27.6s | $0.163 | 12 | 1575 | 45,885 | Bash×1 | +| 4-C | 30.9s | $0.158 | 7 | 1829 | 44,400 | Bash×1 | +| 4-D | 48.7s | $0.205 | 8 | 2729 | 72,348 | Bash×1, Read×1 | +| 5-A | 29.9s | $0.370 | 22 | 1964 | 205,553 | Grep×4, ToolSearch×1, ctx_batch_execute×2 | +| 5-B | 54.3s | $0.523 | 36 | 3059 | 364,972 | Bash×1, Glob×1, Grep×3, ToolSearch×2, ctx_batch_execute×1, ctx_execute×2 | +| 5-C | 23.3s | $0.189 | 20 | 1462 | 132,086 | Grep×4 | +| 5-D | 16.9s | $0.149 | 9 | 1098 | 97,359 | Grep×3 | + +*3-C: 180s timeout, 응답 partial. 실제로는 Grep×27 등 의미 있는 도구 호출 진행 중이었으나 응답 종료 못함. 
+ +### 비용 합계 + +- **A 상태 (둘다 활성)** 합계: $2.30 — 가장 비쌈 +- **B 상태 (CM only)** 합계: $2.59 +- **C 상태 (CTX only)** 합계: $1.23 (3-C timeout 미과금) — 가장 저렴 +- **D 상태 (baseline)** 합계: $1.89 + +→ **CTX only (C) 가 cost 측면에서 가장 효율적**. Context Mode 가 활성화되면 ctx_batch_execute / ctx_execute 호출이 추가되어 비용 ↑. + +--- + +## Judge 결과 (Gemini -p, 정성) + +| 시나리오 | 1위 | 2위 | 3위 | 4위 | 결정적 차이 | +|---|---|---|---|---|---| +| 1 (코드 검색) | **A** | B | C | D | A: 라인번호 매핑(L133, L138) 정확 + Upstream/Downstream 관계 명확 | +| 2 (긴 출력) | **D** | C | B | A | **A 가 권한 거부로 작업 중단** — Context Mode sandbox 부작용 | +| 3 (한국어) | **A** | B | D | C | A: Rust 코드 + 버그 히스토리까지 깊이 있게 분석. C: timeout | +| 4 (compaction) | **C** | A | B | D | **CTX only 가 결정 근거의 전략적 통찰 최고** | +| 5 (hook 충돌) | **D** | C | B | A | **A 가 권한 거부로 4위** — Context Mode sandbox 가 도구 차단 | + +--- + +## 시너지 / 충돌 / 무차별 분석 + +### ✅ 시너지 — A (둘다 활성) 가 우월한 시나리오 + +**시나리오 1, 3** (코드 검색 + 한국어). +- CTX 의 G1/G2 retrieval 로 관련 파일 사전 주입 + Context Mode 의 도구 압축이 도구 사용을 효율화 +- 라인번호 정확도, 깊이 있는 분석에서 A 가 다른 상태 모두 능가 + +### ⚠ 시나리오 2, 5 의 "충돌" — headless 권한 artifact 임이 입증됨 (skip-perm 재측정) + +**1차 측정 (default permissions)**: +- 시나리오 2 A: `"Permission needed. Asking the user to grant ctx_batch_execute..."` — 작업 abort +- 시나리오 5 A: `"ctx_batch_execute 권한 거부됨. Grep tool로 진행"` — 부분 fallback + +**검증을 위한 2차 측정 (`--dangerously-skip-permissions`)** — 시나리오 2, 5 × 4 상태 = 8 측정 추가 ($2.57): + +| 상태 | 시나리오 2 A 응답 head | +|---|---| +| default permissions | `Permission needed. Asking the user to grant...` (abort) | +| skip-perm | `## seCall 최근 30개 commit 분석... feat: 9건, fix: 6건, Merge PR: 9건...` (정상 분석) | + +| 상태 | 시나리오 5 A 응답 head | +|---|---| +| default permissions | `ctx_batch_execute 권한 거부됨. Grep tool로 진행` (부분 fallback) | +| skip-perm | `프로젝트 .py 파일 203개 중 # TODO/FIXME/XXX/HACK 주석은 0건` (정확 + .venv-golden 노이즈 명시 필터) | + +**비용 변화**: skip-perm 시 A 비용 +13~21% — Context Mode 도구 호출이 정상 진행되어 추가 호출 발생. quality 회복으로 정당화. 
+ +**결론**: 1차 측정의 "A 가 4위" 결과는 **headless `claude -p` 의 권한 prompt 응답 불가** artifact. 다음 환경 중 하나면 정상 동작: +- Interactive Claude Code 세션 (사용자가 권한 승인 가능) +- `~/.claude/settings.json` 의 `skipDangerousModePermissionPrompt: true` (사용자 환경에 이미 설정됨) +- `claude -p --dangerously-skip-permissions` (CI / 자동화) + +**즉 Context Mode 가 본질적으로 해로운 건 아님**. 단, headless 환경에서 권한 정책이 명확하지 않으면 도구 호출이 차단될 수 있음 — `mksglu/context-mode` 에 `--allow-ctx-tools` 같은 명시적 옵션 권고 (UPSTREAM_ISSUE_mksglu.md 참조). + +### 🟰 CTX 의 가치는 모든 시나리오에서 양 또는 중립 + +**시나리오 4** (compaction): C (CTX only) > A > B > D — CTX 의 G1/G2 retrieval 이 commit 진화 분석에 결정적. Context Mode 는 오히려 노이즈 추가. + +CTX 가 1위 또는 2위인 시나리오: 1, 2 (C 2위), 3, 4 (C 1위), 5 (C 2위). CTX 가 4위인 시나리오는 없음. + +--- + +## 추천 사용 패턴 + +| 작업 종류 | 권장 | +|---|---| +| 코드 분석, 한국어 검색, 깊이 있는 탐색 | **CTX + Context Mode 둘다 활성** (시너지) | +| 도구 호출이 많은 작업 (commit 분석, 코드베이스 grep) | **CTX only** (Context Mode 는 sandbox 부작용) | +| Compaction / 결정 근거 정리 | **CTX only** (가장 우월) | +| 자동화 / headless 배치 | **CTX only** (sandbox 권한 prompt 회피) | +| 단순 채팅 | **둘다 off** (overhead 회피) | + +**일반 권장**: **CTX 는 항상 ON**. Context Mode 는: +- **Interactive Claude Code** 또는 `skipDangerousModePermissionPrompt: true` 환경 → 항상 ON (시나리오 2, 5 의 "충돌" 은 발생 안 함) +- **Headless `claude -p` 자동화** → `--dangerously-skip-permissions` 추가하지 않으면 OFF 권장 (또는 명시적 권한 정책 설정) +- **단순 채팅** → 둘다 OFF (overhead 회피) + +--- + +## 본 검증의 한계 + +- **Sample size**: 시나리오마다 1 prompt × 4 상태. 응답 분산 측정 안 함. 실제 production 에서는 분산이 클 수 있음. +- **Judge 주관성**: Gemini 1회 평가. LLM-as-judge 의 일반적 한계 (선호도, 길이 편향 등). +- **Repo 의존성**: seCall, tunaFlow, tunaCtx 는 실제 한국어 + 코드 환경. 영어 전용 repo 에서는 다른 패턴 가능. +- **Headless 환경**: `claude -p` 의 sandbox 권한 동작이 interactive 세션과 다를 수 있음 — 시나리오 2, 5 의 권한 거부 패턴이 interactive 에서 재현되는지 별도 검증 필요. +- **3-C timeout**: 180s 한도. CTX only + 한국어 패턴 검색이 timeout 에 도달 — Grep 27 회. CTX 의 G2-GREP fallback 이 한국어에서 광범위 grep 시도하는 패턴. 
+ +## 원시 데이터 + +- 측정 raw: `/tmp/eval-results/raw/scenario*-{A,B,C,D}.{out,err}` (gitignored) +- 메트릭 JSON: `/tmp/eval-results/scenario*-{A,B,C,D}.json` (gitignored) +- Judge JSON: `/tmp/eval-results/judge-scenario*.json` (gitignored) + +--- + +## 다음 검증 후보 + +- **Sample size 확대**: 각 시나리오 5-10회 반복으로 응답 분산 측정. +- **시나리오 6 명시**: A vs B vs C vs D 의 4 상태 외에 더 fine-grained 토글 (예: CTX 의 vec-daemon 만 끄기, BGE 만 끄기). +- **Headless vs Interactive**: 같은 시나리오를 interactive Claude Code 에서 측정해서 sandbox 권한 패턴 비교. +- **Context Mode 의 ctx_batch_execute / ctx_execute 가 실제 토큰 압축 효과**: cache_read 차이로 추정 가능하지만 명시적 측정 미실시. diff --git a/docs/refactor/HANDOFF.md b/docs/refactor/HANDOFF.md new file mode 100644 index 0000000..b6eac62 --- /dev/null +++ b/docs/refactor/HANDOFF.md @@ -0,0 +1,320 @@ +# 핸드오프 — tunaCtx production-level refactor 사이클 + +| 항목 | 값 | +|---|---| +| 마지막 갱신 | 2026-05-05 (Cycle-3.5 — Windows TCP fallback PR 머지 + upstream 협업 라운드 1 종료) | +| 작업 식별자 | Phase 0 → Task A/B/C/D → Phase 9 → Cycle-2 → Cycle-3 (docs hygiene) → Cycle-3.5 (PR #1 merge + upstream coordination) | +| 작업 디렉토리 | `/Users/d9ng/privateProject/tunaCtx` (clone 후 fork remote 로 운영, 정식 GitHub fork 아님) | +| 현재 브랜치 | `master` (= `origin/master` = `hang-in/tunaCtx:master`) | +| 마지막 commit | `29f241c feat(hooks): Windows TCP loopback fallback for AF_UNIX-less CPython (#1)` | +| 회귀 가드 상태 | golden **15/26 PASS** (11 fallback drift, §6-1 함정 — production 회귀 아님) / pytest **105 PASS / 0 skip** | +| 원본 upstream | `https://github.com/jaytoone/CTX` (remote: `upstream`) | +| Fork remote | `https://github.com/hang-in/tunaCtx` (remote: `origin`) | +| Upstream issues | #1 fork 알림 (2026-05-04, jaytoone 답변 + 우리 reply 발행, **응답 대기**) / #2 docs R@5 정합성 정정 (2026-05-05, jaytoone fix 적용 + **CLOSED**) | +| Fork PR | #1 Windows TCP fallback (2026-05-05, gemini 5건 반영, **MERGED** `29f241c`) | + +## 1. 이 fork 의 정체 + +**원본**: `jaytoone/CTX` — Trigger-Driven Dynamic Context Loading for Code-Aware LLM Agents. 
+ +**fork 가 한 일**: retrieval **알고리즘은 변경하지 않음**. Claude Code hook 구현이 실제 사용 환경에서 안전하게 운영되도록 production readiness 만 손봄. + +**fork 가 안 한 일**: paper 작성, 새 retrieval strategy 추가, 알고리즘 변경, README 의 마케팅 톤. 이런 것 다음 세션에서도 추가하지 말 것 — 명시적 user 요청 없는 한. + +## 2. 작업 history (시간순) + +| 단계 | 산출 | 회귀 검증 | +|---|---|---| +| **Phase 0** | `tests/golden/` 픽스처 26개 캡처 (deterministic 모드) — fallback 14 + BM25-path 12 | 26/26 PASS 베이스라인 | +| **Task A** | `bm25-memory.py` 1837줄 → orchestrator 300줄 + `src/hooks/_bm25/` 11 모듈 | 26/26 유지 | +| **Task B** | `tests/unit/` 64개 단위 테스트 신설 | pytest 64/0 | +| **Wave 1 후속** (codex Critical+Major) | `_bm25/` packaging 누락 fix, cache fixture fix, sqlite_vec guard | 64/0 유지 | +| **Task C** | tokenizer + ranker 단일 canonical (`_bm25/tokenizer.tokenize`, `_bm25/ranker.score_corpus_bm25`) — `adaptive_trigger.py`, `doc_retrieval_eval_v2.py`, `bm25_retriever.py`, `coir_evaluator.py` 모두 통합 | 26/26 + benchmark delta=0 | +| **Task D** | `bm25-memory` orchestrator telemetry instrument (7 events, lazy gate) | pytest 70/0 | +| **Phase 9** (codex 최종 리뷰) | Critical 1 + Major 4 + Minor 3 + golden 옵션 B 권고 | — | +| **Phase 9 후속** | Critical (`ctx-install` hash-based update) + Major #1/#2/#4 + Minor #1 + golden 옵션 B (G2-GREP normalize) | golden 25→26/26 회복 | +| **Cycle-2** | golden runner stderr 가드 (옵션), atomic write 실 filesystem 검증, `_bm25/__init__.py` 17함수 re-export, `--uninstall` cleanup, plan footer | pytest 82→105/0 | +| **Cycle-3 (docs hygiene)** | (a) README 검색 stack bullet 추가 — "BM25 만" 커뮤니티 오해 정정 / (b) R@5=0.152 stale 인용 갱신 (CLAUDE.md L91·L197, PRODUCTION_REFACTOR_PLAN.md L263) — iter11 재측정 Mean R@5=0.595 인용 / (c) README 에 외부 codebase 측정값 (참고) bullet 추가 + upstream issue 링크 / (d) upstream issue #2 발행 (docs R@5 정합성) | pytest 105/0 / golden 15/26 (fallback drift §6-1) | +| **Cycle-3.5 (PR merge + upstream coord)** | (a) PR #1 Windows TCP fallback 머지 (`29f241c`) — gemini 5건 반영 (`socket` import top-level 정리, `SO_REUSEADDR` Windows 가드) / (b) upstream issue #2 jaytoone 
fix 후 close + 우리 감사 댓글 / (c) upstream issue #1 jaytoone 질문 3개 + PR shape 답변 발행 (Q1 tokenizer canonical, Q2 sqlite_vec graceful, Q3 install hash-based, PR 분해 5단계 + subtoken splitter 별도 사이클 명시) | pytest 105/0 / golden 15/26 (drift 동일) | + +## 3. 현재 코드 상태 + +### 디렉토리 구조 (요점) + +``` +src/ + hooks/ + bm25-memory.py # orchestrator (300 lines) + _bm25/ # 11 모듈 (canonical BM25 구현) + __init__.py # 17 public 함수 re-export + tokenizer.py # 한국어 조사 + Porter + stopword + ranker.py # score_corpus_bm25 + bm25_rank_decisions + ... + corpus.py # G1 decision corpus (git HEAD-keyed cache) + rerank.py # vec-daemon + BGE cross-encoder + autotune.py # ctx-auto-tune.json reader + docs_search.py # G2-DOCS BM25 + hybrid + code_search.py # G2 code grep + reindex (deterministic sort) + hooks_search.py # ~/.claude/hooks/*.py BM25 + session.py / injection.py / output.py + chat-memory.py # vault.db FTS5 + vec0 (sqlite_vec guard 추가됨) + memory-keyword-trigger.py + g2-fallback.py + utility-rate.py + _ctx_telemetry.py + cli/ + install.py # ctx-install (hash-based update + --force-hooks/--no-update-hooks/--uninstall) + settings_patcher.py # atomic write + timestamped backup + telemetry.py # ctx-telemetry + retrieval/ + adaptive_trigger.py # 통합된 _bm25 토크나이저/스코어러 사용. 단 self.bm25 = BM25Okapi(...) 잔존 (persistent 객체 재사용 — 정당) + bm25_retriever.py + ... 
(그 외 7 strategy) + evaluator/ + coir_evaluator.py # _bm25 통합됨 + +tests/ + golden/ + bm25_memory_outputs.jsonl # 26 픽스처 (fallback 14 + bm25path 12) + bm25_path_corpus_frozen.json # 결정성 보장용 frozen G1 corpus + run_golden.py # G2-GREP normalize + expected_stderr 옵션 + unit/ # 105 tests + test_settings_patcher.py # 22 (atomic, idempotency, real fs rename) + test_install_cli.py # 32 (hash update, force, no-update flags) + test_chat_memory_fallback.py # 9 (subprocess 기반) + test_bm25_memory_cache.py # 7 (HEAD invalidation) + test_bm25_memory_telemetry.py # 6 (gate, fallback reason, exception) + test_code_search_sort.py # 6 ((-count, path) deterministic) + test_bm25_init_reexport.py # 10 (callable, no circular, state 비노출) + test_uninstall_cleanup.py # 10 (hash 보존, force, dry-run) + conftest.py # tmp_home / tmp_project fixtures + +docs/refactor/ + PRODUCTION_REFACTOR_PLAN.md # 본 사이클 plan (footer 정정 포함) + TELEMETRY_SCHEMA.md # 7 이벤트 스키마 명세 + HANDOFF.md # 이 문서 + +scripts/ + verify_bm25_unified.py # BM25 통합 sanity check +``` + +### 잔존 BM25Okapi (의도적) + +`grep -rE "BM25Okapi" src/`: +- `src/hooks/_bm25/{ranker,docs_search,hooks_search}.py` — canonical 위치 (3 사이트) +- `src/retrieval/adaptive_trigger.py` — persistent 객체 재사용 패턴 (성능 우선, codex 도 정당화 인정) +- archival benchmark (`benchmarks/eval/*.py`) 11+ 개 — 의도적 보류 (A/B 실험 의미 보존) + +다음 세션에서 이걸 더 정리하려는 욕심은 **하지 말 것**. ROI 낮고 회귀 risk 높음. + +## 4. 
적용 상태 (2026-05-05 06:32) + +### ctx-install 실행 완료 + +```bash +.venv-golden/bin/ctx-install +``` + +결과: +- `~/.claude/hooks/` 에 18 파일 복사 (5 hook + 11 `_bm25/*` + `_ctx_telemetry.py` + `utility-rate.py`) +- `~/.local/share/claude-vault/` 에 vec-daemon + bge-daemon 2개 복사 +- `~/.claude/settings.json` 에 5 hook 등록: + - UserPromptSubmit: chat-memory, bm25-memory --rich, memory-keyword-trigger + - PostToolUse(Grep): g2-fallback + - Stop: utility-rate +- backup: `~/.claude/settings.backup_20260505_063258.json` + +### 현재 알려진 제약 (Cycle-3 시점 갱신, 2026-05-05 기준) + +이전 HANDOFF 의 "fallback 모드 / daemon down" 기술은 **Cycle-3 시점에 모두 해소됨**. 실측 결과: + +- ✅ **hook command 가 pipx python 사용 중**: `~/.claude/settings.json` 의 5개 hook command 모두 `/Users/d9ng/.local/pipx/venvs/ctx-retriever/bin/python` 경로 — system python3 fallback 아님 +- ✅ **pipx venv 패키지 정상**: `rank_bm25`, `numpy 2.4.4`, `sklearn 1.8.0`, `networkx 3.6.1` 모두 설치됨 +- ✅ **vec-daemon running** (PID 24808 — 검증 시점, socket active) +- ✅ **bge-daemon running** (PID 25006 — `BAAI/bge-reranker-v2-m3` 모델 로드 완료, socket active) +- → 옵션 C (pipx 격리) 가 적용 완료된 상태로 운영 중. 옵션 B / 옵션 C 활성화 가이드 (아래 §4-1, §4-2) 는 재설치 시 참고용으로만 보존 + +### §4-1. BM25 / 의미층 활성화 (옵션 C — pipx 격리, 권장 — **현재 적용된 옵션**) + +```bash +# 1. pipx 설치 (없으면) +brew install pipx +pipx ensurepath + +# 2. ctx-retriever 격리 설치 (작업 디렉토리 = 현재 tunaCtx clone) +pipx install /Users/d9ng/privateProject/tunaCtx +# → ~/.local/pipx/venvs/ctx-retriever/bin/ 에 ctx-install / ctx-telemetry / python 위치 + +# 3. settings.json 의 hook command 의 python 경로를 pipx python 으로: +# python3 $HOME/.claude/hooks/bm25-memory.py --rich +# → +# /Users/d9ng/.local/pipx/venvs/ctx-retriever/bin/python $HOME/.claude/hooks/bm25-memory.py --rich + +# 4. 
vec-daemon / bge-daemon 도 동일하게 pipx python 사용: +nohup ~/.local/pipx/venvs/ctx-retriever/bin/python ~/.local/share/claude-vault/vec-daemon.py >/dev/null 2>&1 & +nohup ~/.local/pipx/venvs/ctx-retriever/bin/python ~/.local/share/claude-vault/bge-daemon.py >/dev/null 2>&1 & +``` + +### §4-2. BM25 / 의미층 활성화 (옵션 B — 현재 dev venv 사용, 빠름) + +```bash +# .venv-golden 가 이미 ctx-retriever editable + rank_bm25 + numpy + sklearn + networkx 설치됨 +# settings.json 에서 hook command 의 'python3' 를 venv python 으로 교체: +# python3 $HOME/.claude/hooks/... +# → +# /Users/d9ng/privateProject/tunaCtx/.venv-golden/bin/python $HOME/.claude/hooks/... + +# vec-daemon / bge-daemon 도 동일하게: +nohup /Users/d9ng/privateProject/tunaCtx/.venv-golden/bin/python ~/.local/share/claude-vault/vec-daemon.py >/dev/null 2>&1 & +nohup /Users/d9ng/privateProject/tunaCtx/.venv-golden/bin/python ~/.local/share/claude-vault/bge-daemon.py >/dev/null 2>&1 & +``` + +옵션 B 는 이 dev 머신에 한정. 다른 머신/사용자 재현 시 옵션 C 권장. + +### 롤백 (필요 시) + +```bash +# settings.json 만 되돌리기: +cp ~/.claude/settings.backup_20260505_063258.json ~/.claude/settings.json + +# hook 파일까지 정리: +.venv-golden/bin/ctx-install --uninstall # 사용자 수정 보존 +.venv-golden/bin/ctx-install --uninstall --force # 강제 모두 제거 +``` + +## 5. 검증 명령 (다음 세션에서 sanity check) + +```bash +cd /Users/d9ng/privateProject/tunaCtx + +# 회귀 가드 (deterministic hook output) +.venv-golden/bin/python tests/golden/run_golden.py +# 기대: 15/26 fixtures passed (Cycle-3 시점) +# — 11 fallback variant 가 git log drift 영향으로 FAIL 중. §6-1 참조. +# — bm25path variant 12/12 PASS — frozen corpus 메커니즘 정상. +# — fixture refresh 가 필요한 경우: --update 후 production 동작 변화 없는지 확인 후 commit + +# 단위 테스트 +.venv-golden/bin/python -m pytest tests/unit -q +# 기대: 105 passed + +# BM25 통합 sanity +.venv-golden/bin/python scripts/verify_bm25_unified.py +# 기대: ALL CHECKS PASSED + +# install 상태 점검 +.venv-golden/bin/ctx-install status +``` + +## 6. 다음 세션이 알아야 할 함정 + +### 6-1. 
Golden fixture 의 git history 의존성 + +`tests/golden/bm25_memory_outputs.jsonl` 의 BM25-path 픽스처는 G1 (decision corpus) 상위 ranking 을 stdout 에 포함. **새 commit 이 추가되면 corpus 가 진화하면서 ranking 이 변동 → fixture drift**. + +`tests/golden/bm25_path_corpus_frozen.json` 가 frozen corpus 메커니즘 일부 제공하지만 완벽하지 않음 — 현재 매 사이클 끝에 다음 명령으로 fixture refresh 필요: + +```bash +python3 tests/golden/run_golden.py --update +# 단, 이 갱신이 production 동작 변화 없는지 확인 후 수용 +``` + +drift 가 발생하는 것은 production 회귀가 아니라 입력 데이터(git log) 의 자연 진화. fixture 의 expected_stdout 을 갱신하는 게 정상 패턴. + +장기적으로 frozen corpus 메커니즘 강화하려면: +- runner 가 BM25-path fixture 실행 시 git log 를 무시하고 frozen corpus 만 사용하도록 강제 +- 현재 `run_golden.py:75 _inject_frozen_corpus()` 가 부분 구현. 이게 모든 BM25 경로에 적용되는지 검증 필요. + +### 6-2. Telemetry zero-cost gate + +`src/hooks/bm25-memory.py:64` 부근의 `_TELEMETRY_ENABLED` 모듈 수준 gate + `_log_event_impl = None` lazy import. 이 패턴이 깨지면 telemetry 비활성 시에도 latency 가 발생함. + +다음 세션에서 bm25-memory.py orchestrator 를 만질 때 이 패턴을 보존할 것. 검증: +```bash +.venv-golden/bin/python -m pytest tests/unit/test_bm25_memory_telemetry.py -v +# test_telemetry_latency_overhead_under_5ms 가 PASS 해야 함 +``` + +### 6-3. `_bm25/` 의 cross-package import + +`src/retrieval/adaptive_trigger.py` 가 `from src.hooks._bm25 import tokenize, score_corpus_bm25` 로 cross-package import. architectural purity 관점에서는 어색하지만 (`retrieval` 이 `hooks` 를 의존), 실용적 trade-off. 향후 깔끔하게 정리하려면 `src/_shared/bm25/` neutral 위치로 이동 + 양쪽이 거기서 import. 단 이건 본 사이클 명시 보류 항목. + +### 6-4. `chat-memory.py` 의 sqlite_vec import + +`src/hooks/chat-memory.py:16` 는 `try/except ImportError` 로 `sqlite_vec` 무방어 import 보강됨. 단 `chat-memory.py` 자체는 본 사이클에서 분해하지 않음 — 529줄. 다음 사이클 후보. 분해 시 `_bm25/` 와 동일 패턴 적용. + +### 6-5. archival benchmark 11+개 + +`benchmarks/eval/g1_*.py`, `g2_*.py`, `mab_*.py`, `nemotron_*.py`, `retrieve_ctx_v2.py` 등이 자체 BM25Okapi 구현. **통합 시 paper headline 결과 (MAB N=50 ctx_v3=0.880 등) 가 변할 수 있음** — 즉 frozen 결과의 회귀 risk. 통합 욕심 내지 말 것. + +### 6-6. 
외부 codebase R@5 수치의 다중성 (Cycle-3 발견) + +upstream / fork docs 에 외부 codebase R@5 가 **여러 시점 측정값으로 공존** — +- `0.152`: 가장 옛날 baseline (`docs/research/20260326-ctx-methodology-comparison.md` L70 — pre-fix, 자체 텍스트에서 stale 인정) +- `0.495`: SEMANTIC trigger fix 후 (commits `727b5c3`) +- `0.595`: iter11 재측정 (`benchmarks/results/reeval_external_iter11.json` — Mean R@5 = 0.595, Flask 0.6462 / FastAPI 0.3870 / Requests 0.7526). **canonical 확정 (jaytoone 답변 in issue #2)**. +- `0.744`: `docs/benchmark/g1_g2_publication_framework.md` 의 다른 평가 framework — **superseded by iter11** (jaytoone 명시). + +Cycle-3 에서 fork 내부 인용은 0.595 로 통일 + Cycle-3.5 에서 jaytoone 답변으로 canonical 확정. 다음 세션부터 R@5 인용은 **0.595 = canonical** 로 단정 가능. + +또한 `benchmarks/eval/reeval_external.py` 를 직접 재실행하려면 입력 query JSON (`benchmark_real_eval_*.json`) 이 repo 에 없음 (`find` 결과 0건) — git history 복원 또는 upstream 문의 선행 필요. + +### 6-7. README.md 의 upstream PR 제외 (Cycle-3.5 결정) + +**README.md 는 upstream PR scope 에 포함하지 않음** — fork 와 upstream 의 방향성이 다르기 때문: +- upstream README: paper / benchmark 헤드라인 / academic 톤 +- fork README: production hook 운영 / 설치 / 적용 가이드 / `검색 stack` 명시 / 외부 codebase 측정값 참고 / Empirical eval (Context Mode) 등 + +향후 upstream PR 분해 진행 시 (issue #1 의 5단계 plan): +- Tokenizer unification / sqlite_vec graceful / install hash-update / bm25-memory 분해 / telemetry instrumentation — 모두 **README 수정 없이** 코드/테스트만 cherry-pick +- README 갱신은 fork 단독 유지 — 두 repo 의 사용자 페르소나가 다르다는 인정 + +## 7. upstream 처리 (참고) + +upstream issues: +- `#1` (2026-05-04): fork 알림 → jaytoone 답변 (3 questions + PR 환영) → 우리 reply 발행 (Cycle-3.5) — **응답 대기 중** (어느 PR shape 로 진행할지) +- `#2` (2026-05-05): docs R@5 정합성 정정 권고 → jaytoone 즉시 fix push + canonical 0.595 확정 → 우리 감사 댓글 + close — **CLOSED** + +PR shape 응답 시 시나리오: +- **upstream 이 5단계 분해 환영** → issue #1 reply 의 순서대로 진행: + 1. Tokenizer unification (가장 작고 안전) + 2. sqlite_vec graceful degradation + 3. install.py hash-based update + atomic settings_patcher + 4. 
bm25-memory.py 11 sub-module 분해 (의존: 1 land 후) + 5. Telemetry instrumentation + 6. **(별도)** subtoken splitter — 우리도 미구현, 아직 fork 에 없음. jaytoone 의지에 따라 협업 +- **upstream 이 bundled PR 선호** → 한 번에 큰 PR. 단 회귀 가드는 동일 (golden + 105 unit) +- **upstream 응답 없거나 보류** → fork 단독 운영 (downstream maintenance) + +PR 분해 시 **README 수정은 절대 포함 X** (§6-7 참조). + +## 8. 다음 세션이 처음 할 행동 + +1. `cd /Users/d9ng/privateProject/tunaCtx` +2. 본 문서 (`docs/refactor/HANDOFF.md`) 읽기 +3. `git log --oneline -5` 로 최신 commit 확인 — `29f241c` 가 마지막이어야 함 +4. `.venv-golden/bin/python tests/golden/run_golden.py` 로 15/26 확인 (11 fallback drift 는 §6-1 함정으로 알려진 상태) +5. `.venv-golden/bin/python -m pytest tests/unit -q` 로 105/0 확인 +6. upstream issue #1 의 jaytoone 응답 확인 — `gh issue view 1 --repo jaytoone/CTX --comments` (PR shape 결정 후 5단계 분해 첫 PR 시작 가능) +7. user 의 새 요청 들으며 본 문서의 §6 함정 회피 + +## 9. 의도적으로 안 한 것 (다음 세션도 따를 것) + +- ✗ paper 작성, paper 관련 README 추가 +- ✗ 마케팅 톤 (benchmark 자랑, "1.9x higher TES" 같은 것) +- ✗ **upstream PR 에 README.md 포함** (Cycle-3.5 결정, §6-7 참조 — fork 와 upstream 의 사용자 페르소나 다름) +- ✗ archival benchmark 11+개의 BM25 통합 +- ✗ retrieval 알고리즘 변경 +- ✗ `~/.claude/settings.json` 의 hook command 를 명시적 user 승인 없이 변경 +- ✗ 신규 MCP server 또는 hook event 추가 +- ✗ tests/golden 픽스처 데이터의 production-coded 변경 (production behavior 변화 없는 cascade 만 `--update` 로 갱신) + +## 10. 
환경 메타 + +- macOS Darwin 25.4.0 +- 작업 디렉토리: `/Users/d9ng/privateProject/tunaCtx` (clone 후 `origin = hang-in/tunaCtx` 추가, GitHub 정식 fork 아님) +- system python3: `/opt/homebrew/bin/python3` (3.14, PEP 668 protected, **rank_bm25 미설치**) +- dev venv: `.venv-golden/bin/python` (3.14, rank_bm25 + numpy + sklearn + networkx + ctx-retriever editable 설치됨) +- pipx venv: `~/.local/pipx/venvs/ctx-retriever/bin/python` (3.14, 동일 패키지 + 격리 — **현재 hook command 가 사용 중인 python**) +- vec-daemon / bge-daemon: pipx python 으로 기동 중 (Cycle-3 검증 시점 PID 24808 / 25006) +- gh CLI: 인증 완료 (`dghong-d9ng` account) +- Claude Code: 본 conversation 의 hook 동작 확인됨 (UserPromptSubmit hook 이 prompt 마다 fire) diff --git a/docs/refactor/PRODUCTION_REFACTOR_PLAN.md b/docs/refactor/PRODUCTION_REFACTOR_PLAN.md new file mode 100644 index 0000000..faf17b1 --- /dev/null +++ b/docs/refactor/PRODUCTION_REFACTOR_PLAN.md @@ -0,0 +1,294 @@ +# Production Refactor Plan — CTX + +| 항목 | 값 | +|---|---| +| 작성일 | 2026-05-05 | +| 작성자 | Opus 4.7 (1M context) | +| 대상 브랜치 | `refactor/production-ready` (base: `master` @ `201c810`) | +| 실행 모델 | Sonnet 4.6 (병렬 가능 작업은 single-message multi Agent call) | +| 리뷰 도구 | `codex exec` v0.122.0 (최종 1회, 필요 시 mid-checkpoint 추가) | +| 보존 대상 | `~/.claude/settings.json` 의 hook command path, stdin/stdout 프로토콜, 기존 캐시 파일 포맷 | + +--- + +## 0. 배경 + +지난 리뷰(2026-05-05)에서 식별된 production-readiness 부족분 4건을 한 사이클로 해소한다. + +근거: 본 plan 하단의 "참고: 코드베이스 사실 확인" 절 참조. + +핵심 위험 요소 +- `src/hooks/bm25-memory.py` 가 1837줄 단일 파일 — production hot path 의 god module. +- production hook 의 atomic write / 캐시 무효화 / fallback 경로에 unit test 없음. +- eval 쪽 `src/retrieval/adaptive_trigger.py` 와 production 쪽 `bm25-memory.py` 가 **별개의 BM25 토크나이저/스코어러를 보유** — 한쪽 개선이 다른 쪽으로 전파되지 않아 eval 점수와 실제 hook 품질이 표류 가능. +- `bm25-memory.py` 는 telemetry instrument 가 없음 — 본인이 한 변경의 효과를 자기 데이터로 측정 불가. + +--- + +## 1. 스코프 + +### Task A — `bm25-memory.py` 모듈 분해 + +**목표**: 1837줄 단일 파일을 책임 단위로 분해하고 orchestrator 를 250줄 이하로 축소. 
+ +**현재 파일 내부 섹션 마커** (작성자가 이미 `# ── ... ──` 으로 명시한 경계): + +| 라인 범위 | 섹션 | 분리 대상 모듈 | +|---|---|---| +| 39-300 | Tokenizer + stopwords + stem + `tokenize` + `expand_query_tokens` | `_bm25/tokenizer.py` | +| 73-301 | Semantic rerank (vec-daemon, BGE cross-encoder, synonym fusion) | `_bm25/rerank.py` | +| 84-95 | Auto-tune reader (`ctx-auto-tune.json`) | `_bm25/autotune.py` | +| 351-524 | G1 Decision Corpus (git head + build/get + cache) | `_bm25/corpus.py` | +| 526-742 | Ranker primitives (`embed_corpus_items`, `dense_rank_decisions`, `rrf_merge`, `bm25_rank_decisions`, `hybrid_rank_decisions`) | `_bm25/ranker.py` | +| 743-1009 | G2-DOCS BM25 + hybrid search | `_bm25/docs_search.py` | +| 1010-1240 | G2 code file discovery + reindex check + grep fallback + citation log | `_bm25/code_search.py` | +| 1241-1306+ | G2 hooks file BM25 search | `_bm25/hooks_search.py` | +| 나머지 | stdin parse → 분기 dispatch → 컨텍스트 조립 | `bm25-memory.py` (orchestrator) | + +**불변 제약** +- `~/.claude/settings.json` 에 등록된 `bm25-memory.py` 경로는 그대로. 진입점 파일명/위치 변경 금지. +- stdin JSON 스키마 / stdout 출력 포맷 변경 금지. +- 캐시 파일 경로(`.omc/decision_corpus.json`, `.omc/docs_corpus_emb.json`) 변경 금지. +- env var 이름 변경 금지(`CTX_DISABLE_SEMANTIC_RERANK`, `CTX_CROSS_ENCODER`, `CTX_TELEMETRY` 등). + +**Acceptance 기준** +- `bm25-memory.py` ≤ 300 줄. +- `_bm25/*.py` 각 파일 ≤ 400 줄. +- 골든 픽스처(Phase 0) 출력 100% 일치. +- p50/p95 latency ±10% 이내(측정 방법: Phase 0 픽스처에 시간 측정 옵션 포함). + +**산출물**: `src/hooks/_bm25/__init__.py` + 8개 모듈 + thin orchestrator. + +--- + +### Task B — Production hook 단위 테스트 + +**목표**: `~/.claude/settings.json` 을 직접 건드리는 쓰기 경로에 회귀 가드 깔기. 
+ +**범위** +- `tests/unit/test_settings_patcher.py` + - atomic write (temp + os.replace) + - 타임스탬프 backup 생성 + - idempotency (두 번 patch 시 중복 없음) + - dry-run 동작 + - unpatch 동작 + - 손상된 JSON 입력 처리 +- `tests/unit/test_install_cli.py` + - 기존 다른 hook 보존 + - PostToolUse matcher 동일 그룹에 머지 + - 진입점 commands 정확 +- `tests/unit/test_chat_memory_fallback.py` + - vault.db 없음 → degrade 동작 + - vec-daemon socket 없음 → BM25-only fallback + `⚠ vec-daemon down` 출력 +- `tests/unit/test_bm25_memory_cache.py` + - git HEAD 변경 시 `.omc/decision_corpus.json` 재빌드 + - HEAD 동일 시 캐시 hit (재빌드 안 함) + - corrupted cache 감지 시 안전하게 재빌드 + +**인프라** +- `pyproject.toml` 의 `[tool.pytest.ini_options]` 섹션 추가. +- 패키지 매핑(`ctx_retriever` ↔ `src/`) 정합성 확인 — 현재 `[tool.setuptools] packages` 와 실제 `src/` 디렉토리 구조가 어긋나 있을 가능성. 필요 시 `[tool.setuptools.packages.find]` 로 정리. +- `tests/unit/conftest.py` 에 임시 디렉토리 / mock subprocess 헬퍼. + +**Acceptance 기준** +- `pytest tests/unit -q` 모두 통과. +- 위 4개 모듈 line coverage ≥ 70%. + +--- + +### Task C — eval ↔ production BM25 통합 + +**목표**: 토크나이저와 BM25 스코어링을 단일 모듈에서 공유 — eval(`adaptive_trigger.py`) 과 production(`bm25-memory.py`) 양쪽이 같은 코드 경로를 사용. + +**전제**: Task A 완료 후 진행 (A 산출물의 `_bm25/tokenizer.py`, `_bm25/ranker.py` 가 통합 베이스). + +**작업** +1. `_bm25/tokenizer.py` 와 `_bm25/ranker.py` 의 공개 API 를 `src/retrieval/bm25_core.py` 또는 공유 위치로 승격(또는 직접 import). +2. `src/retrieval/adaptive_trigger.py` 의 `BM25Okapi` 직접 호출 부분을 공유 API 경유로 변경. +3. `adaptive_trigger.py` 내 자체 토큰 분할 로직을 공유 토크나이저로 대체(쿼리/코퍼스 양쪽). +4. `benchmarks/eval/doc_retrieval_eval_v2.py` 재실행으로 회귀 검증. + +**Acceptance 기준** +- `python3 benchmarks/eval/doc_retrieval_eval_v2.py` 결과: + - 전체 R@3 ≥ 0.852 (현재 0.862, 마진 -0.010) + - keyword R@3 ≥ 0.714 (현재 0.724) + - heading_paraphrase R@3 == 1.000 (회귀 0) +- 토크나이저 import 경로가 단 하나(`from src.hooks._bm25.tokenizer import tokenize` 또는 등가). + +--- + +### Task D — `bm25-memory` telemetry instrument + +**목표**: `bm25-memory.py` orchestrator 에 telemetry 이벤트 emit 추가. 자기 변경의 효과를 자기 데이터로 측정 가능하게 만든다. 
+ +**전제**: Task A 완료 후 진행(orchestrator 가 thin 해진 상태에서 instrument). + +**기존 인프라 활용** +- `src/hooks/_ctx_telemetry.py` (210줄) — `chat-memory.py` 가 이미 사용 중. +- 출력: `~/.claude/ctx-retrieval-events.jsonl` +- 활성화: `CTX_TELEMETRY=1` env 또는 `~/.claude/ctx-telemetry.enabled` 파일 존재. +- 비활성화 시 zero overhead(early return). + +**emit 이벤트 스키마** (`retrieval_event` v1.1 기존 schema 따름) +- `hook=bm25-memory` +- `query_type` (`_classify_query_type` 결과) +- `g1_top_score_bm25`, `g1_top_score_dense` (이미 `_last_retrieval_scores` 에 캡처되고 있음) +- `g2_docs_count`, `g2_code_count`, `g2_hooks_count` +- `fallback_reasons` (예: `vec_daemon_down`, `bge_daemon_down`, `mcp_db_stale`) +- `latency_ms` (전체 hook 처리 시간) + +**emit 패턴** +- 비차단(파일 append, fsync 안 함). +- exception 시 silently drop — telemetry 가 hook 을 절대 죽이지 않도록. + +**Acceptance 기준** +- `CTX_TELEMETRY=1 echo '{"prompt": "test"}' | python3 src/hooks/bm25-memory.py` 후 `~/.claude/ctx-retrieval-events.jsonl` 마지막 라인에 hook=bm25-memory 이벤트 존재. +- `CTX_TELEMETRY` 미설정 시 jsonl 추가 안 됨, latency 영향 ≤ 1ms. +- `tests/unit/test_bm25_memory_telemetry.py` 추가. + +--- + +## 2. 의존 그래프 / 실행 순서 + +``` +Phase 0 ──► Task A ──┬──► Task C +(픽스처) ├──► Task D + │ + Task B ──┘ (독립, A 와 병렬) + │ + ▼ + Phase 9 (codex 리뷰, 모든 task 완료 후) +``` + +**Wave 분할** + +| Wave | 작업 | 병렬화 | 모델 | +|---|---|---|---| +| 0 | 골든 픽스처 캡처 | — | Sonnet 1개 | +| 1 | Task A + Task B | 병렬 (file-disjoint) | Sonnet 2개 동시 | +| 2 | Task C | A 완료 후 단독 | Sonnet 1개 | +| 3 | Task D | A 완료 후 단독 (C 와 병렬 가능하나 ranker.py 동시 수정 위험 → 순차 권장) | Sonnet 1개 | +| 9 | codex 최종 리뷰 | — | codex exec | + +**Wave 1 의 file-disjoint 근거** +- Task A: `src/hooks/bm25-memory.py`, `src/hooks/_bm25/*` (신규) +- Task B: `tests/unit/*` (신규), `pyproject.toml` (라인 추가만) +- 겹치는 영역 없음 → 동시 실행 안전. + +**Wave 2 ↔ Wave 3 순서 결정 근거** +- C 는 `_bm25/tokenizer.py`, `_bm25/ranker.py` 를 read 하고 `adaptive_trigger.py` 를 수정. +- D 는 orchestrator(`bm25-memory.py`) 에 instrument 호출 추가, `_bm25/ranker.py` 는 read-only. 
+- 이론상 병렬 가능하나 `ranker.py` 시그니처 미세 변경 가능성 있어 순차 권장. + +--- + +## 3. 회귀 가드 (안전장치) + +### Phase 0: 골든 픽스처 (반드시 A 시작 전) + +**산출물**: `tests/golden/bm25_memory_outputs.jsonl` + +대표 prompts 10-15개에 대해 현재 `bm25-memory.py` 출력을 (deterministic 모드로) 캡처. + +prompts 카테고리 +- 키워드 단일 ("BM25 어디 있지?") +- 한국어 paraphrase ("의사결정 기억 어떻게 관리됨?") +- code-finding ("vec-daemon 코드 위치") +- 의미 회피 ([noctx] prefix) +- 빈 prompt / 매우 짧은 prompt + +각 픽스처에 기록할 필드 +- input prompt +- env (CTX_DISABLE_SEMANTIC_RERANK 강제 ON 으로 비결정성 제거) +- expected stdout +- expected exit code +- elapsed_ms (정보용) + +**비결정성 제거 전략** +- vec-daemon / bge-daemon 비활성화 (`CTX_DISABLE_SEMANTIC_RERANK=1`, `CTX_CROSS_ENCODER=0`) +- git HEAD 고정(현재 `master` HEAD `201c810`) +- 캐시 강제 재빌드 후 동일 입력 2회 출력 일치 확인 + +A 완료 후 동일 픽스처 재실행으로 출력 비교 — diff 0 줄이 acceptance. + +### 각 Task acceptance 는 §1 의 항목별 기준 따름. + +### 롤백 +- 모든 작업은 `refactor/production-ready` 브랜치 위. master 미변경. +- Task 별 commit 분리 → 부분 롤백 용이. +- production 사용자(`~/.claude/settings.json` wired 한 본인) 환경에는 머지 전까지 영향 없음. + +--- + +## 4. codex 리뷰 (Phase 9) + +**기본 명령** +```bash +codex exec --model gpt-5.1 \ + "Review the diff between master and refactor/production-ready. \ + Focus on: (1) bm25-memory.py 진입점/프로토콜 보존, \ + (2) atomic write 정합성, (3) 신규 _bm25/* 모듈의 SRP 준수, \ + (4) 토크나이저 통합으로 인한 eval 회귀 가능성, \ + (5) telemetry path 의 zero-overhead 보장. \ + Output: severity 별 finding 목록 + 구체적 라인 인용." +``` + +**리뷰 입력** +- `git diff master..refactor/production-ready` 전체. +- 본 PLAN.md 자체. +- benchmark 회귀 결과 JSON. + +**리뷰 산출물** +- `docs/refactor/REVIEW_codex_.md` — Severity Critical/Major/Minor 분류. +- 각 finding 에 대한 (a) 수용/반려 결정 (b) 후속 action item. + +**Mid-checkpoint 리뷰** (선택) +- 사용자 명시 요청 시에만 트리거. +- 기본 구성: Wave 1 종료 시점에서만 — A 의 분해가 가장 위험도가 높음. + +--- + +## 5. 메모리에 기록할 항목 (작업 종료 후) + +- `feedback` 타입: "production refactor 시 hook 진입점 경로/stdin 스키마는 절대 변경 금지 (사용자 settings.json wired)" — 향후 유사 변경에 동일 제약 적용. +- `project` 타입: "bm25-memory 분해 후 _bm25/* 패키지가 production hot path. 
추후 hook 변경은 orchestrator 레이어에서만 시작." +- `reference` 타입: "production hook 회귀 가드 픽스처는 `tests/golden/bm25_memory_outputs.jsonl`." + +--- + +## 6. 비스코프 (의도적 제외) + +다음은 **이번 사이클에서 다루지 않는다** — 별도 사이클 후보. + +- `src/retrieval/adaptive_trigger.py` 자체의 god object 해체 — eval 한정 영향, 외부 R@5 추가 개선 작업과 묶어서 별도 진행. (참고: iter11 재측정 기준 Mean R@5=0.595, 0.152 는 pre-fix baseline 으로 stale) +- `chat-memory.py` 분해 — 현재 529줄로 임계 미만, vault.db 통합 변경과 함께 진행. +- 외부 코드베이스 R@5 개선, AST 파서 교체, GraphRAG 통합 등 알고리즘 변경. +- 새 retrieval 전략 추가, README 갱신, packaging 변경(버전 bump 제외). + +--- + +## 7. 참고: 코드베이스 사실 확인 (2026-05-05 기준) + +| 항목 | 값 | +|---|---| +| `src/hooks/bm25-memory.py` | 1837 lines | +| `src/hooks/chat-memory.py` | 529 lines | +| `src/hooks/_ctx_telemetry.py` | 210 lines (이미 존재, chat-memory 가 사용) | +| `src/cli/install.py` | 393 lines | +| `src/cli/settings_patcher.py` | 169 lines (atomic write 구현 검증 완료) | +| `src/retrieval/adaptive_trigger.py` | 1063 lines (eval 한정, production hook 미사용 — `grep -nr AdaptiveTriggerRetriever src/hooks src/cli` → 0건) | +| 현재 `tests/` 내용 | 4 files, unit test 사실상 없음 (eval 스크립트 위주) | +| `pyproject.toml` | `pytest>=7.0` 이미 dev extra 에 포함 | + +--- + +## 8. 
Task Tracker 매핑 + +| Plan ID | TaskList ID | Status | +|---|---|---| +| Phase 0 | #1 | pending | +| Task A | #2 | pending (blocked by #1) | +| Task B | #3 | pending | +| Task C | #4 | pending (blocked by #2) | +| Task D | #5 | pending (blocked by #2) | +| Phase 9 | #6 | pending (blocked by #2,#3,#4,#5) | diff --git a/docs/refactor/TELEMETRY_SCHEMA.md b/docs/refactor/TELEMETRY_SCHEMA.md new file mode 100644 index 0000000..046e7f6 --- /dev/null +++ b/docs/refactor/TELEMETRY_SCHEMA.md @@ -0,0 +1,131 @@ +# CTX Telemetry Schema + +| 항목 | 값 | +|---|---| +| 작성일 | 2026-05-05 | +| 스키마 버전 | 1 | +| 로그 경로 | `~/.claude/ctx-telemetry.jsonl` | +| 활성화 | `CTX_TELEMETRY=1` env 또는 `~/.claude/ctx-telemetry.enabled` 파일 | +| 비활성화 | 위 조건 없으면 zero overhead (early return) | +| 비차단 | 모든 emit은 `try/except Exception: pass` 로 감싸짐 — telemetry 오류가 hook 을 죽이지 않음 | + +## 레코드 공통 필드 + +모든 이벤트 레코드에 포함: + +| 필드 | 타입 | 설명 | +|---|---|---| +| `ts` | int | Unix timestamp (초) | +| `schema` | int | 스키마 버전 (현재 1) | +| `project` | str | CWD 마지막 세그먼트 (≤40자) — 경로 노출 방지 | +| `ab_group` | str | `control` / `treatment` / `ungrouped` | +| `type` | str | 이벤트 타입 (아래 표 참조) | + +## 이벤트 타입별 필드 + +### `hook_complete` (bm25-memory 주요 메트릭) + +bm25-memory.py 가 매 호출마다 종료 직전에 emit. **주 분석 대상.** + +| 필드 | 타입 | 설명 | +|---|---|---| +| `hook` | str | `"bm25-memory"` | +| `latency_ms` | int | 전체 hook 처리 시간 (ms) | +| `exit_code` | int | 정상 종료 시 `0` | +| `query_type` | str | `_classify_query_type()` 결과 (keyword / korean / etc.) 
| +| `g1_count` | int | G1 에서 반환된 decision 수 | +| `g2_docs_count` | int | G2-DOCS 에서 반환된 chunk 수 | +| `g2_code_count` | int | G2-CODE (graph 또는 grep) 에서 반환된 파일/노드 수 | +| `g2_hooks_count` | int | G2-HOOKS 에서 반환된 hook 파일 수 | +| `g1_top_score_bm25` | float | G1 BM25 최고 점수 (있을 때만) | +| `g1_top_score_dense` | float | G1 dense 최고 점수 (있을 때만) | +| `fallback_reasons` | str | comma-separated: `vec_daemon_down`, `bge_daemon_down`, `mcp_db_stale`, `mcp_db_missing` | +| `blocks_fired` | str | comma-separated: 실제 출력된 block 태그 (`g1`, `g2_docs`, `g2_prefetch`, `g2_grep`, `g2_hooks`) | + +### `prompt_received` (bm25-memory 선택적, 진입 직후) + +| 필드 | 타입 | 설명 | +|---|---|---| +| `hook` | str | `"bm25-memory"` | +| `query_type` | str | 분류 결과 | +| `prompt_len` | int | prompt 문자 수 | + +### `g1_done` (bm25-memory, G1 완료 시) + +| 필드 | 타입 | 설명 | +|---|---|---| +| `hook` | str | `"bm25-memory"` | +| `g1_count` | int | 반환 건수 | +| `g1_top_score_bm25` | float | BM25 최고 점수 (있을 때만) | +| `g1_top_score_dense` | float | dense 최고 점수 (있을 때만) | +| `duration_ms` | int | G1 단계 소요 시간 | + +### `g2_docs_done` (bm25-memory, G2-DOCS 완료 시) + +| 필드 | 타입 | 설명 | +|---|---|---| +| `hook` | str | `"bm25-memory"` | +| `g2_docs_count` | int | 반환 chunk 수 | +| `top_score` | float | 최고 점수 (있을 때만) | +| `duration_ms` | int | 소요 시간 | + +### `g2_code_done` (bm25-memory, G2-CODE 완료 시) + +| 필드 | 타입 | 설명 | +|---|---|---| +| `hook` | str | `"bm25-memory"` | +| `g2_code_count` | int | 반환 수 | +| `fallback_reason` | str | `"grep_fallback"` (DB 없을 때) | +| `duration_ms` | int | 소요 시간 | + +### `g2_hooks_done` (bm25-memory, G2-HOOKS 완료 시) + +| 필드 | 타입 | 설명 | +|---|---|---| +| `hook` | str | `"bm25-memory"` | +| `g2_hooks_count` | int | 반환 hook 파일 수 | +| `duration_ms` | int | 소요 시간 | + +### `block_fired` (bm25-memory, 레거시 — 블록 단위) + +| 필드 | 타입 | 설명 | +|---|---|---| +| `hook` | str | `"bm25-memory"` | +| `block` | str | `g1_decisions` / `g2_docs` / `g2_prefetch` / `g2_grep` / `g2_hooks` | +| `count` | int | 항목 수 | +| `duration_ms` | int | 소요 
시간 | + +### `hook_invoked` (bm25-memory, 레거시 호환용) + +dashboard 하위호환을 위해 유지. `hook_complete` 로 마이그레이션 권장. + +| 필드 | 타입 | 설명 | +|---|---|---| +| `hook` | str | `"bm25-memory"` | +| `duration_ms` | int | 전체 소요 시간 | +| `prompt_len` | int | prompt 길이 | + +### chat-memory 이벤트 (기존, 변경 없음) + +| 타입 | 설명 | +|---|---| +| `mode_switch` | hybrid ↔ bm25-only 전환 | +| `warning_fired` | daemon_down 등 경고 | +| `auto_index` | 인덱스 자동 업데이트 | +| `utility_measured` | Stop hook utility rate 측정 | +| `wow_fired` | high-utility + old-decision recall 달성 | + +### 기타 공통 이벤트 + +| 타입 | hook | 설명 | +|---|---|---| +| `decision_captured` | memory-keyword-trigger | 키워드 감지로 결정 기록 유도 | +| `grep_signal` | g2-fallback | Grep 결과 빈약 감지 | +| `ab_skipped` | bm25-memory / chat-memory | A/B control arm skip | + +## 프라이버시 계약 + +- prompt 내용, 파일 내용, 검색 결과 텍스트, 키워드, DB 행 **절대 기록 안 함**. +- 카운트, 시간, 점수, 분류 태그만 기록. +- 로컬 전용 — 네트워크 전송 없음. +- whitelist 기반 sanitize: `_ALLOWED_KEYS` 에 없는 필드는 자동 제거. diff --git a/docs/refactor/UPSTREAM_ISSUE_jaytoone.md b/docs/refactor/UPSTREAM_ISSUE_jaytoone.md new file mode 100644 index 0000000..fe2981c --- /dev/null +++ b/docs/refactor/UPSTREAM_ISSUE_jaytoone.md @@ -0,0 +1,63 @@ +# Empirical measurement: CTX with Context Mode in Korean dev env (5 scenarios × 4 states) + +Hi jaytoone, + +Following up on https://github.com/jaytoone/CTX/issues/1, I ran an empirical evaluation of CTX (this fork: `hang-in/tunaCtx`) alongside `mksglu/context-mode` plugin in a real Korean development environment. Sharing the data because the patterns surfaced in Korean prompts may be useful for upstream. 
+ +## Setup + +- **Model**: claude-opus-4-7 +- **Invocation**: `claude -p --output-format=stream-json --no-session-persistence` (headless) +- **States**: A=both active, B=CM only, C=CTX only, D=baseline +- **Repos**: seCall, tunaFlow, tunaCtx (mixed Korean comments / English code) +- **CTX layers active**: BM25 + vec-daemon (`multilingual-e5-small`) + bge-daemon (`bge-reranker-v2-m3`) +- **Total cost**: $8.01 across 20 measurements + +## Key findings relevant to upstream + +### A CTX-enabled state (A or C) finishes 1st or 2nd in all 5 scenarios + +| Scenario | CTX rank vs no-CTX | Note | +|---|---|---| +| Code-search (Korean prompt → seCall) | A=1st, C=3rd | CTX+CM synergy beats baseline | +| 30-commit analysis | C=2nd | Context Mode added cost without value here | +| Korean docstring search ("Roundtable" / "RT") | A=1st, C=4th (timeout) | CTX surfaced Rust files via G2-DOCS | +| Commit evolution analysis | **C=1st** | CTX-only beat all combos including CTX+CM | +| Production refactor TODO grep | C=2nd | CTX+CM crashed (sandbox permission) | + +### Korean prompt token shape + +CTX's tokenizer (`_bm25/tokenizer.tokenize`) strips Korean particles and applies Porter stemming. Cumulative input/output tokens per scenario: + +| Scenario | input | output | cache_read | +|---|---:|---:|---:| +| Korean docstring search (CTX+CM) | 32 | 8591 | 257K | +| Korean docstring search (CTX only, timeout) | 1* | 44* | 47K | +| Korean docstring search (baseline) | 10 | 7303 | 177K | + +The CTX-only / Korean-search scenario hit a 180s timeout while doing 27× Grep. This suggests CTX's G2-GREP fallback is too aggressive when the BM25/dense layers don't surface a clear top-N — Korean tokenization may be producing too many candidate keywords. + +### G2-GREP determinism (relevant to your codebase) + +We added `(-count, path)` tie-break to `_bm25/code_search.py:233` (was: `count` only) to fix golden fixture flakiness. Sort is now deterministic. 
Upstream may want to consider the same — `tests/golden/run_golden.py` cascade was the symptom but the root cause is fragile sort. + +### Fork-specific changes that might be useful upstream (no PR pressure) + +- `_bm25/` package decomposition (1837 LOC orchestrator → 300 LOC + 11 sub-modules) — same retrieval algorithm, just decomposed for testability +- 105 unit tests covering hooks/install/cache/telemetry +- Hash-based hook update in `ctx-install` (handles users who modified their hook files) +- `sqlite_vec` graceful fallback in `chat-memory.py` +- `score_corpus_bm25()` as the single canonical BM25 entrypoint shared across hook + benchmark eval scripts + +All in `master` of `hang-in/tunaCtx`. Happy to extract any of these as small PRs if you'd like — pick whatever's useful, ignore the rest. + +## What's measured / what's not + +- Measured: token usage, latency, cost, tool call counts, response text +- Judge: Gemini 1× per scenario (not multiple raters; LLM-as-judge has known biases) +- Sample size: 1 prompt × 4 states per scenario (no variance estimate) +- Not measured: interactive vs headless behavior delta (sandbox permission patterns may differ) + +Full report: `docs/refactor/EVAL_RESULTS.md` in `hang-in/tunaCtx`. + +If any of this is useful for the next CTX release or paper, happy to share raw data and re-run with adjusted prompts. diff --git a/docs/refactor/UPSTREAM_ISSUE_mksglu.md b/docs/refactor/UPSTREAM_ISSUE_mksglu.md new file mode 100644 index 0000000..cafd674 --- /dev/null +++ b/docs/refactor/UPSTREAM_ISSUE_mksglu.md @@ -0,0 +1,76 @@ +# Empirical measurement: Context Mode interaction with CTX hooks in headless mode + +Hi mksglu, + +I ran an empirical eval of `context-mode` running alongside another UserPromptSubmit hook (CTX, retrieval-based memory injection — `hang-in/tunaCtx`) in headless `claude -p` mode. Sharing the data because two patterns surfaced that might be relevant. 
+ +## Setup + +- **context-mode version**: 1.0.107 (plugin) +- **Model**: claude-opus-4-7 +- **Invocation**: `claude -p --output-format=stream-json --no-session-persistence` +- **States compared**: + - A = both active + - B = Context Mode only (CTX hooks removed from settings.json) + - C = CTX only (`enabledPlugins.context-mode@context-mode` removed) + - D = neither +- **Total**: 5 scenarios × 4 states = 20 measurements at ~$8 of Opus + +## Two patterns observed in headless mode + +### Pattern 1 — `ctx_batch_execute` permission denial in non-interactive sessions + +In two scenarios (commit analysis, TODO grep across .py files), the response with Context Mode active contained explicit error text like: + +``` +"ctx_batch_execute 권한 거부됨" / permission denied for ctx_batch_execute +"Permission needed. Asking the user to grant ctx_batch_execute..." +``` + +The model attempted to use Context Mode's batch tool but the headless environment couldn't surface the permission prompt, so the call was denied and the entire task aborted. The "both active" state initially ranked 4th of 4 in those two scenarios. + +**Verification with `--dangerously-skip-permissions`** (8 additional measurements, +$2.57): + +| Scenario | default permissions (1st run) | with `--dangerously-skip-permissions` | +|---|---|---| +| 30-commit analysis (state A) | `Permission needed. Asking the user to grant...` (abort) | `## seCall 최근 30개 commit 분석... feat: 9건, fix: 6건...` (full analysis) | +| TODO grep across .py (state A) | `ctx_batch_execute 권한 거부됨. Grep tool로 진행` (partial fallback) | `프로젝트 .py 파일 203개 중 # TODO/FIXME/XXX/HACK 주석은 0건` (precise, includes .venv-golden noise filtering) | + +So the "Pattern 1" ranking issue is **definitely a headless permission artifact**, not a defect in Context Mode itself. Cost goes up 13–21% with skip-perm because `ctx_batch_execute` actually runs — which is the intended behavior. 
+ +Possible mitigations from your side: + +- Ship a documented `--allow-ctx-tools` env var or settings flag for headless use +- Or auto-skip ctx_batch_execute when stdin/stdout aren't a TTY +- Or document the headless permission story in the README (e.g. recommend `--dangerously-skip-permissions` for CI) + +### Pattern 2 — `ctx_batch_execute` cost in low-tool-density scenarios + +For scenarios where the model would have used 1-2 simple tools anyway (e.g. "summarize last 5 commits"), Context Mode's tool routing added overhead without value: + +| Scenario | A (both) cost | C (CTX only) cost | D (none) cost | A's judge rank | +|---|---:|---:|---:|---:| +| 5-commit summary | $0.334 | $0.158 | $0.205 | 2nd of 4 (C won) | +| 30-commit analysis | $0.393 | $0.168 | $0.191 | **4th** (timeout/permission) | + +For "small" tasks, Context Mode's batching infrastructure activates but doesn't compress meaningfully more than a 1-line `git log`. The C state (Context Mode off) was cheaper and equal-or-better quality. + +For "large" tasks where compression would matter (the 30-commit analysis), the permission denial described in Pattern 1 cancelled any benefit. + +The scenarios where Context Mode added clear value were: +- Code search with line-number precision (1st place, both active) +- Korean docstring search across mixed Rust/TS codebases (1st place) + +Both involved `Read`-heavy paths where Context Mode's plumbing helped — but those wins are inside the `ctx_batch_execute` flow, not the `ctx_execute` raw-output one. + +## Suggestion (optional) + +A `~/.claude/context-mode-config.json` style toggle for "tool-heavy / tool-light" workloads, or a heuristic that auto-disables `ctx_batch_execute` when the prompt looks small (e.g. expected response < 1KB), might capture both wins (compress when valuable) and avoid the friction in light/headless cases. 
+ +## Data + +- Full report: `docs/refactor/EVAL_RESULTS.md` in `hang-in/tunaCtx` +- Korean dev env (mixed Korean prompts + Korean comments + English code) +- 1 prompt × 4 states per scenario (no variance estimate; LLM-as-judge has known biases) + +Happy to re-run with different prompt sets or share raw stream-json output if useful for context-mode improvements. Not asking for any specific change — just figured Korean + headless data might be a corner of your user base you don't see often. diff --git a/docs/refactor/upstream-issue-1-reply-draft.md b/docs/refactor/upstream-issue-1-reply-draft.md new file mode 100644 index 0000000..fe021db --- /dev/null +++ b/docs/refactor/upstream-issue-1-reply-draft.md @@ -0,0 +1,94 @@ +# Upstream issue #1 — reply draft (response to jaytoone comment 2026-05-07) + +목적: jaytoone 의 P0/P1/P2 우선순위 + boundary design 우려 + co-maintain 제안에 대한 회신. + +## Reply body (English, paste into https://github.com/jaytoone/CTX/issues/1) + +--- + +Hey jaytoone — thanks for the detailed triage. Reordering my plan to match your priorities. sqlite_vec is dropped (already in 0.3.14), and the four pieces below map 1:1 to what you flagged. 
+ +### Revised PR plan (4 stages) + +| Order | Piece | Maps to your ask | Risk | +|---|---|---|---| +| **PR-1** | Tokenizer unification (`_bm25/tokenizer.tokenize` as single canonical entry; eval + production both call it) | P0 (you wanted this most) | Lowest — token stream byte-identical, BM25 score delta = 0 on regression suite | +| **PR-2** | Test suite: 82 unit tests under `tests/unit/` + 26 golden fixtures under `tests/golden/` | P1 (CI gap) | None — additive | +| **PR-3** | Deterministic sort in `_bm25/code_search.py:233` (`scored.sort(key=lambda x: (-x[0], x[1]))` — score desc + path tiebreak) plus the `_bm25/ranker.py` sort sites at L49/L79/L153 | P2 (the non-determinism bug you mentioned) | Low — pure ordering fix | +| **PR-4** | 11-module decomposition under `src/hooks/_bm25/` | Pending boundary discussion (this comment) | Medium — depends on your boundary review | + +I can hold PR-4 until the boundaries below pass your review. PR-3 is independent of the others and can land at any point; PR-2 should follow PR-1, since its tests import the unified tokenizer. + +### Boundary design (re: "single-file is a deliberate tradeoff") + +Your point on copy/audit ergonomics is valid — I want to address it head-on. The decomposition is **not** a generic "split by file size" refactor. Each module owns one role with no cyclic imports; the orchestrator (`bm25-memory.py`, ~300 LOC) wires them together. 
Module map: + +| Module | LOC | Role | Imports from | +|---|---|---|---| +| `tokenizer.py` | 230 | Canonical `tokenize()` (Korean particle strip + Porter stem + stopword) | stdlib only | +| `corpus.py` | 240 | Decision corpus build + `_classify_query_type` | tokenizer | +| `ranker.py` | 270 | `score_corpus_bm25`, `hybrid_rank_decisions`, `last_retrieval_scores` | tokenizer, corpus | +| `docs_search.py` | 320 | `build_docs_bm25`, `hybrid_search_docs` (G2-DOCS) | tokenizer, ranker | +| `code_search.py` | 310 | Codebase graph + grep fallback (G2-CODE) | tokenizer | +| `hooks_search.py` | 100 | `~/.claude/hooks/*.py` BM25 (G2-HOOKS) | tokenizer, ranker | +| `rerank.py` | 240 | vec-daemon / cross-encoder rerank (optional) | stdlib + socket | +| `session.py` | 95 | World model + pending decisions snapshot | stdlib | +| `injection.py` | 170 | Final injection record writer | stdlib | +| `output.py` | 90 | Header lines + emit | stdlib | +| `autotune.py` | 70 | Top-K constants + flags | stdlib | +| `__init__.py` | 55 | Re-exports for orchestrator | (above) | + +Properties: + +- **No cycles** — DAG: `tokenizer → corpus → ranker → {docs_search, code_search, hooks_search}`; `rerank`, `session`, `injection`, `output`, `autotune` are leaves. +- **Single-file copy/audit preserved**: the orchestrator (`bm25-memory.py`) is still installable as one entry; the `_bm25/` package sits next to it. Users who copy the hook copy a directory of ≤320-LOC files instead of a 1837-LOC monolith — but each file is independently readable. If you'd prefer keeping the install footprint as a single concatenated `bm25-memory.py` (with the modules as dev-only sources), I can produce a build step that emits a flattened single-file artifact for distribution while keeping the modular sources for tests. +- **Each module has a focused unit-test file** under `tests/unit/` — no test reaches across two boundaries except the orchestrator-level ones. 
+- **Backward-compatible call sites**: `from _bm25.* import …` is the only change inside the orchestrator; external callers (Claude Code's hook contract) see no surface change. + +If any boundary feels wrong (e.g., `injection`/`output` could fold, `hooks_search` could merge into `code_search`), I'm happy to redraw before the PR. The split I'd defend hardest is `tokenizer` and `ranker` — those are the two pieces that previously diverged between eval and production paths and caused the misleading benchmark numbers you mentioned. + +### Co-maintain — yes, happy to + +I use CTX daily, so the maintenance overhead is already paid on my end. Concretely I can take: + +- Issue triage on weekday afternoons KST +- Hook reviews (BM25 / tokenizer / install machinery / golden fixtures) +- Release notes + version bumps in coordination with you + +I'd defer to you on retrieval algorithm direction, paper alignment, and anything benchmark-facing — those are your design decisions and I want to keep upstream's voice intact. If a `MAINTAINERS.md` with explicit areas of ownership works for you, I can draft one as part of PR-2. + +### Order of operations I'd suggest + +1. I open **PR-1 (tokenizer)** this week — small, self-contained, validates the workflow. +2. After PR-1 lands, **PR-2 (tests)** — needs PR-1's tokenizer to be the import target. +3. **PR-3 (deterministic sort)** in parallel with PR-2 — independent. +4. We discuss PR-4 (decomposition) once 1-3 are in; I'll redraw boundaries based on your feedback above. + +Let me know if the order or scope needs adjusting. Subtoken splitting stays out of all four PRs (separate cycle as agreed). 
+ +--- + +## Internal notes (do not paste) + +### 우리 측 메시지 의도 + +| 항목 | 의도 | +|---|---| +| sqlite_vec drop | jaytoone 이 0.3.14 에서 자체 처리한 것 인정 (중복 PR 회피) | +| PR-1 ~ PR-3 순서 | jaytoone 우선순위(P0=tokenizer / P1=tests / P2=deterministic sort) 그대로 반영 | +| Boundary 표 | "deliberate tradeoff" 우려에 module-by-module 정량 답변 — DAG 무사이클 + LOC 분포 + 단위 테스트 1:1 매핑 | +| Flattened single-file 옵션 | jaytoone 의 "single-file is easier to copy/audit" 명시 수용 — build step 으로 양립 가능 제시 | +| Co-maintain | 수락 + 영역 분담 명시 (algorithm/paper = jaytoone, hook/install/test = us) — boundary 의 정치적 명료화 | +| Subtoken 명시 제외 | 이전 라운드 합의 ("separate cycle, not in fork yet") 재확인 | + +### 발행 전 체크 + +- [ ] 위 본문에서 LOC 수치 (230, 240, 270 …) 는 추정치 — 실제 값으로 갱신 필요. 현재 `ls -la _bm25/` 결과(byte 단위)에서 도출 가능 +- [ ] `code_search.py:233` 라인 번호 PR-3 시점에 master 와 일치하는지 확인 (drift 가능) +- [ ] PR-1 시동 시점에 issue 본문에서 5-stage → 4-stage 로 plan 갱신했음을 cross-link + +### 후속 액션 + +1. 본 draft 사용자 검토 → 수정 후 issue 코멘트 발행 +2. PR-1 (tokenizer) 브랜치 시작 — `feat/upstream-pr1-tokenizer` +3. HANDOFF.md §7 (upstream) 의 5-stage plan 을 4-stage 로 갱신 + sqlite_vec dropped 사유 기록 diff --git a/docs/refactor/upstream-sync-2026-05-08.md b/docs/refactor/upstream-sync-2026-05-08.md new file mode 100644 index 0000000..b150fe8 --- /dev/null +++ b/docs/refactor/upstream-sync-2026-05-08.md @@ -0,0 +1,69 @@ +# Upstream sync inventory — 2026-05-08 + +> Trial merge 결과. master 변경 안 함 — 본 문서는 PR 작업 전략용 참고 자료. + +## upstream/master 11 신규 commits + +`upstream/master` (`fd84cf9`, 2026-05-08) — fork base `201c810` 이후 11 commits. + +Author = **Be2Jay** (jaytoone이 GitHub username 변경 — issue 답변자 'jaytoone'과 동일인). + +| Hash | Subject | 우리 작업 영향 | +|---|---|---| +| **08e262b** | `fix: Korean tokenizer gap in eval pipeline + 6 regression tests` (commit msg에 **`Related: hang-in/tunaCtx tokenizer.py confirms same pattern`**) | 🎯 **PR-1 핵심 motivation 일부 선반영** — `doc_retrieval_eval_v2.py` 만 fix됨. 우리는 4 사이트 추가 통합. 
| +| fd84cf9 | `docs: add CJK intent comment to production tokenize() regex` | 무관 (주석만) | +| 9cd2371 | `docs: fix HN item ID 47996700→48017090` | 무관 | +| 982c043 | `feat: market signal monitor` (`scripts/market-signals.py`) | 무관 | +| **b799aae** | `chore: batch commit accumulated session work` (benchmarks/hooks/cli/docs/plugin/hf_space) | 🔴 거대 batch — 광범위 충돌 원인 | +| ba7df3d | `fix: add sqlite-vec to deps + guard import in chat-memory.py` | 🟡 우리 fork sqlite_vec fallback과 충돌 | +| 80fe738 | `docs: restructure install section` | 🟡 README 충돌 | +| ff34059 | `feat: --reseed flag, sharing trigger, PyPI baseline` | 🟡 install 변경 | +| 3964d39 | `feat: seed vault.db with git history on install` | 🟡 install 변경 | +| 9fe7d6d | `fix: replace PID file guard with fcntl.flock in vec-daemon and bge-daemon` | 🟡 daemon 충돌 | +| 126f1d7 | `fix: update stale R@5=0.152 → 0.595 across docs` | ✅ issue#2 응답 commit — 우리도 처리 완료 | + +## Trial merge 결과 — 16개 파일 conflict + +worktree `/tmp/tunaCtx-trial-merge`에서 격리 실행. master 영향 없음. 
+ +| File | Hunks | 충돌 원인 | +|---|---|---| +| `.gitignore` | 1 | upstream에 `.playwright-mcp/` 등 추가, 우리는 `.omc/` 등 | +| `CLAUDE.md` | 1 | fork persona vs upstream R@5 갱신 | +| `LICENSE` | 1 | metadata (예: copyright holder line) 차이 추정 — 확인 필요 | +| `README.md` | 1 | fork persona vs upstream pip-first 재구성 | +| `benchmarks/eval/doc_retrieval_eval_v2.py` | 1 | **08e262b의 Korean tokenizer fix** vs 우리 Task C `_bm25` 통합 | +| `benchmarks/results/doc_retrieval_eval_v2.md` | 1 | 결과 갱신 차이 | +| `src/cli/install.py` | 1 | upstream `--reseed`/`vault seed` vs 우리 hash-based atomic install | +| `src/evaluator/coir_evaluator.py` | 1 | 양측 _bm25 통합 시점 차이 | +| `src/hooks/_ctx_telemetry.py` | 1 | telemetry 모듈 양측 변경 | +| `src/hooks/bge-daemon.py` | 1 | upstream `fcntl.flock` (9fe7d6d) vs 우리 PID guard | +| `src/hooks/bm25-memory.py` | 1 | upstream CJK 주석 + 통합 분해 vs 우리 1837→300 LOC orchestrator (PR-4 영역) | +| `src/hooks/chat-memory.py` | 2 | upstream sqlite-vec import guard (ba7df3d) vs 우리 fallback | +| `src/hooks/utility-rate.py` | 1 | utility-rate 양측 변경 | +| `src/hooks/vec-daemon.py` | 1 | upstream fcntl.flock vs 우리 vec-daemon 변경 | +| `src/retrieval/adaptive_trigger.py` | 1 | upstream eval 갱신 vs 우리 Task C _bm25 통합 | +| `src/retrieval/bm25_retriever.py` | 1 | 양측 _bm25 통합 시점 차이 | + +`b799aae` (거대 batch commit) 가 conflict의 8할 원인. 단일 파일에 여러 변경이 한꺼번에 들어와 line-level 충돌이 광범위. + +## 결론 — PR 작업 전략 + +**fork master ↔ upstream master 직접 머지는 비권고**. 16 conflict + 거대 batch commit 정렬 비용이 PR 가치를 초과. + +**권고: upstream/master 위에서 새 브랜치 분기 + 동등한 fix를 새로 commit** (PR 별). + +| PR | base | 작업 내용 | +|---|---|---| +| **PR-1** (tokenizer 통합) | `upstream/master` 위 `feat/upstream-pr1-tokenizer` | `_bm25/tokenizer.py` 신규 + 4 eval 사이트(g1_docs, g1_longterm, g2_paraphrase, bm25_retriever — bm25_retriever는 dedup 회귀로 제외) 통합. 
**08e262b의 doc_retrieval_eval_v2.py fix는 이미 머지됨이라 그 파일은 우리 PR에서 빠짐** | +| **PR-2** (tests) | `upstream/master` 위 `feat/upstream-pr2-tests` | 80 unit (PR-1 의존 분리) + 26 golden + golden runner | +| **PR-3** (deterministic sort) | `upstream/master` 위 `feat/upstream-pr3-determinism` | upstream의 `bm25-memory.py` monolith 안에서 sort 사이트 위치 찾아 동등 tiebreak 적용. fork의 `_bm25/ranker.py:49/79/153` ↔ upstream monolith line 매핑 필요 | + +**fork master는 그대로 유지** — fork persona/실험 코드/PR-4(decomposition) prep 영역. 향후 PR-4 머지 후 upstream과 자연스럽게 합류. + +## 다음 액션 + +- [ ] PR-1: `upstream/master` 분기 브랜치 생성, `_bm25/tokenizer.py` 추출 + eval 사이트 통합 patch +- [ ] PR-2: 80 unit + 26 golden 을 upstream monolith 호출 형태로 재작성 +- [ ] PR-3: upstream `bm25-memory.py` 안의 sort 사이트(L?, L?, L?) 찾아 tiebreak patch +- [ ] issue #1 회신 본문에서 "08e262b 이미 부분 적용됨" 명시 diff --git a/pyproject.toml b/pyproject.toml index b5fd488..fcc136d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,7 @@ packages = [ "ctx_retriever.data", "ctx_retriever.evaluator", "ctx_retriever.hooks", + "ctx_retriever.hooks._bm25", "ctx_retriever.retrieval", "ctx_retriever.trigger", "ctx_retriever.visualizer", @@ -60,3 +61,13 @@ packages = [ [tool.setuptools.package-data] "*" = ["*.json", "*.yaml"] "ctx_retriever.hooks" = ["*.py", "ctx-telemetry"] +"ctx_retriever.hooks._bm25" = ["*.py"] + +[tool.pytest.ini_options] +testpaths = ["tests/unit"] +pythonpath = [".", "src"] +addopts = "-q --strict-markers" +markers = [ + "slow: marks tests as slow (deselect with '-m \"not slow\"')", + "requires_subprocess: marks tests that invoke subprocess (external process)", +] diff --git a/scripts/verify_bm25_unified.py b/scripts/verify_bm25_unified.py new file mode 100644 index 0000000..200a656 --- /dev/null +++ b/scripts/verify_bm25_unified.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python3 +""" +verify_bm25_unified.py — sanity-check that eval and production use the same +canonical BM25 tokenizer/scorer after Task C unification. + +Checks: + 1. 
"""
verify_bm25_unified.py — sanity-check that eval and production use the same
canonical BM25 tokenizer/scorer after Task C unification.

Checks:
  1. adaptive_trigger._HAS_UNIFIED_TOKENIZER is True.
  2. tokenize() output is identical for the same input from both import paths.
  3. score_corpus_bm25() returns a numpy array with correct shape, and the
     relevant document outscores the noise document.
  4. AdaptiveTriggerRetriever builds its BM25 corpus with the unified tokenizer.

Exit 0 = all checks pass. Exit 1 = any check failed.
"""

import os
import sys
import tempfile

# Self-contained: ensure the project root is on sys.path so `src` is importable
# when running directly without PYTHONPATH (e.g. `python3 scripts/verify_bm25_unified.py`).
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
_PROJECT_ROOT = os.path.dirname(_SCRIPT_DIR)
if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)

# (text, drop_stopwords) pairs fed to both tokenizer import paths in check 2.
_TOKENIZE_SAMPLES = (
    ("BM25 retrieval search", False),
    ("한국어 검색 query test", True),
    ("AdaptiveTrigger retrieve", False),
)


def check(label, condition, detail=""):
    """Print one PASS/FAIL line and return the outcome as a real bool.

    Args:
        label: Short description of what was checked.
        condition: Any value; its truthiness decides PASS vs FAIL.
        detail: Optional text appended after the label.

    Returns:
        bool(condition). Callers accumulate results with ``ok &= check(...)``;
        coercing here keeps that bitwise AND correct for truthy non-bool
        values (``True & 2`` would otherwise evaluate to 0).
    """
    passed = bool(condition)
    status = "PASS" if passed else "FAIL"
    print(f" [{status}] {label}" + (f": {detail}" if detail else ""))
    return passed


def _check_unified_flag():
    """Check 1: the eval pipeline reports that it uses the unified tokenizer."""
    print("1. adaptive_trigger._HAS_UNIFIED_TOKENIZER")
    try:
        from src.retrieval.adaptive_trigger import _HAS_UNIFIED_TOKENIZER
        return check("flag is True", _HAS_UNIFIED_TOKENIZER,
                     str(_HAS_UNIFIED_TOKENIZER))
    except Exception as e:
        # Any import/attribute failure is itself a failed check, not a crash.
        return check("import adaptive_trigger", False, str(e))


def _check_tokenize_parity():
    """Check 2: both import paths tokenize the sample inputs identically."""
    print("\n2. tokenize() identical output from both import paths")
    ok = True
    try:
        from src.hooks._bm25.tokenizer import tokenize as tok_hooks
        from src.retrieval.adaptive_trigger import _bm25_tokenize as tok_eval

        for text, drop_sw in _TOKENIZE_SAMPLES:
            out_hooks = tok_hooks(text, drop_stopwords=drop_sw)
            out_eval = tok_eval(text, drop_stopwords=drop_sw)
            ok &= check(
                f"tokenize({text!r:.30}, drop_sw={drop_sw})",
                out_hooks == out_eval,
                f"hooks={out_hooks} eval={out_eval}" if out_hooks != out_eval else f"{out_hooks}",
            )
    except Exception as e:
        ok &= check("tokenize import/call", False, str(e))
    return ok


def _check_scorer():
    """Check 3: score_corpus_bm25() yields an ndarray that ranks signal over noise."""
    print("\n3. score_corpus_bm25() returns correct numpy array")
    ok = True
    try:
        import numpy as np
        from src.hooks._bm25.ranker import score_corpus_bm25
        from src.hooks._bm25.tokenizer import tokenize

        corpus_texts = [
            "BM25 tokenizer retrieval search",
            "unrelated noise document here",
            "CTX hook plugin memory recall",
        ]
        tokenized = [tokenize(t) for t in corpus_texts]
        q_tokens = tokenize("BM25 retrieval", drop_stopwords=True)
        scores = score_corpus_bm25(tokenized, q_tokens)

        ok &= check("returns ndarray", isinstance(scores, np.ndarray),
                    type(scores).__name__)
        ok &= check("shape matches corpus", len(scores) == len(corpus_texts),
                    f"len={len(scores)}")
        ok &= check("doc[0] > doc[1] (relevance order)",
                    scores[0] > scores[1],
                    f"scores={scores.round(3)}")
    except Exception as e:
        ok &= check("score_corpus_bm25", False, str(e))
    return ok


def _check_retriever_corpus():
    """Check 4: AdaptiveTriggerRetriever indexes a temp project with the unified tokenizer."""
    print("\n4. AdaptiveTriggerRetriever builds corpus with unified tokenizer")
    ok = True
    try:
        from src.retrieval.adaptive_trigger import AdaptiveTriggerRetriever

        with tempfile.TemporaryDirectory() as tmpdir:
            # Write a minimal Python file to index
            src_file = os.path.join(tmpdir, "example.py")
            with open(src_file, "w") as f:
                f.write("def bm25_retrieval():\n    \"\"\"Canonical tokenizer test.\"\"\"\n    pass\n")
            retriever = AdaptiveTriggerRetriever(tmpdir)
            ok &= check("corpus non-empty", len(retriever._bm25_corpus) > 0,
                        f"len={len(retriever._bm25_corpus)}")
            ok &= check("BM25 index built", retriever.bm25 is not None)
            flat_tokens = [t for tokens in retriever._bm25_corpus for t in tokens]
            # Porter stemmer is opt-in (CTX_STEM=1, requires nltk).
            # Check for the stemmed form only when _STEMMER is active.
            from src.hooks._bm25.tokenizer import _STEMMER
            if _STEMMER is not None:
                ok &= check("porter stem present in corpus",
                            any(len(t) < 10 and t.startswith("retriev") for t in flat_tokens),
                            f"sample={flat_tokens[:10]}")
            else:
                # Informational line only; deliberately not folded into `ok`.
                check("porter stem (nltk absent — skipped)", True, "nltk not installed")
    except Exception as e:
        ok &= check("AdaptiveTriggerRetriever init", False, str(e))
    return ok


def main():
    """Run all four checks and return the process exit code (0 = all passed)."""
    print("=== BM25 Unification Verification ===\n")

    # Build the full list first so every check runs and reports even after an
    # earlier one has failed.
    outcomes = [
        step()
        for step in (
            _check_unified_flag,
            _check_tokenize_parity,
            _check_scorer,
            _check_retriever_corpus,
        )
    ]
    ok = all(outcomes)

    print(f"\n{'ALL CHECKS PASSED' if ok else 'SOME CHECKS FAILED'}")
    return 0 if ok else 1


if __name__ == "__main__":
    sys.exit(main())
def _file_sha256(path: Path) -> str:
    """Return the hex SHA-256 digest of the contents of *path*."""
    return hashlib.sha256(path.read_bytes()).hexdigest()


def _replace_atomically(src_file: Path, dst_file: Path, chmod: int) -> None:
    """Install src_file at dst_file through a sibling temp file and a rename.

    A reader of dst_file therefore sees either the old content or the complete
    new content, never a partially written file.
    """
    # Leading dot + ".tmp_ctx" suffix keeps the temp file out of "*.py" globs.
    tmp = dst_file.with_name(f".{dst_file.name}.tmp_ctx")
    try:
        shutil.copy2(src_file, tmp)
        tmp.chmod(chmod)
        tmp.replace(dst_file)
    finally:
        # No-op after a successful rename; removes the partial copy otherwise.
        tmp.unlink(missing_ok=True)


def _copy_hook_file(
    src_file: Path,
    dst_file: Path,
    *,
    force: bool,
    no_update: bool,
    dry_run: bool,
    chmod: int,
) -> str:
    """Copy src_file → dst_file according to the update policy.

    Args:
        src_file: File shipped with the package.
        dst_file: Install location.
        force: Overwrite an existing file without comparing hashes.
        no_update: Leave an existing file untouched.
        dry_run: Decide and report the action but touch no files.
        chmod: Permission bits applied to the installed file.

    Returns:
        "copied"    — dst_file did not exist and was (or would be) created.
        "skipped"   — dst_file exists and no_update is set.
        "unchanged" — dst_file exists with identical content (and not force).
        "updated"   — dst_file was (or would be) overwritten; the previous
                      content is first saved next to it as
                      ``<stem>.backup_<YYYYmmdd_HHMMSS><suffix>``.

    Raises:
        OSError: Propagated from the underlying file operations.
    """
    if not dst_file.is_file():
        # New file — always copy
        if not dry_run:
            _replace_atomically(src_file, dst_file, chmod)
        return "copied"

    if no_update:
        return "skipped"

    if not force and _file_sha256(src_file) == _file_sha256(dst_file):
        return "unchanged"

    # Hash differs (or --force) → update with timestamped backup
    if not dry_run:
        ts = time.strftime("%Y%m%d_%H%M%S")
        backup = dst_file.with_suffix(f".backup_{ts}{dst_file.suffix}")
        shutil.copy2(dst_file, backup)
        # NOTE(review): the rename replaces dst_file itself; if dst_file is a
        # symlink the link is replaced rather than its target — confirm that
        # is acceptable for symlinked dev installs.
        _replace_atomically(src_file, dst_file, chmod)
    return "updated"
+ """ src = _pkg_hooks_dir() if src is None: - return 0, 0, ["Package hooks dir not found — run `pip install ctx-retriever` first"] + return 0, 0, 0, ["Package hooks dir not found — run `pip install ctx-retriever` first"] CLAUDE_HOOKS_DIR.mkdir(parents=True, exist_ok=True) - copied, skipped, errors = 0, 0, [] + copied, updated, skipped, errors = 0, 0, 0, [] # Hook filenames we care about (from CTX_HOOKS + telemetry helper) hook_files = [spec[0] for spec in CTX_HOOKS] + ["_ctx_telemetry.py", "utility-rate.py"] @@ -117,21 +173,49 @@ def step_copy_hooks(dry_run: bool = False) -> tuple[int, int, list[str]]: src_file = src / fname dst_file = CLAUDE_HOOKS_DIR / fname if not src_file.is_file(): - continue # optional file (utility-rate.py not in CTX_HOOKS list) - if dst_file.is_file(): - skipped += 1 + continue # optional file + try: + action = _copy_hook_file( + src_file, dst_file, + force=force, no_update=no_update, + dry_run=dry_run, chmod=0o755, + ) + except OSError as e: + errors.append(f"copy {fname}: {e}") continue + if action == "copied": + copied += 1 + elif action == "updated": + updated += 1 + else: # "unchanged" or "skipped" + skipped += 1 + + # Copy _bm25/ sub-package (required by bm25-memory.py at runtime). 
def _cleanup_hook_files(
    force: bool = False,
    dry_run: bool = False,
) -> dict[str, list[str]]:
    """Delete the CTX hook files and the _bm25/ package from ~/.claude/hooks/.

    Args:
        force: Remove everything without comparing against the package source.
        dry_run: Report what would be removed but delete nothing.

    Returns:
        A dict with three lists of names:
            "removed"   — deleted (or, in dry-run, would be deleted).
            "kept"      — left in place: a hook file whose SHA-256 differs from
                          the packaged copy, or "_bm25/" when it holds unknown
                          or modified *.py files.
            "not_found" — absent before the call.

    The hash comparison only happens when force is False and the packaged
    hooks directory can be located; otherwise files are removed directly.
    A "kept" decision is also announced on stderr.
    """
    outcome: dict[str, list[str]] = {"removed": [], "kept": [], "not_found": []}

    pkg_dir = _pkg_hooks_dir()
    verify = pkg_dir is not None and not force

    # Full list of hook files CTX installs
    targets = [spec[0] for spec in CTX_HOOKS]
    targets += ["_ctx_telemetry.py", "utility-rate.py"]

    for fname in targets:
        installed = CLAUDE_HOOKS_DIR / fname
        if not installed.exists():
            outcome["not_found"].append(fname)
            continue

        if verify:
            packaged = pkg_dir / fname
            user_modified = (
                packaged.is_file()
                and _file_sha256(installed) != _file_sha256(packaged)
            )
            if user_modified:
                outcome["kept"].append(fname)
                print(
                    f" kept {fname} (user-modified; use --force to override)",
                    file=sys.stderr,
                )
                continue

        if not dry_run:
            installed.unlink()
        outcome["removed"].append(fname)

    # Handle _bm25/ sub-directory
    bm25_dir = CLAUDE_HOOKS_DIR / "_bm25"
    if not bm25_dir.is_dir():
        outcome["not_found"].append("_bm25/")
        return outcome

    reasons: list[str] = []
    if verify:
        pkg_bm25 = pkg_dir / "_bm25"
        known: set[str] = set()
        if pkg_bm25.is_dir():
            known = {p.name for p in pkg_bm25.glob("*.py")}

        installed_py = list(bm25_dir.glob("*.py"))
        n_extra = sum(1 for p in installed_py if p.name not in known)
        n_modified = 0
        for p in installed_py:
            if p.name not in known:
                continue
            reference = pkg_bm25 / p.name
            if reference.exists() and _file_sha256(p) != _file_sha256(reference):
                n_modified += 1

        if n_extra:
            reasons.append(f"{n_extra} unknown file(s)")
        if n_modified:
            reasons.append(f"{n_modified} user-modified file(s)")

    if reasons:
        outcome["kept"].append("_bm25/")
        print(
            f" kept _bm25/ ({', '.join(reasons)}; use --force to override)",
            file=sys.stderr,
        )
    else:
        # Clean directory, force=True, or no package source → remove it whole.
        if not dry_run:
            shutil.rmtree(bm25_dir)
        outcome["removed"].append("_bm25/")

    return outcome
+ ) + + return 0 def cmd_status(args: argparse.Namespace) -> int: @@ -377,7 +572,13 @@ def main() -> int: p.add_argument("--dry-run", action="store_true", help="Show what would change; touch no files.") p.add_argument("--uninstall", action="store_true", - help="Remove CTX hook registrations from settings.json.") + help="Remove CTX hook registrations from settings.json and clean up hook files.") + p.add_argument("--force", action="store_true", + help="With --uninstall: remove hook files unconditionally (skip hash check).") + p.add_argument("--force-hooks", action="store_true", + help="Overwrite existing hook files unconditionally (no hash check).") + p.add_argument("--no-update-hooks", action="store_true", + help="Skip existing hook files even if outdated (legacy behaviour).") p.add_argument("command", nargs="?", default=None, help="Optional: 'status' to check current install state.") args = p.parse_args() diff --git a/src/cli/settings_patcher.py b/src/cli/settings_patcher.py index 2498477..3452704 100644 --- a/src/cli/settings_patcher.py +++ b/src/cli/settings_patcher.py @@ -53,17 +53,19 @@ def _load(path: Path) -> dict: def _save_atomic(path: Path, data: dict) -> str: - """Write data to path atomically (temp file + rename). Returns backup path.""" + """Write data to path atomically (temp file + rename). 
Returns backup path or ''.""" path.parent.mkdir(parents=True, exist_ok=True) ts = time.strftime("%Y%m%d_%H%M%S") backup = path.with_suffix(f".backup_{ts}.json") + backup_made = False if path.exists(): shutil.copy2(path, backup) + backup_made = True tmp = path.with_suffix(".tmp_ctx") tmp.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8") os.replace(tmp, path) - return str(backup) if path.exists() else "" + return str(backup) if backup_made else "" def _cmd_in_settings(settings: dict, command: str) -> bool: diff --git a/src/cli/telemetry.py b/src/cli/telemetry.py index f74ecb5..2085d82 100644 --- a/src/cli/telemetry.py +++ b/src/cli/telemetry.py @@ -820,6 +820,9 @@ def cmd_cluster(args): MAX_FILES = 300 def _tokenize(text: str) -> list[str]: + # Intentionally not the canonical _bm25.tokenize (PR-1 out-of-scope): + # this is for source-file identifier-frequency stats, not BM25 ranking. + # Excludes 1-2 char tokens and digit-only/non-ASCII — IDF doesn't apply. return [t.lower() for t in re.findall(r'[a-zA-Z][a-zA-Z0-9]{2,28}', text)] token_counts: dict[str, int] = {} diff --git a/src/evaluator/coir_evaluator.py b/src/evaluator/coir_evaluator.py index 893c922..7f4484d 100644 --- a/src/evaluator/coir_evaluator.py +++ b/src/evaluator/coir_evaluator.py @@ -182,18 +182,20 @@ def evaluate_bm25( corpus: List[COIRCorpusEntry], ) -> COIRResult: """Evaluate BM25 retrieval on COIR benchmark.""" - from rank_bm25 import BM25Okapi + from src.hooks._bm25.ranker import score_corpus_bm25 - # Tokenize corpus + # Tokenize corpus — simple whitespace split preserves COIR's code-search + # vocabulary (tokens like "def", "return", "self" are meaningful signal here). 
tokenized_corpus = [doc.code.lower().split() for doc in corpus] - bm25 = BM25Okapi(tokenized_corpus) rankings = [] per_query = [] for q in queries: tokenized_query = q.query_text.lower().split() - scores = bm25.get_scores(tokenized_query) + scores = score_corpus_bm25(tokenized_corpus, tokenized_query) + if scores is None: + scores = np.zeros(len(corpus)) ranked_indices = np.argsort(scores)[::-1].tolist() rankings.append(ranked_indices) diff --git a/src/hooks/_bm25/__init__.py b/src/hooks/_bm25/__init__.py new file mode 100644 index 0000000..f05ae87 --- /dev/null +++ b/src/hooks/_bm25/__init__.py @@ -0,0 +1,57 @@ +"""Public API for the _bm25 package. + +For new code, prefer the package-level imports below. Existing submodule +imports (e.g. ``from _bm25.tokenizer import tokenize``) remain supported +and are not deprecated. + +Examples:: + + from _bm25 import tokenize, score_corpus_bm25 + from _bm25 import bm25_rank_decisions, hybrid_search_docs + +Module-level state (e.g. AUTO_TUNE, last_retrieval_scores) is intentionally +not re-exported here — access it via submodule path +(``from _bm25.autotune import AUTO_TUNE``) to avoid Python module-binding +surprises. + +Also consumed by src/retrieval/adaptive_trigger.py (eval pipeline) so that +eval and production share a single canonical tokenizer/scorer (Task C). 
+""" + +# Tokenization +from .tokenizer import tokenize, expand_query_tokens + +# BM25 ranking primitives +from .ranker import ( + score_corpus_bm25, + bm25_rank_decisions, + dense_rank_decisions, + hybrid_rank_decisions, + rrf_merge, +) + +# Decision corpus (G1) +from .corpus import get_decision_corpus, get_git_head, build_decision_corpus + +# Semantic rerank +from .rerank import semantic_rerank_filter + +# Document / code / hooks search +from .docs_search import bm25_search_docs, hybrid_search_docs +from .code_search import search_files_by_grep, search_graph_for_prompt +from .hooks_search import search_hooks_files + +# Output emission +from .output import emit_output + +__all__ = [ + "tokenize", "expand_query_tokens", + "score_corpus_bm25", "bm25_rank_decisions", "dense_rank_decisions", + "hybrid_rank_decisions", "rrf_merge", + "get_decision_corpus", "get_git_head", "build_decision_corpus", + "semantic_rerank_filter", + "bm25_search_docs", "hybrid_search_docs", + "search_files_by_grep", "search_graph_for_prompt", + "search_hooks_files", + "emit_output", +] diff --git a/src/hooks/_bm25/autotune.py b/src/hooks/_bm25/autotune.py new file mode 100644 index 0000000..09a8e6d --- /dev/null +++ b/src/hooks/_bm25/autotune.py @@ -0,0 +1,66 @@ +""" +autotune.py — Auto-tune parameter reader for bm25-memory. + +Reads ~/.claude/ctx-auto-tune.json (written by ctx-telemetry tune). 
"""
autotune.py — Auto-tune parameter reader for bm25-memory.

Reads ~/.claude/ctx-auto-tune.json (written by ctx-telemetry tune).
Exposes:
    AUTO_TUNE: dict          — raw recommendations (empty if file absent/invalid)
    AUTO_TUNE_ACTIVE: bool   — True when recommendations are loaded and valid
    get_g1_top_k(prompt, auto_tune) -> int
    get_g2d_top_k(prompt, auto_tune) -> int
"""
import json
from pathlib import Path

_AUTO_TUNE_PATH = Path.home() / ".claude" / "ctx-auto-tune.json"

# The temporal top-k reduction applies only above this utility gap.
_TEMPORAL_GAP_THRESHOLD = 0.10

AUTO_TUNE: dict = {}
AUTO_TUNE_ACTIVE: bool = False

try:
    if _AUTO_TUNE_PATH.exists():
        _raw = json.loads(_AUTO_TUNE_PATH.read_text(encoding="utf-8"))
        if isinstance(_raw, dict):
            AUTO_TUNE = _raw
            AUTO_TUNE_ACTIVE = True
except Exception:
    # Best-effort: an unreadable or malformed tuning file must not break the
    # import of this module; the defaults above stay in place.
    pass


def _is_temporal_adjustment_due(prompt: str, auto_tune: dict) -> bool:
    """Return True when the prompt is TEMPORAL and the recorded gap exceeds the threshold.

    The gap is tested first so the `.corpus` import and the classifier call
    are skipped whenever they cannot change the outcome. A non-numeric gap
    (e.g. JSON null) is treated as "no adjustment" instead of raising.
    """
    gap = auto_tune.get("temporal_utility_gap", 0)
    if not isinstance(gap, (int, float)) or not gap > _TEMPORAL_GAP_THRESHOLD:
        return False
    # NOTE(review): assumes _classify_query_type has no side effects the
    # callers rely on — confirm before depending on the skipped call.
    from .corpus import _classify_query_type
    return _classify_query_type(prompt) == "TEMPORAL"


def _confident_project_type(auto_tune: dict) -> str:
    """Return the project-type hint if its confidence is HIGH or MEDIUM, else ""."""
    if auto_tune.get("project_type_confidence", "LOW") in ("HIGH", "MEDIUM"):
        return auto_tune.get("project_type_hint", "")
    return ""


def get_g1_top_k(prompt: str, auto_tune: dict) -> int:
    """Compute G1 top_k based on auto-tune recommendations and query type.

    Args:
        prompt: User prompt; classified only when the temporal gap is large enough.
        auto_tune: Recommendation dict (may be empty).

    Returns:
        7 by default; 5 for TEMPORAL prompts when ``temporal_utility_gap`` > 0.10;
        then +1 (capped at 10) for ``python_ml`` or -1 (floored at 4) for
        ``nextjs_react`` when the project-type confidence is HIGH or MEDIUM.
    """
    top_k = 7
    if not auto_tune:
        return top_k
    if _is_temporal_adjustment_due(prompt, auto_tune):
        top_k = 5
    proj_type = _confident_project_type(auto_tune)
    if proj_type == "python_ml":
        top_k = min(top_k + 1, 10)
    elif proj_type == "nextjs_react":
        top_k = max(top_k - 1, 4)
    return top_k


def get_g2d_top_k(prompt: str, auto_tune: dict) -> int:
    """Compute G2-DOCS top_k based on auto-tune recommendations and query type.

    Args:
        prompt: User prompt; classified only when the temporal gap is large enough.
        auto_tune: Recommendation dict (may be empty).

    Returns:
        5 by default; 3 for TEMPORAL prompts when ``temporal_utility_gap`` > 0.10;
        then +1 (capped at 8) for ``nextjs_react`` or -1 (floored at 3) for
        ``rust_systems`` when the project-type confidence is HIGH or MEDIUM.
    """
    top_k = 5
    if not auto_tune:
        return top_k
    if _is_temporal_adjustment_due(prompt, auto_tune):
        top_k = 3
    proj_type = _confident_project_type(auto_tune)
    if proj_type == "nextjs_react":
        top_k = min(top_k + 1, 8)
    elif proj_type == "rust_systems":
        top_k = max(top_k - 1, 3)
    return top_k
"""
code_search.py — G2 code file discovery for bm25-memory.

Provides:
    extract_keywords(prompt) -> list[str]
    find_db(project_dir) -> str|None
    log_retrieved_nodes(project_dir, session_id, prompt, block, items)
    check_and_trigger_reindex(project_dir, db_path) -> str|None
    search_graph_for_prompt(db_path, keywords, limit=5) -> list[tuple]
    search_files_by_grep(project_dir, keywords, limit=5) -> list[str]
"""
import json
import os
import re
import subprocess
import time
from contextlib import closing

# ── Code keyword / mapping ───────────────────────────────────────────────────

_STOP_WORDS = {
    "the","a","an","is","are","was","were","be","been","have","has","had",
    "do","does","did","will","would","could","should","may","might","can",
    "to","of","in","for","on","with","at","by","from","as","into",
    "it","this","that","i","you","he","she","we","they","me",
    "and","or","but","not","no","if","then","else","when","where","how","what",
    "해줘","해","바람","좀","것","수","있","없","하다","되다","이","그","저","뭐","어떻게",
    "기능","작업","관련","파일","코드","문서","수정","추가","변경","확인","돌려봐",
    "올려","실행","해봐","분석","개선","확인해",
}
# Korean prompt word → comma-separated English search terms.
_KO_EN = {
    "검색": "search,retrieve,find", "엔진": "engine,retriever",
    "벤치마크": "benchmark,eval", "평가": "eval,evaluate",
    "트리거": "trigger", "분류": "classify,classifier",
    "밀도": "dense,density", "테스트": "test",
    "결과": "result", "스코어": "score",
    "그래프": "graph", "다운스트림": "downstream",
    "외부": "external,reeval", "정확도": "accuracy,precision",
    "이메일": "email,mail", "발송": "send,outreach",
    "대시보드": "dashboard,admin", "구독": "subscription,subscribe",
    "인증": "auth,authenticate", "로그인": "login,signin",
    "사용자": "user,member", "데이터베이스": "database,schema",
    "함수": "function,handler", "컴포넌트": "component",
    "페이지": "page,route", "설정": "config,settings",
    "환경": "env,environment", "서버": "server,backend",
    "실험": "experiment,trial", "배포": "deploy,deployment",
    "오류": "error,exception", "버그": "bug,error",
    "성능": "performance,latency", "최적화": "optimize,cache",
    "알림": "notification,alert", "권한": "permission,auth",
    "훅": "hook", "메모리": "memory", "인덱스": "index",
}
_CODE_EXT = {
    ".py", ".js", ".ts", ".tsx", ".jsx", ".go", ".rs", ".java",
    ".sh", ".bash", ".yaml", ".yml", ".toml", ".sql", ".css", ".html",
    ".c", ".cpp", ".h", ".rb", ".php", ".swift", ".kt",
}
_SKIP_PREFIXES = (".omc/", "docs/", "benchmarks/results/", "tests/fixtures/")

_REINDEX_LOCK = os.path.expanduser("~/.cache/codebase-memory-mcp/.reindex_in_progress")
_STALE_THRESHOLD_HOURS = 24

# ASCII identifiers of 3+ chars, or runs of 2+ Hangul syllables.
_WORD_RE = re.compile(r'[a-zA-Z_][a-zA-Z0-9_]{2,}|[가-힣]{2,}')
_HANGUL_RE = re.compile(r'[가-힣]')
_MAX_KEYWORDS = 8


# ── Keyword extraction ────────────────────────────────────────────────────────

def extract_keywords(prompt):
    """Extract meaningful keywords from prompt; expand Korean→English.

    Stop words are dropped (case-insensitively), Korean words listed in
    _KO_EN are replaced by their English terms, everything else is kept as
    written. At most 8 keywords are returned, in prompt order.
    """
    keywords = []
    for word in _WORD_RE.findall(prompt):
        if word.lower() in _STOP_WORDS:
            continue
        expansion = _KO_EN.get(word)  # all _KO_EN keys are Korean words
        if expansion is not None:
            keywords.extend(expansion.split(","))
        else:
            keywords.append(word)
    return keywords[:_MAX_KEYWORDS]


# ── DB discovery ─────────────────────────────────────────────────────────────

def find_db(project_dir):
    """Locate codebase-memory-mcp SQLite DB for this project.

    Looks for ``<slug>.db`` in ~/.cache/codebase-memory-mcp, where the slug is
    project_dir with "/" replaced by "-"; otherwise falls back to the first
    ``*.db`` (in sorted order) whose name contains the project directory name.

    Returns:
        Path of the DB file, or None when nothing matches.
    """
    cache_dir = os.path.expanduser("~/.cache/codebase-memory-mcp")
    if not os.path.isdir(cache_dir):
        return None

    # A trailing "/" would append "-" to the slug and make the basename empty,
    # and an empty basename is a substring of every file name.
    project_dir = project_dir.rstrip("/")
    slug = project_dir.replace("/", "-").lstrip("-")
    db_path = os.path.join(cache_dir, f"{slug}.db")
    if os.path.exists(db_path):
        return db_path

    base = os.path.basename(project_dir).lower()
    if not base:
        return None
    # Sorted so the fallback pick does not depend on directory listing order.
    for fname in sorted(os.listdir(cache_dir)):
        if fname.endswith(".db") and base in fname.lower():
            return os.path.join(cache_dir, fname)
    return None


# ── Citation probe ───────────────────────────────────────────────────────────

def log_retrieved_nodes(project_dir, session_id, prompt, block, items):
    """
    Append a retrieval event to .omc/retrieval_log.jsonl.

    Args:
        project_dir: project root path
        session_id:  Claude session ID (from input_data)
        prompt:      user prompt (first 120 chars stored)
        block:       "g1_decisions" | "g2_docs" | "g2_prefetch" | "g2_hooks"
        items:       list of dicts, each with at minimum {"id": str, "text": str}
                     g1: {"id": hash, "text": subject, "date": date}
                     g2_docs: {"id": filename, "text": unit_preview}
                     g2_prefetch: {"id": fpath, "text": f"{label}:{name}"}

    Only the first 10 items are stored. Logging is best-effort: any failure
    (including a missing .omc/ directory, which is not created here) is
    swallowed so that logging can never break retrieval.
    """
    if not items:
        return
    try:
        log_path = os.path.join(project_dir, ".omc", "retrieval_log.jsonl")
        entry = {
            "ts": time.time(),
            "session_id": session_id,
            "prompt_prefix": prompt[:120],
            "block": block,
            "items": items[:10],
        }
        with open(log_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")
    except Exception:
        pass


# ── Staleness check + reindex ─────────────────────────────────────────────────

def _touch_reindex_lock():
    """Create/refresh the reindex lock file; failures are ignored (best-effort)."""
    try:
        os.makedirs(os.path.dirname(_REINDEX_LOCK), exist_ok=True)
        with open(_REINDEX_LOCK, "w"):
            pass
    except OSError:
        pass


def check_and_trigger_reindex(project_dir, db_path):
    """
    Check if codebase-memory-mcp DB is stale (>24h). If so, spawn an incremental
    reindex in the background (non-blocking). Returns a warning string if stale,
    or None if fresh (or if the DB's mtime cannot be read).
    """
    try:
        age_hours = (time.time() - os.path.getmtime(db_path)) / 3600
    except (OSError, TypeError):
        return None

    if age_hours < _STALE_THRESHOLD_HOURS:
        return None

    age_str = f"{age_hours:.0f}h" if age_hours < 48 else f"{age_hours/24:.1f}d"

    # A lock younger than 10 minutes means a reindex was spawned recently.
    try:
        lock_age_min = (time.time() - os.path.getmtime(_REINDEX_LOCK)) / 60
        if lock_age_min < 10:
            return f"⚠ G2-CODE DB stale ({age_str}) — reindex already running"
    except OSError:
        pass  # no lock (or unreadable): fall through and trigger

    try:
        args = json.dumps({"repo_path": project_dir, "mode": "fast"})
        subprocess.Popen(
            ["codebase-memory-mcp", "cli", "index_repository", args],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            start_new_session=True,
        )
    except Exception:
        return f"⚠ G2-CODE DB stale ({age_str}) — run: codebase-memory-mcp cli index_repository to reindex"

    # The spawn succeeded; a lock-write failure must not be reported as a
    # failed trigger, so the lock is written separately and best-effort.
    _touch_reindex_lock()
    return f"⚠ G2-CODE DB stale ({age_str}) — auto-reindex triggered (fast mode, background)"


# ── Graph and grep search ─────────────────────────────────────────────────────

def search_graph_for_prompt(db_path, keywords, limit=5):
    """Query codebase graph nodes matching keywords.

    For each keyword, up to 3 Function/Method/Class nodes whose name contains
    it are taken (shortest names first); while fewer than `limit` results have
    been collected, up to 2 Module/File nodes whose path contains it are added.

    Returns:
        Up to `limit` unique (label, name, file_path) rows, or [] when there
        are no keywords or the query fails for any reason.
    """
    if not keywords:
        return []
    try:
        import sqlite3  # lazy import, as in the rest of this hook module
        results, seen = [], set()

        def _collect(rows):
            for row in rows:
                key = (row[1], row[2])
                if key not in seen:
                    seen.add(key)
                    results.append(row)

        # closing() guarantees the connection is released even when a query raises.
        with closing(sqlite3.connect(db_path)) as db:
            for kw in keywords:
                if len(results) >= limit:
                    break  # further matches would be truncated below anyway
                _collect(db.execute(
                    "SELECT DISTINCT label, name, file_path FROM nodes "
                    "WHERE name LIKE ? AND label IN ('Function','Method','Class') "
                    "ORDER BY length(name) ASC LIMIT ?",
                    (f"%{kw}%", 3),
                ).fetchall())
                if len(results) < limit:
                    _collect(db.execute(
                        "SELECT DISTINCT label, name, file_path FROM nodes "
                        "WHERE file_path LIKE ? AND label IN ('Module','File') "
                        "ORDER BY length(file_path) ASC LIMIT ?",
                        (f"%{kw}%", 2),
                    ).fetchall())
        return results[:limit]
    except Exception:
        # Best-effort: a missing table or unreadable DB yields no results.
        return []


def search_files_by_grep(project_dir, keywords, limit=5):
    """Fallback: git grep -c to rank files by keyword match count.

    Only the first 4 non-Korean keywords of length >= 4 are used. Files are
    ordered by descending match count, then path; only code files outside the
    skipped prefixes are returned (at most `limit`).
    """
    long_kws = [k for k in keywords if len(k) >= 4 and not _HANGUL_RE.match(k)]
    if not long_kws:
        return []
    try:
        pattern = "|".join(re.escape(k) for k in long_kws[:4])
        # -e marks the next argument as the pattern explicitly.
        r = subprocess.run(
            ["git", "grep", "-c", "-E", "-i", "-e", pattern],
            cwd=project_dir, capture_output=True, text=True, timeout=3,
        )
    except (OSError, subprocess.SubprocessError):
        return []
    if r.returncode != 0:
        return []

    scored = []
    for line in r.stdout.strip().split("\n"):
        if not line.strip():
            continue
        try:
            fpath, count = line.rsplit(":", 1)
            scored.append((int(count), fpath.strip()))
        except ValueError:
            continue
    scored.sort(key=lambda x: (-x[0], x[1]))

    code_ext = tuple(_CODE_EXT)
    code = [
        f for _, f in scored
        if f.endswith(code_ext) and not f.startswith(_SKIP_PREFIXES)
    ]
    return code[:limit]
+ +Provides: + get_git_head(project_dir) -> str|None + build_decision_corpus(project_dir, n=500) -> list[dict] + embed_corpus_items(corpus) -> int (vec-daemon, modifies in-place) + get_decision_corpus(project_dir) -> list[dict] + _classify_query_type(prompt) -> str (TEMPORAL/KEYWORD/SEMANTIC) +""" +import json +import re +import subprocess +from pathlib import Path + +from .rerank import vec_embed as _vec_embed + +# ── Decision commit detection ──────────────────────────────────────────────── + +_CONV_PREFIXES = ( + "feat:", "fix:", "refactor:", "perf:", "security:", "design:", "test:", + "feat(", "fix(", "refactor(", "perf(", +) +_VERSION_RE = re.compile(r"^v\d+\.\d+") +_DECISION_KEYWORDS = ( + "pivot", "revert", "dead-end", "rejected", "chose", "switched", + "CONVERGED", "failed", "success", "fix", "improvement", + "benchmark", "eval", "decision", "iter", +) +_NOISE_PREFIXES = ("# ", "wip:", "merge ", 'revert "') +_STRICT_VERSION_RE = re.compile(r"^v\d+\.\d+\.\d+") +_OMC_ITER_RE = re.compile(r"^(omc-live|live-inf)\s+iter", re.IGNORECASE) +_EMBEDDED_DECISION_RE = re.compile( + r"\s[-—]\s*(feat|fix|refactor|perf|security|design|implement|add|remove|replace|switch|migrate)", + re.IGNORECASE, +) +_YYYYMMDD_RE = re.compile(r"^\d{8}\s") # CTX-style: "20260408 G1 temporal..." 
+ + +def _is_structural_noise(subject): + s = subject.strip() + if _OMC_ITER_RE.match(s): + return True + if _STRICT_VERSION_RE.match(s): + return not bool(_EMBEDDED_DECISION_RE.search(s)) + return False + + +def _is_decision(subject): + """Detect decision commits: conventional, version-tagged, YYYYMMDD, or keyword.""" + s = subject.strip() + if not s: + return False + sl = s.lower() + if any(sl.startswith(p) for p in _NOISE_PREFIXES): + return False + if any(sl.startswith(p) for p in _CONV_PREFIXES): + return True + if _VERSION_RE.match(s): + return True + if _YYYYMMDD_RE.match(s): # CTX-style date-prefixed commits + return True + return any(kw.lower() in sl for kw in _DECISION_KEYWORDS) + + +# ── Query-type classification (for retrieval_event schema v1.1) ───────────── + +_TEMPORAL_KW = frozenset([ + "when", "history", "timeline", "progression", "what happened", "progress", + "previously", "before", "after", "last time", "since", "ago", "recent", + "changed", "evolution", "how long", "session", "yesterday", "last week", + "진행", "역사", "이전", "지난", "타임라인", "최근", "변경", "이번", +]) + + +def _classify_query_type(prompt: str) -> str: + """Classify prompt into TEMPORAL / KEYWORD / SEMANTIC. 
+ + TEMPORAL — query is about history/timeline/progression + KEYWORD — short technical lookup (≤60 chars) or pure symbol/identifier + SEMANTIC — natural language conceptual query (default) + """ + if not prompt: + return "KEYWORD" + pl = prompt.lower() + if any(kw in pl for kw in _TEMPORAL_KW): + return "TEMPORAL" + words = pl.split() + if len(words) <= 6: + return "KEYWORD" + return "SEMANTIC" + + +# ── Git helpers ────────────────────────────────────────────────────────────── + +def get_git_head(project_dir): + try: + r = subprocess.run( + ["git", "rev-parse", "HEAD"], + cwd=project_dir, capture_output=True, text=True, timeout=3, + ) + return r.stdout.strip() if r.returncode == 0 else None + except Exception: + return None + + +def build_decision_corpus(project_dir, n=500): + """Extract all decision commits from git log (no cap).""" + try: + r = subprocess.run( + ["git", "log", f"-{n}", "--format=%H\x1f%s\x1f%ai"], + cwd=project_dir, capture_output=True, text=True, timeout=10, + ) + if r.returncode != 0: + return [] + except Exception: + return [] + + corpus = [] + seen = set() + for line in r.stdout.strip().split("\n"): + if not line.strip(): + continue + parts = line.strip().split("\x1f", 2) + if len(parts) < 2: + continue + commit_hash = parts[0] + subject = parts[1][:120] + date = parts[2][:10] if len(parts) == 3 else "" + + if _is_structural_noise(subject): + continue + key = subject[:60] + if key in seen: + continue + seen.add(key) + + if _is_decision(subject): + # 고우선순위 패턴 → text 중복 삽입으로 BM25 가중치 증폭 + is_milestone = any(p in subject for p in [ + "CONVERGED", "pivot", "완성", "완료", "검증", "수렴", "FAILED", "KILL" + ]) + text = f"{date} {subject}" + if is_milestone: + text = f"{text}\n{text}" # 2배 가중치 + corpus.append({ + "hash": commit_hash, + "subject": subject, + "date": date, + "text": text, + }) + + return corpus + + +def embed_corpus_items(corpus): + """Add 'emb' field to corpus items using vec-daemon. Modifies in-place. + + Only embeds items missing 'emb'. 
Returns count of newly embedded items. + Fail-safe: if vec-daemon is down, items are left without 'emb' and + dense_rank_decisions will return [] (BM25-only fallback). + """ + embedded = 0 + for item in corpus: + if item.get("emb"): + continue + text = (item.get("subject") or item.get("text") or "")[:400] + if not text: + continue + emb = _vec_embed(text) + if emb: + item["emb"] = emb + embedded += 1 + return embedded + + +def get_decision_corpus(project_dir): + """Return cached corpus or rebuild if git HEAD changed. + + Extended (2026-04-26): also pre-embeds corpus items via vec-daemon and caches + embeddings in the same file under an 'emb_head' sentinel. Embeddings allow + dense first-stage retrieval (dense_rank_decisions) without per-query N socket + calls. Falls back gracefully: if vec-daemon is down, items lack 'emb' field + and dense_rank_decisions returns []. + """ + cache_path = Path(project_dir) / ".omc" / "decision_corpus.json" + head = get_git_head(project_dir) + + if cache_path.exists() and head: + try: + cached = json.loads(cache_path.read_text()) + if cached.get("head") == head: + corpus = cached["corpus"] + # Check if embeddings are fresh for this HEAD + if cached.get("emb_head") != head: + n = embed_corpus_items(corpus) + if n > 0: + cache_path.write_text(json.dumps({ + "head": head, "corpus": corpus, "emb_head": head + })) + return corpus + except Exception: + pass + + corpus = build_decision_corpus(project_dir) + if head and corpus: + try: + cache_path.parent.mkdir(exist_ok=True) + embed_corpus_items(corpus) + cache_path.write_text(json.dumps({ + "head": head, "corpus": corpus, "emb_head": head + })) + except Exception: + pass + return corpus diff --git a/src/hooks/_bm25/docs_search.py b/src/hooks/_bm25/docs_search.py new file mode 100644 index 0000000..5cedf14 --- /dev/null +++ b/src/hooks/_bm25/docs_search.py @@ -0,0 +1,281 @@ +""" +docs_search.py — G2-DOCS BM25+hybrid search for bm25-memory. 
+ +Provides: + _extra_doc_files(project_dir) -> list[str] + chunk_document(filename, content) -> list[str] + build_docs_bm25(project_dir) -> (BM25Okapi|None, list[str]) + bm25_search_docs(project_dir, query, top_k=5) -> list[str] + embed_docs_units(units, cache_path) -> list[dict] + dense_rank_docs(units_emb, query, top_k=10) -> list[dict] + hybrid_search_docs(project_dir, query, top_k=5) -> list[str] +""" +import json +import os +import re +from pathlib import Path + +try: + from rank_bm25 import BM25Okapi + _HAS_BM25 = True +except ImportError: + BM25Okapi = None # type: ignore + _HAS_BM25 = False + +from .tokenizer import tokenize +from .rerank import vec_embed as _vec_embed, cosine as _cosine, semantic_rerank_filter +from .ranker import rrf_merge, last_retrieval_scores as _last_retrieval_scores + +# ── Korean→English expansion for G2-DOCS BM25 path (iter 44) ──────────────── + +_KO_EN_DOCS = { + "하이브리드": "hybrid", "밀집": "dense", "검색": "search,retrieve", + "재색인": "reindex", "인용": "citation", "거짓": "false", + "양성": "positive", "시멘틱": "semantic", "지연": "latency", + "시간": "time,latency", "수준": "tier,level", + "벡터": "vector,embedding", "마이그레이션": "migration", + "임베딩": "embedding", "벤치마크": "benchmark,eval", + "메모리": "memory", "코드베이스": "codebase", + "데이터베이스": "database", "오래된": "stale,staleness", + "측정": "measure,probe", "비율": "rate,ratio", + "성능": "performance,latency", "업그레이드": "upgrade", + "노드": "node", "병합": "merge", "구현": "implementation", + "분석": "analysis,evaluation", "아키텍처": "architecture", + "평가": "eval,evaluate,benchmark", "프레임워크": "framework", + "알고리즘": "algorithm", "최적화": "optimize,optimization", + "자동": "auto,automatic", "색인": "index", "인덱스": "index", +} + + +def _expand_ko_en_docs(tokens): + """Expand Korean tokens via _KO_EN_DOCS for G2-DOCS BM25 queries.""" + expanded = list(tokens) + for t in tokens: + mapping = _KO_EN_DOCS.get(t) + if mapping: + expanded.extend(mapping.split(",")) + return list(dict.fromkeys(expanded)) + + +# ── Doc corpus helpers 
─────────────────────────────────────────────────────── + +def _extra_doc_files(project_dir): + """Return extra files to include in the docs index (project-agnostic).""" + slug = project_dir.replace("/", "-") + memory_md = os.path.expanduser(f"~/.claude/projects/{slug}/memory/MEMORY.md") + candidates = [ + os.path.join(project_dir, "CLAUDE.md"), + os.path.join(project_dir, "README.md"), + memory_md, + ] + return [p for p in candidates if os.path.exists(p)] + + +def chunk_document(filename, content): + """Split by ## headers; each chunk = 'filename § header\\nbody'.""" + chunks = [] + parts = re.split(r"\n(?=## )", content) + for part in parts: + part = part.strip() + if not part: + continue + lines = part.split("\n", 1) + header = re.sub(r"^#+\s*", "", lines[0].strip()) + body = lines[1].strip() if len(lines) > 1 else "" + text = f"{filename} § {header}\n{body}" + if len(text) > 50: + chunks.append(text[:2500]) + return chunks + + +def build_docs_bm25(project_dir): + """Build BM25 index over docs/research/*.md + CLAUDE.md + MEMORY.md. + Strategy: full-doc (no chunking) — A/B test 2026-04-11 confirms +9.1% recall@5 + vs header-chunked approach (0.758 vs 0.667 on 33 paraphrase pairs). + Full-doc wins on temporal/open-set/perf queries where answers span multiple sections. + + Name-collision dedup: extra files (root README/CLAUDE/MEMORY) win over + same-named files inside docs/research/ — root README.md is canonical fork + metadata, and docs/research/ may carry placeholder copies that pollute + retrieval results. 
+ """ + units_by_name: dict[str, str] = {} + + docs_dir = Path(project_dir) / "docs" / "research" + if docs_dir.exists(): + for md_file in sorted(docs_dir.glob("*.md")): + try: + text = f"{md_file.name}\n{md_file.read_text()}" + if len(text) > 50: + units_by_name[md_file.name] = text + except Exception: + pass + + for fpath in _extra_doc_files(project_dir): + try: + p = Path(fpath) + text = f"{p.name}\n{p.read_text()}" + if len(text) > 50: + units_by_name[p.name] = text # extra files win on name collision + except Exception: + pass + + all_units = list(units_by_name.values()) + if not all_units or not _HAS_BM25: + return None, [] + tokenized = [tokenize(u) for u in all_units] + return BM25Okapi(tokenized), all_units + + +def bm25_search_docs(project_dir, query, top_k=5): + """Return top-k docs most relevant to query (full-doc BM25, no chunking).""" + if not query.strip(): + return [] + bm25, units = build_docs_bm25(project_dir) + if not bm25: + return [] + query_tokens = tokenize(query, drop_stopwords=True) + query_tokens = _expand_ko_en_docs(query_tokens) + if not query_tokens: + return [] + scores = bm25.get_scores(query_tokens) + ranked = sorted(range(len(units)), key=lambda i: scores[i], reverse=True) + top_score = float(max(scores)) if len(scores) else 0.0 + floor = max(1.0, top_score * 0.35) + bm_filtered = [units[i] for i in ranked[:top_k * 2] if scores[i] >= floor] + if len(bm_filtered) > top_k: + cand_dicts = [{"subject": u.split("\n", 1)[0], "text": u[:400]} for u in bm_filtered] + reranked_dicts = semantic_rerank_filter(cand_dicts, query, top_k=top_k) + subject_to_unit = {u.split("\n", 1)[0]: u for u in bm_filtered} + return [subject_to_unit[d["subject"]] for d in reranked_dicts if d["subject"] in subject_to_unit] + return bm_filtered[:top_k] + + +# ── Hybrid BM25+Dense Search ───────────────────────────────────────────────── + +_docs_emb_cache_state: dict = {} # in-memory: {"key": str, "units_emb": [...]} + + +def _docs_cache_key(units): + """Stable 
cache key based on doc filenames (sorted join → simple hash).""" + filenames = sorted(u.split("\n", 1)[0] for u in units) + key_str = "|".join(filenames) + return str(sum(ord(c) * (i + 1) for i, c in enumerate(key_str)) % (10 ** 10)) + + +def embed_docs_units(units, cache_path): + """Pre-embed docs corpus. Returns list of dicts: + {"hash": filename, "text": unit_string, "emb": list_or_[]}. + + Caches to cache_path; invalidates when doc set changes. + Fail-safe: items without embedding skip dense but still contribute via BM25. + """ + key = _docs_cache_key(units) + + if _docs_emb_cache_state.get("key") == key: + return _docs_emb_cache_state["units_emb"] + + if cache_path.exists(): + try: + cached = json.loads(cache_path.read_text()) + if cached.get("key") == key: + _docs_emb_cache_state.update(cached) + return cached["units_emb"] + except Exception: + pass + + units_emb = [] + for u in units: + filename = u.split("\n", 1)[0] + preview = u[:400] + emb = _vec_embed(preview) + units_emb.append({"hash": filename, "text": u, "emb": emb or []}) + + try: + cache_path.parent.mkdir(parents=True, exist_ok=True) + cache_path.write_text(json.dumps({"key": key, "units_emb": units_emb})) + except Exception: + pass + + _docs_emb_cache_state.update({"key": key, "units_emb": units_emb}) + return units_emb + + +def dense_rank_docs(units_emb, query, top_k=10): + """Dense first-stage retrieval for docs corpus. + + units_emb: list of {"hash": filename, "text": unit_str, "emb": list} + Returns top_k dicts ranked by cosine similarity, or [] if vec-daemon down. 
+ """ + q_emb = _vec_embed(query) + if not q_emb: + return [] + scored = [] + for item in units_emb: + emb = item.get("emb") + if not emb: + continue + cos = _cosine(q_emb, emb) + if cos > 0.0: + scored.append((cos, item)) + if not scored: + return [] + scored.sort(key=lambda x: -x[0]) + _last_retrieval_scores["dense_top"] = float(scored[0][0]) + return [item for _, item in scored[:top_k]] + + +def hybrid_search_docs(project_dir, query, top_k=5): + """Hybrid BM25+dense RRF search over docs/research/*.md corpus. + + Pipeline (2026-04-26): + 1. BM25 top-(top_k*2) candidates (threshold filtered) + 2. Dense top-(top_k*2) via pre-embedded corpus (vec-daemon cosine) + 3. RRF merge (k=60) + 4. Semantic rerank (BGE/vec-daemon) on merged pool + + Fail-safe: dense unavailable → BM25+semantic rerank (existing behavior). + Returns list of unit strings — same format as bm25_search_docs(). + """ + bm25, units = build_docs_bm25(project_dir) + if not bm25 or not units or not query.strip(): + return [] + + query_tokens = tokenize(query, drop_stopwords=True) + query_tokens = _expand_ko_en_docs(query_tokens) + if not query_tokens: + return [] + + scores = bm25.get_scores(query_tokens) + top_score = float(max(scores)) if len(scores) else 0.0 + _last_retrieval_scores["bm25_top"] = top_score + if top_score < 1.0: + return [] + floor = max(1.0, top_score * 0.35) + ranked = sorted(range(len(units)), key=lambda i: scores[i], reverse=True) + bm25_filtered = [units[i] for i in ranked[:top_k * 2] if scores[i] >= floor] + if not bm25_filtered: + return [] + + bm25_dicts = [{"hash": u.split("\n", 1)[0], "text": u} for u in bm25_filtered] + + cache_path = Path(project_dir) / ".omc" / "docs_corpus_emb.json" + units_emb = embed_docs_units(units, cache_path) + dense_dicts = dense_rank_docs(units_emb, query, top_k=top_k * 2) + + if not dense_dicts: + if len(bm25_filtered) > top_k: + cand_dicts = [{"subject": u.split("\n", 1)[0], "text": u[:400]} + for u in bm25_filtered] + reranked = 
semantic_rerank_filter(cand_dicts, query, top_k=top_k) + subj_map = {u.split("\n", 1)[0]: u for u in bm25_filtered} + return [subj_map[d["subject"]] for d in reranked if d["subject"] in subj_map] + return bm25_filtered[:top_k] + + merged = rrf_merge(bm25_dicts, dense_dicts, k_rrf=60) + + if len(merged) >= top_k + 2: + reranked = semantic_rerank_filter(merged, query, top_k=top_k) + return [item.get("text", "") for item in reranked if item.get("text")] + + return [item.get("text", "") for item in merged[:top_k] if item.get("text")] diff --git a/src/hooks/_bm25/hooks_search.py b/src/hooks/_bm25/hooks_search.py new file mode 100644 index 0000000..41ea782 --- /dev/null +++ b/src/hooks/_bm25/hooks_search.py @@ -0,0 +1,87 @@ +""" +hooks_search.py — G2-HOOKS ~/.claude/hooks/*.py BM25 search for bm25-memory. + +Provides: + _build_hook_doc(py_path) -> str + search_hooks_files(query, limit=3) -> list[tuple[Path, float]] + _has_hooks_keywords(prompt) -> bool +""" +from pathlib import Path + +try: + from rank_bm25 import BM25Okapi + _HAS_BM25 = True +except ImportError: + BM25Okapi = None # type: ignore + _HAS_BM25 = False + +from .tokenizer import tokenize + +# ── Hooks dir config ───────────────────────────────────────────────────────── + +_HOOKS_DIR = Path.home() / ".claude" / "hooks" +_HOOKS_TRIGGER_KWS = frozenset({ + # English + "hook", "hooks", "bm25-memory", "bm25_memory", "git-memory", "git_memory", + "auto-index", "auto_index", "g2-augment", "g2_augment", + "userPromptSubmit", "sessionstart", "posttooluse", + # Korean + "훅", "후크", +}) + + +def _build_hook_doc(py_path: Path) -> str: + """Extract file name + docstring + function/class signatures from a hook file.""" + try: + src = py_path.read_text(errors="replace") + except Exception: + return "" + lines = src.split("\n") + header_lines = [] + in_docstring = False + docstring_done = False + for line in lines[:80]: + stripped = line.strip() + if not docstring_done: + if stripped.startswith('"""') or 
stripped.startswith("'''"): + in_docstring = not in_docstring + header_lines.append(stripped[:200]) + if stripped.count('"""') >= 2 or stripped.count("'''") >= 2: + in_docstring = False + docstring_done = True + continue + if in_docstring: + header_lines.append(stripped[:200]) + if '"""' in stripped or "'''" in stripped: + in_docstring = False + docstring_done = True + continue + else: + docstring_done = True + if stripped.startswith("def ") or stripped.startswith("class "): + header_lines.append(stripped[:120]) + return f"{py_path.name}\n" + "\n".join(header_lines) + + +def search_hooks_files(query: str, limit: int = 3): + """BM25-search ~/.claude/hooks/*.py for hook function/filename matches.""" + if not _HOOKS_DIR.exists() or not _HAS_BM25: + return [] + py_files = sorted(_HOOKS_DIR.glob("*.py")) + if not py_files: + return [] + docs = [(p, _build_hook_doc(p)) for p in py_files] + docs = [(p, d) for p, d in docs if d] + if not docs: + return [] + tokenized = [tokenize(d) for _, d in docs] + bm25 = BM25Okapi(tokenized) + scores = bm25.get_scores(tokenize(query)) + ranked = sorted(range(len(docs)), key=lambda i: scores[i], reverse=True) + return [(docs[i][0], scores[i]) for i in ranked[:limit] if scores[i] > 0] + + +def _has_hooks_keywords(prompt: str) -> bool: + """Return True if prompt mentions hook-related terms.""" + low = prompt.lower() + return any(kw in low for kw in _HOOKS_TRIGGER_KWS) diff --git a/src/hooks/_bm25/injection.py b/src/hooks/_bm25/injection.py new file mode 100644 index 0000000..de0b3b2 --- /dev/null +++ b/src/hooks/_bm25/injection.py @@ -0,0 +1,142 @@ +""" +injection.py — P1 injection tracking for bm25-memory utility-rate measurement. 
+ +Provides: + write_injection_record(prompt, lines, retrieval_meta, vec_sock, vec_disabled, + bge_sock, session_id) -> None +""" +import json +import os +from pathlib import Path + +# Meta/filler words that are never meaningful injection tokens +_META_WORDS = frozenset([ + "live-infinite", "live-inf", "omc-live", "iter", "live", + "goal_v1", "goal_v2", "goal_v3", "goal", + "feat", "fix", "refactor", "perf", "docs", "test", "chore", + "success", "section", "update", "add", "remove", "change", + "fixed", "added", "removed", "completed", +]) + + +def _is_header_line(s: str) -> bool: + return s.startswith("> **") and "** (" in s + + +def _extract_content_tokens(subject: str, n: int = 5) -> list: + """Pick up to N distinctive content tokens from a commit subject.""" + candidates = [] + for w in subject.split(): + w_clean = w.strip(".,()[]{}:;!?\"'").lower() + if len(w_clean) < 4: + continue + if w_clean in _META_WORDS: + continue + if w_clean.replace("/", "").replace(".", "").replace("-", "").isdigit(): + continue + candidates.append(w.strip(".,()[]{}:;!?\"'")) + seen: set = set() + uniq = [t for t in candidates if not (t.lower() in seen or seen.add(t.lower()))] + uniq.sort(key=lambda t: -len(t)) + return uniq[:n] + + +def _collect_items(lines: list) -> list: + """Parse injected output lines into structured items for utility tracking.""" + items = [] + for line in lines: + s = line.strip() + if _is_header_line(s): + continue + # G1 decisions: "> [YYYY-MM-DD] subject" + if s.startswith("> [") and "]" in s: + close_idx = s.index("]") + date_str = s[3:close_idx] + subj = s[close_idx + 1:].strip() + tokens = _extract_content_tokens(subj, n=5) + if tokens: + item: dict = { + "block": "g1_decisions", + "tokens": tokens, + "subject": subj[:200], + } + if len(date_str) == 10 and date_str[4] == "-" and date_str[7] == "-": + item["date"] = date_str + items.append(item) + # G2-DOCS entries: " > filename.md § section" + elif s.startswith("> ") and (".md" in s or 
s.endswith(".py")): + fname = s.lstrip("> ").strip().split(" §")[0].split()[0] + if fname: + stem = fname.rsplit(".", 1)[0] + parts = [p for p in stem.replace("-", " ").replace("_", " ").split() + if len(p) >= 4 and not p.isdigit()] + tokens = [fname] + parts[:4] + subject = " ".join(parts) if parts else fname + items.append({"block": "g2_docs", "tokens": tokens, "subject": subject[:200]}) + # G2-PREFETCH: "Function: name @ path" + elif ": " in s and "@" in s and any(k in s for k in + ("Function:", "Class:", "Method:", "Module:", "File:")): + try: + name = s.split(":", 1)[1].split("@")[0].strip() + path = s.split("@", 1)[1].strip() if "@" in s else "" + path_base = path.rsplit("/", 1)[-1] if path else "" + tokens = [t for t in [name, path_base] if t and len(t) >= 4] + if tokens: + items.append({ + "block": "g2_prefetch", + "tokens": tokens, + "subject": f"{name} in {path}"[:200], + }) + except Exception: + pass + return items + + +def write_injection_record( + prompt: str, + lines: list, + retrieval_meta: dict, + vec_sock, + vec_disabled: bool, + bge_sock, + session_id: str, +) -> None: + """Write ~/.claude/last-injection.json and ~/.claude/last-retrieval-meta.json. + + Silent no-op if CTX_DASHBOARD_INTERNAL=1 or any exception occurs. 
+ """ + if os.environ.get("CTX_DASHBOARD_INTERNAL") == "1": + return + try: + import time as _t + preview = (prompt or "")[:120].replace("\n", " ").replace("\r", " ") + prompt_full_str = (prompt or "").replace("\r", "") + try: + _proj = os.environ.get("CLAUDE_PROJECT_DIR") or os.getcwd() + _project_name = os.path.basename(_proj.rstrip("/")) if _proj else None + except Exception: + _project_name = None + + injection = { + "ts": _t.time(), + "prompt_len": len(prompt) if prompt else 0, + "prompt_preview": preview, + "prompt_full": prompt_full_str, + "project": _project_name, + "items": _collect_items(lines), + } + Path(os.path.expanduser("~/.claude/last-injection.json")).write_text( + json.dumps(injection) + ) + + retrieval_meta["vec_daemon_up"] = vec_sock.exists() and not vec_disabled + retrieval_meta["bge_daemon_up"] = bge_sock.exists() and bool( + os.environ.get("CTX_CROSS_ENCODER", "1") != "0" + ) + retrieval_meta["query_char_count"] = len(prompt) if prompt else 0 + retrieval_meta["session_id"] = session_id or "" + Path(os.path.expanduser("~/.claude/last-retrieval-meta.json")).write_text( + json.dumps(retrieval_meta) + ) + except Exception: + pass diff --git a/src/hooks/_bm25/output.py b/src/hooks/_bm25/output.py new file mode 100644 index 0000000..d356178 --- /dev/null +++ b/src/hooks/_bm25/output.py @@ -0,0 +1,76 @@ +""" +output.py — Output formatting and emission for bm25-memory orchestrator. 
+ +Provides: + build_header_lines(g1_header, g2_files, g2_keywords, vec_sock, vec_disabled, + bge_sock, use_cross_encoder, auto_tune, auto_tune_active) + -> list[str] + emit_output(lines, header_lines) -> None +""" +import json +import sys + + +def build_header_lines( + g1_header: str, + g2_files: list, + g2_keywords: list, + vec_sock, + vec_disabled: bool, + bge_sock, + use_cross_encoder: bool, + auto_tune: dict, + auto_tune_active: bool, +) -> list: + """Build the forced-display header block prepended to injection output.""" + header_lines = [] + if g1_header: + header_lines.append(g1_header) + if g2_files or g2_keywords: + files_str = ", ".join(f"`{f}`" for f in g2_files[:3]) if g2_files else "(docs BM25)" + kw_str = " ".join(g2_keywords[:3]) if g2_keywords else "" + via_str = f' — found via "{kw_str}"' if kw_str else "" + header_lines.append(f"> **G2** (space search): {files_str}{via_str}") + # Daemon degradation warnings + daemon_warns = [] + if not vec_disabled and not vec_sock.exists(): + daemon_warns.append("vec-daemon down — BM25-only mode (semantic rerank disabled)") + if use_cross_encoder and not bge_sock.exists(): + daemon_warns.append("bge-daemon down — cross-encoder rerank disabled") + if daemon_warns: + header_lines.append("> **⚠ Semantic layer**: " + " | ".join(daemon_warns)) + # Auto-tune active badge + if auto_tune_active: + n_rec = auto_tune.get("based_on_n", "?") + prefer_hybrid = auto_tune.get("prefer_hybrid_G1", False) + temporal_gap = auto_tune.get("temporal_utility_gap") + proj_hint = auto_tune.get("project_type_hint") + proj_conf = auto_tune.get("project_type_confidence", "LOW") + parts = [f"n={n_rec}"] + if prefer_hybrid: + parts.append("hybrid✓") + if temporal_gap and temporal_gap > 0.05: + parts.append(f"temporal-gap={temporal_gap*100:.0f}pp") + if proj_hint and proj_hint != "multi_lang" and proj_conf in ("HIGH", "MEDIUM"): + parts.append(proj_hint) + header_lines.append( + f"> **CTX auto-tune** [{', '.join(parts)}] — run `ctx-telemetry 
tune` to refresh" + ) + return header_lines + + +def emit_output(lines: list, header_lines: list) -> None: + """Emit hook output to stdout + header summary to stderr.""" + if header_lines: + lines = header_lines + [""] + lines + output = { + "hookSpecificOutput": { + "hookEventName": "UserPromptSubmit", + "additionalContext": "\n".join(lines), + } + } + json.dump(output, sys.stdout) + sys.stdout.flush() + if header_lines: + print("\n".join(header_lines), file=sys.stderr) + sys.stderr.flush() diff --git a/src/hooks/_bm25/ranker.py b/src/hooks/_bm25/ranker.py new file mode 100644 index 0000000..4d7573f --- /dev/null +++ b/src/hooks/_bm25/ranker.py @@ -0,0 +1,234 @@ +""" +ranker.py — G1 BM25/dense/hybrid ranker for bm25-memory. + +Provides: + dense_rank_decisions(corpus, query, top_k=20) -> list[dict] + rrf_merge(list_a, list_b, k_rrf=60) -> list[dict] + bm25_rank_decisions(corpus, query, top_k=7, ...) -> list[dict] + hybrid_rank_decisions(corpus, query, top_k=7) -> list[dict] + +Module-level mutable: + last_retrieval_scores: dict — bm25_top / dense_top captured per call, + read by orchestrator for telemetry. +""" +import re + +try: + from rank_bm25 import BM25Okapi + HAS_BM25 = True +except ImportError: + HAS_BM25 = False + +from .tokenizer import tokenize, expand_query_tokens +from .rerank import vec_embed as _vec_embed, cosine as _cosine, semantic_rerank_filter + +# Module-level score capture — read by orchestrator via last_retrieval_scores +last_retrieval_scores: dict = {} + + +def dense_rank_decisions(corpus, query, top_k=20): + """Dense first-stage retrieval: cosine similarity between query embedding + and pre-computed corpus embeddings (from embed_corpus_items). + + Returns top-k items by cosine, or [] if vec-daemon unavailable or corpus + has no embeddings (BM25-only fallback). 
+ """ + q_emb = _vec_embed(query) + if not q_emb: + return [] + scored = [] + for item in corpus: + emb = item.get("emb") + if not emb: + continue + cos = _cosine(q_emb, emb) + if cos > 0.0: + scored.append((cos, item)) + if not scored: + return [] + # Tiebreak by stable item key so equal-cosine items have a deterministic + # order even if Python's sort stability is bypassed (e.g. PyPy, future + # dict-iteration changes, or different input orderings). + scored.sort(key=lambda x: (-x[0], x[1].get("hash") or (x[1].get("text") or "")[:20])) + last_retrieval_scores["dense_top"] = float(scored[0][0]) + return [item for _, item in scored[:top_k]] + + +def rrf_merge(list_a, list_b, k_rrf=60): + """Reciprocal Rank Fusion of two ranked lists. + + k_rrf=60: optimal constant per BEIR paper (arXiv:2104.08663) — controls + score distribution across rank positions. + + Uses commit 'hash' as dedup key; falls back to first-20-chars of 'text'. + Returns merged list ordered by RRF score (descending). + """ + scores = {} + hash_to_item = {} + + def _key(item): + return item.get("hash") or (item.get("text") or "")[:20] + + for rank, item in enumerate(list_a, 1): + k = _key(item) + scores[k] = scores.get(k, 0.0) + 1.0 / (k_rrf + rank) + hash_to_item[k] = item + + for rank, item in enumerate(list_b, 1): + k = _key(item) + scores[k] = scores.get(k, 0.0) + 1.0 / (k_rrf + rank) + hash_to_item[k] = item + + # Tiebreak by hash key so equal-RRF items don't depend on dict insertion order + # (which itself depends on whether list_a or list_b saw the hash first). + merged_keys = sorted(scores.keys(), key=lambda h: (-scores[h], h)) + return [hash_to_item[h] for h in merged_keys] + + +def score_corpus_bm25(tokenized_corpus, query_tokens): + """Generic BM25 scorer — returns raw numpy score array. + + This is the canonical low-level primitive used by both the eval pipeline + (adaptive_trigger.py) and the production hook (bm25_rank_decisions). 
def bm25_rank_decisions(corpus, query, top_k=7, min_score=0.5,
                        adaptive_floor_ratio=0.35, mmr_jaccard_threshold=0.70,
                        skip_rerank=False):
    """BM25-rank the decision corpus against ``query`` and return the top-k.

    Stopwords are dropped from the query only (corpus texts are tokenized
    as-is), then the query tokens are widened with ``expand_query_tokens``.

    Args:
        corpus: list of item dicts; 'text' is indexed, and 'subject'
            (falling back to 'text') feeds the cluster signature.
        query: raw user query.
        top_k: maximum number of items returned.
        min_score: if the best BM25 score is below this, return [] instead
            of falling back to an arbitrary ordering.
        adaptive_floor_ratio: candidates scoring below
            ``max(min_score, top_score * adaptive_floor_ratio)`` are dropped.
        mmr_jaccard_threshold: a candidate whose token set has Jaccard
            similarity >= this value with any already-selected item is
            skipped as a near-duplicate.
        skip_rerank: when True, never call ``semantic_rerank_filter``.

    Returns:
        Up to ``top_k`` corpus items, or [] when there is nothing to rank.

    Side effects:
        Stores the best BM25 score in ``last_retrieval_scores["bm25_top"]``.
    """
    if not corpus:
        return []
    if not HAS_BM25 or not query.strip():
        return []

    query_tokens = tokenize(query, drop_stopwords=True)
    if not query_tokens:
        return []

    # Synonym expansion to bridge KO<->EN and concept gaps.
    query_tokens = expand_query_tokens(query_tokens)

    # Items without a 'text' field are indexed as empty documents instead of
    # raising KeyError; the dedup stage below already tolerates them.
    tokenized = [tokenize(c.get("text") or "") for c in corpus]
    if not any(tokenized):
        # No document produced a single token, so nothing can match. Also
        # avoids handing BM25Okapi an empty vocabulary.
        return []

    bm25 = BM25Okapi(tokenized)
    scores = bm25.get_scores(query_tokens)
    if len(scores) == 0:
        return []
    top_score = float(max(scores))
    if top_score < min_score:
        return []

    last_retrieval_scores["bm25_top"] = top_score
    adaptive_floor = max(min_score, top_score * adaptive_floor_ratio)

    # Tiebreak by index ascending so equal-score corpus entries have a
    # deterministic order.
    ranked_idx = sorted(range(len(corpus)), key=lambda i: (-scores[i], i))

    def _cluster_sig(subject: str) -> str:
        """Reduce a subject to its first four normalized words.

        Numbers, 'N/M' counters and 'goal_vN' markers are removed so entries
        that differ only in an iteration number share one signature.
        """
        s = subject.lower()
        s = re.sub(r'\b\d{4,}\b|\b\d+/\d+\b|\b\d+/∞\b|goal_v\d+', '', s)
        s = re.sub(r'iter\s*\d+', 'iter', s)
        s = re.sub(r'[^a-z가-힣\s]', ' ', s)
        s = re.sub(r'\s+', ' ', s).strip()
        return ' '.join(s.split()[:4])

    selected = []
    selected_token_sets = []
    selected_cluster_sigs = set()
    for idx in ranked_idx:
        if scores[idx] < adaptive_floor:
            break  # ranked_idx is score-descending: nothing later can pass
        cand_tokens = set(tokenized[idx])
        if not cand_tokens:
            continue
        item = corpus[idx]
        # `or ""` guards against an explicit None subject/text.
        cand_sig = _cluster_sig(item.get("subject", item.get("text", "")) or "")
        if cand_sig and cand_sig in selected_cluster_sigs:
            continue
        is_near_dup = False
        for prev_tokens in selected_token_sets:
            union = cand_tokens | prev_tokens
            if not union:
                continue
            jaccard = len(cand_tokens & prev_tokens) / len(union)
            if jaccard >= mmr_jaccard_threshold:
                is_near_dup = True
                break
        if is_near_dup:
            continue
        selected.append(item)
        selected_token_sets.append(cand_tokens)
        if cand_sig:
            selected_cluster_sigs.add(cand_sig)
        # Collect up to 2x top_k so the optional reranker has spare
        # candidates to filter.
        if len(selected) >= top_k * 2:
            break
    if not skip_rerank and len(selected) >= top_k + 2:
        selected = semantic_rerank_filter(selected, query, top_k=top_k)
    return selected[:top_k]
+ +Provides: + VEC_SOCK, VEC_DISABLED — vec-daemon config (read by orchestrator) + BGE_SOCK, USE_CROSS_ENCODER — bge-daemon config (read by orchestrator) + vec_embed(text) -> list|None + cosine(a, b) -> float + semantic_rerank_filter(candidates, query, top_k, ...) -> list +""" +import json +import os +import socket +from pathlib import Path + +# ── Vec-daemon config ──────────────────────────────────────────────────────── + +VEC_SOCK = Path.home() / ".local/share/claude-vault/vec-daemon.sock" +VEC_TIMEOUT = 0.8 # seconds — fail fast if daemon is down +VEC_DISABLED = os.environ.get("CTX_DISABLE_SEMANTIC_RERANK") == "1" + +# ── BGE cross-encoder config ───────────────────────────────────────────────── + +BGE_SOCK = Path.home() / ".local/share/claude-vault/bge-daemon.sock" +BGE_TIMEOUT = 2.0 # seconds — rerank 20 cands typically <80ms, give slack +USE_CROSS_ENCODER = os.environ.get("CTX_CROSS_ENCODER", "1") != "0" + +# Windows fallback: AF_UNIX missing on MSVC-built CPython → TCP loopback. +USE_TCP = not hasattr(socket, "AF_UNIX") +VEC_PORT = int(os.environ.get("CTX_VEC_PORT", "29501")) +BGE_PORT = int(os.environ.get("CTX_BGE_PORT", "29502")) + + +def _bge_rerank(query: str, docs: list): + """Query the running bge-daemon for cross-encoder scores. + + Returns list[float] (raw logits, same length as docs) or None on failure. + Caller applies sigmoid + filtering. Fail-fast: 2s timeout keeps the hook + responsive if the daemon is wedged. 
def vec_embed(text: str):
    """Ask the running vec-daemon for an embedding of ``text``.

    Sends one newline-terminated JSON request ``{"q": text[:1000]}`` and
    reads the reply up to the first newline. Uses TCP loopback when
    ``USE_TCP`` is set, otherwise the Unix socket at ``VEC_SOCK``.

    Returns:
        The reply's 'emb' value when the reply has a truthy 'ok' field.
        None when embedding is disabled, the Unix socket file is missing,
        or any connection/protocol error occurs (including timeout after
        ``VEC_TIMEOUT`` seconds).
    """
    if VEC_DISABLED:
        return None
    if not USE_TCP and not VEC_SOCK.exists():
        return None
    try:
        if USE_TCP:
            family, address = socket.AF_INET, ("127.0.0.1", VEC_PORT)
        else:
            family, address = socket.AF_UNIX, str(VEC_SOCK)
        # `with` closes the socket on every path, including timeouts and
        # connection errors, so a failing daemon does not leak descriptors.
        with socket.socket(family, socket.SOCK_STREAM) as s:
            s.settimeout(VEC_TIMEOUT)
            s.connect(address)
            payload = (json.dumps({"q": text[:1000]}) + "\n").encode("utf-8")
            s.sendall(payload)
            buf = b""
            while b"\n" not in buf:
                chunk = s.recv(8192)
                if not chunk:
                    break  # peer closed before sending a full line
                buf += chunk
        line = buf.split(b"\n")[0]
        resp = json.loads(line.decode("utf-8"))
        if resp.get("ok"):
            return resp.get("emb")
    except Exception:
        # Deliberately broad: callers treat None as "daemon unavailable"
        # and fall back, so no daemon failure may propagate.
        return None
    return None
semantic_rerank_filter(candidates, query, top_k, alpha_bm25=0.6, + cosine_min=0.55, bm25_scores=None): + """Rerank a list of candidate items by blended BM25 + cosine semantic. + + candidates: list of dicts, each with a 'text' or 'subject' field + query: user query string + top_k: final count + alpha_bm25: weight of BM25 score in blend (1-alpha = semantic weight) + cosine_min: hard floor — items below this cosine get dropped even if BM25 is high + bm25_scores: optional pre-computed BM25 scores (normalized 0-1); if None, + assume candidates are already ordered by BM25 → use rank position + + Fail-safe: if vec-daemon is down, returns candidates[:top_k] (no-op). + + Layer 3 (2026-04-24): prefer BGE cross-encoder when available — it scores + (query, candidate) jointly instead of computing independent embeddings + + cosine. Much stronger semantic judgement on short commit subjects. + Falls back to bi-encoder cosine path if cross-encoder fails to load. + """ + # ── Layer 3: bge-daemon cross-encoder path (strongest semantic signal) ─── + kept = [] + doc_texts = [] + for i, c in enumerate(candidates): + text = c.get("subject") or c.get("text") or "" + if not text: + continue + kept.append((i, c)) + doc_texts.append(text[:400]) + if doc_texts: + ce_scores = _bge_rerank(query, doc_texts) + if ce_scores is not None and len(ce_scores) == len(kept): + import math + def _sig(x): return 1.0 / (1.0 + math.exp(-float(x))) + rescored = [] + ce_min = 0.35 + for (i, c), s in zip(kept, ce_scores): + ce_norm = _sig(s) + if ce_norm < ce_min: + continue + bm25_norm = (bm25_scores[i] if bm25_scores else (len(candidates) - i) / max(1, len(candidates))) + blend = alpha_bm25 * bm25_norm + (1.0 - alpha_bm25) * ce_norm + rescored.append((blend, ce_norm, c)) + rescored.sort(key=lambda x: -x[0]) + if rescored: + return [c for _, _, c in rescored[:top_k]] + # CE filtered everything → fall back to bi-encoder + + # ── Bi-encoder fallback (original path) ─── + q_emb = vec_embed(query) + if not q_emb: 
def get_world_model(project_dir):
    """Load dead-ends and facts from ``<project_dir>/.omc/world-model.json``.

    Returns:
        ``(dead_ends, facts)``: two lists of preformatted strings.
        ``dead_ends`` is built from the last 5 entries of 'dead_ends'
        (goal cut to 60 chars, reason to 80). ``facts`` holds the last 8
        usable entries of 'known_facts', each cut to 80 chars; plain-string
        facts starting with 'paper:', 'README:' or 'uncertain:' are skipped.
        Both lists are empty when the file is missing, unreadable, not valid
        JSON, or not a JSON object. Malformed individual entries are skipped.
    """
    wm_path = Path(project_dir) / ".omc" / "world-model.json"
    if not wm_path.exists():
        return [], []
    try:
        # Explicit UTF-8: the platform default encoding may not be UTF-8.
        wm = json.loads(wm_path.read_text(encoding="utf-8"))
    except Exception:
        # Best-effort loader: a broken file must not break the caller.
        return [], []
    if not isinstance(wm, dict):
        return [], []

    raw_de = wm.get("dead_ends", [])
    if not isinstance(raw_de, list):
        raw_de = []
    dead_ends = [
        f" x {str(de.get('goal') or '')[:60]} -- {str(de.get('reason') or '')[:80]}"
        for de in raw_de[-5:]
        if isinstance(de, dict)
    ]

    known = wm.get("known_facts", [])
    if not isinstance(known, list):
        known = []
    skipped_prefixes = ("paper:", "README:", "uncertain:")
    facts = []
    for fact in known:
        if isinstance(fact, dict):
            text = fact.get("fact")
            if text is None:
                continue  # dict entry without a 'fact' field
            facts.append(f" * {str(text)[:80]}")
        elif isinstance(fact, str) and not fact.startswith(skipped_prefixes):
            facts.append(f" * {fact[:80]}")
    return dead_ends, facts[-8:]
def consume_pending_decisions(project_dir: str):
    """Read and delete the pending-decisions file for ``project_dir``.

    The file is
    ``~/.claude/projects/<slug>/memory/.pending-decisions.json`` where
    ``<slug>`` is ``project_dir`` with every "/" replaced by "-". It is
    removed as soon as it has been read (one-shot hand-off), whether or not
    it could be parsed.

    Returns:
        Formatted lines for additionalContext injection (a header, one line
        per decision, a closing instruction), or [] when the file is
        missing, unparsable, or holds no usable decisions. Entries of
        'decisions' that are not JSON objects are ignored.
    """
    slug = project_dir.replace("/", "-")
    pending_path = (
        Path.home() / ".claude" / "projects" / slug / "memory" / ".pending-decisions.json"
    )
    if not pending_path.exists():
        return []

    decisions = []
    captured_at = ""
    try:
        payload = json.loads(pending_path.read_text(encoding="utf-8"))
        if isinstance(payload, dict):
            raw = payload.get("decisions", [])
            if isinstance(raw, list):
                # Keep only object entries; anything else cannot carry
                # 'tag' / 'context' fields.
                decisions = [d for d in raw if isinstance(d, dict)]
            # Cut to minute precision, e.g. "2026-04-14T12:34".
            captured_at = str(payload.get("captured_at") or "")[:16]
    except Exception:
        # Unreadable or corrupt file: treat as "nothing pending".
        decisions = []

    # Delete right after reading so the content is delivered at most once.
    try:
        pending_path.unlink(missing_ok=True)
    except OSError:
        pass

    if not decisions:
        return []

    lines = [f"[PENDING MEMORY] 이전 세션({captured_at}) 결정 후보 — MEMORY.md 업데이트 필요 여부 판단:"]
    for d in decisions:
        tag = d.get("tag", "")
        ctx = d.get("context", "")
        lines.append(f" [{tag}] {ctx}")
    lines.append("→ 위 항목 중 MEMORY.md에 없는 중요한 결정이 있으면 즉시 Edit tool로 기록하세요.")
    return lines
+""" +import os +import re + +# ── Korean particle stripper ───────────────────────────────────────────────── + +_KO_PARTICLES = re.compile( + r'(와|과|이|가|은|는|을|를|의|에서|으로|에게|부터|까지|처럼|같이|보다|이나|며|에|로|도|만|나|고)$' +) + +# Conversational stopwords — filtered from QUERIES only (not the corpus). +# These appear in nearly every conversational prompt and make BM25 return +# noise matches on common words instead of real topic terms. +# Kept conservative — only words that are almost never content-bearing in +# a software-engineering commit subject. +_STOPWORDS = frozenset([ + # English function words + "a", "an", "the", "is", "are", "was", "were", "be", "been", "being", + "am", "do", "does", "did", "have", "has", "had", "will", "would", + "could", "should", "may", "might", "can", "to", "of", "in", "on", + "at", "by", "for", "with", "from", "as", "into", "about", + "and", "or", "but", "if", "then", "than", "so", "not", "no", + "i", "you", "we", "he", "she", "it", "they", "me", "my", "your", + "our", "his", "her", "their", "this", "that", "these", "those", + "there", "here", "what", "which", "when", "where", "why", "how", + "who", "whom", "some", "any", "all", "each", "every", "both", + "more", "most", "less", "few", "much", "many", + "just", "only", "very", "too", "also", "even", "still", "yet", + "now", "then", "up", "down", "out", "over", "again", + # Conversational fillers + "ok", "yeah", "yep", "pls", "please", "thanks", "thank", + "hi", "hey", "hello", "want", "like", "think", "need", + "make", "use", "using", "try", "trying", "get", "got", + # Korean fillers (particles already stripped; these are standalone) + "음", "어", "아", "그", "저", "이거", "저거", "그거", +]) + +# Small Korean-English synonym map for query expansion (Layer 2). +# Keys are case-folded. Additions expand the query token set — BM25 will match +# commits mentioning either side of each pair. Focused on CTX domain vocabulary +# that commonly appears in Korean prompts but English commits (and vice versa). 
+_SYNONYM_EXPANSION = { + "cross-session": ["long-term", "persistent", "inter-session", "장기기억"], + "long-term": ["cross-session", "persistent", "장기", "장기기억"], + "memory": ["recall", "retrieval", "기억"], + "retrieval": ["search", "recall", "fetch", "검색", "조회"], + "search": ["retrieval", "lookup", "검색"], + "hook": ["plugin", "extension", "훅"], + "embed": ["embedding", "vector", "임베딩"], + "embedding": ["embed", "vector", "임베딩"], + "rerank": ["rank", "reorder", "재정렬", "순위"], + "semantic": ["vector", "dense", "의미"], + "context": ["memory", "state", "컨텍스트"], + "prompt": ["query", "question", "프롬프트"], + "improve": ["enhance", "boost", "optimize", "개선", "향상"], + "quality": ["accuracy", "score", "품질"], + "noise": ["garbage", "irrelevant", "노이즈"], + "cluster": ["group", "dedup", "중복"], + "dashboard": ["ui", "visualization", "대시보드"], + "bootstrap": ["install", "setup", "부트스트랩"], + "gpu": ["cuda", "device", "가속"], + "claude": ["anthropic", "llm"], + "korean": ["한국어", "ko", "hangul"], + "기억": ["memory", "recall"], + "검색": ["search", "retrieval"], + "장기기억": ["long-term memory", "cross-session", "persistent"], + "의사결정": ["decision", "choice"], + "훅": ["hook", "plugin"], + "임베딩": ["embedding", "vector"], +} + +# ── Porter stemmer ─────────────────────────────────────────────────────────── +# opt-in via CTX_STEM=1, default ON 2026-04-24 after G1 regression showed +0.034 +# improvement on Recall@7 with zero losses. +_USE_STEMMER = os.environ.get("CTX_STEM", "1") != "0" +_STEMMER = None +if _USE_STEMMER: + try: + from nltk.stem.porter import PorterStemmer as _PS + _STEMMER = _PS() + except ImportError: + _STEMMER = None # stemming silently disabled if nltk not installed + + +def tokenize(text: str, drop_stopwords: bool = False): + """Preserve decimal numbers (0.724) and numeric ranges (7-30) as single tokens. + Also strips Korean particles from mixed Korean-ASCII tokens (e.g. 'BM25와' → 'bm25' + 'bm25와') + so that Korean queries match English commit subjects correctly. 
def expand_query_tokens(query_tokens):
    """Append synonym-map expansions to the query tokens.

    Each token is looked up (lower-cased) in ``_SYNONYM_EXPANSION`` and its
    synonyms are appended after the original tokens. Duplicates are removed
    case-insensitively, keeping the first spelling seen, and the result is
    truncated to ``len(query_tokens) * 2 + 5`` entries.

    Returns:
        list of tokens: the originals first, then expansions.
    """
    # NOTE(review): tokenize() in this module splits on \w+, so hyphenated
    # or multi-word entries such as "cross-session" / "long-term memory"
    # look unreachable as keys and unmatchable as expansions — confirm.
    expanded = list(query_tokens)
    for token in query_tokens:
        expanded.extend(_SYNONYM_EXPANSION.get(token.lower()) or ())

    # A dict keyed by the lower-cased token keeps the first spelling and
    # preserves insertion order.
    first_spelling = {}
    for token in expanded:
        first_spelling.setdefault(token.lower(), token)

    limit = len(query_tokens) * 2 + 5  # cap growth
    return list(first_spelling.values())[:limit]
"ab_skipped": {"hook", "reason"}, + # bm25-memory: overall invocation summary (emitted once per hook run) + "hook_complete": { + "hook", "latency_ms", "exit_code", + "query_type", + "g1_top_score_bm25", "g1_top_score_dense", "g1_count", + "g2_docs_count", "g2_code_count", "g2_hooks_count", + "fallback_reasons", "blocks_fired", + }, + # bm25-memory: optional per-stage events + "g1_done": {"hook", "g1_top_score_bm25", "g1_top_score_dense", "g1_count", "duration_ms"}, + "g2_docs_done": {"hook", "g2_docs_count", "top_score", "duration_ms"}, + "g2_code_done": {"hook", "g2_code_count", "fallback_reason", "duration_ms"}, + "g2_hooks_done": {"hook", "g2_hooks_count", "duration_ms"}, + "fallback_emitted": {"hook", "reason"}, + "prompt_received": {"hook", "query_type", "prompt_len"}, } diff --git a/src/hooks/bge-daemon.py b/src/hooks/bge-daemon.py index e97b10a..8ddf4d5 100755 --- a/src/hooks/bge-daemon.py +++ b/src/hooks/bge-daemon.py @@ -35,6 +35,10 @@ LOG_FILE = Path.home() / ".local/share/claude-vault/bge-daemon.log" MODEL_NAME = os.environ.get("CTX_BGE_MODEL", "BAAI/bge-reranker-v2-m3") +# Windows fallback: AF_UNIX is not exposed by MSVC-built CPython. Use TCP loopback. +USE_TCP = not hasattr(socket, "AF_UNIX") +BGE_PORT = int(os.environ.get("CTX_BGE_PORT", "29502")) + # ── control flags ────────────────────────────────────────────── if "--stop" in sys.argv: STOP_FILE.write_text("stop") @@ -54,9 +58,14 @@ # socket probe ok = False try: - s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - s.settimeout(2.0) - s.connect(str(SOCKET_PATH)) + if USE_TCP: + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.settimeout(2.0) + s.connect(("127.0.0.1", BGE_PORT)) + else: + s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + s.settimeout(2.0) + s.connect(str(SOCKET_PATH)) s.sendall(b'{"query":"ping","docs":["pong"]}\n') resp = b"" while b"\n" not in resp: @@ -79,8 +88,8 @@ os.kill(existing_pid, 0) print(f"[bge-daemon] Already running (PID {existing_pid}). 
Exiting.") sys.exit(0) - except (ProcessLookupError, ValueError): - pass # stale PID file, continue + except (ProcessLookupError, ValueError, OSError): + pass # stale PID file (or Windows os.kill not supported), continue def log(msg): line = f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] {msg}" @@ -167,16 +176,30 @@ def main(): except Exception as e: log(f"warmup failed: {e}") - if SOCKET_PATH.exists(): - SOCKET_PATH.unlink() PID_FILE.write_text(str(os.getpid())) - srv = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - srv.bind(str(SOCKET_PATH)) - os.chmod(str(SOCKET_PATH), 0o600) + if USE_TCP: + srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + # Skip SO_REUSEADDR on Windows: semantics differ (allows multiple + # bind to same port → port hijacking risk). Linux/macOS keep TIME_WAIT + # rebinding behavior. + if sys.platform != "win32": + srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + srv.bind(("127.0.0.1", BGE_PORT)) + listen_target = f"127.0.0.1:{BGE_PORT}" + else: + if SOCKET_PATH.exists(): + SOCKET_PATH.unlink() + srv = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + srv.bind(str(SOCKET_PATH)) + try: + os.chmod(str(SOCKET_PATH), 0o600) + except OSError: + pass + listen_target = str(SOCKET_PATH) srv.listen(8) srv.settimeout(1.0) - log(f"listening on {SOCKET_PATH}") + log(f"listening on {listen_target}") while True: if STOP_FILE.exists(): @@ -193,7 +216,8 @@ def main(): log(f"accept error: {e}") srv.close() - SOCKET_PATH.unlink(missing_ok=True) + if not USE_TCP: + SOCKET_PATH.unlink(missing_ok=True) PID_FILE.unlink(missing_ok=True) log("stopped") diff --git a/src/hooks/bm25-memory.py b/src/hooks/bm25-memory.py index f289866..ec481ad 100644 --- a/src/hooks/bm25-memory.py +++ b/src/hooks/bm25-memory.py @@ -22,1385 +22,68 @@ """ import json import os -import re -import subprocess import sys from pathlib import Path -try: - from rank_bm25 import BM25Okapi - HAS_BM25 = True -except ImportError: - HAS_BM25 = False - -RICH = "--rich" in sys.argv - - 
-# ── Tokenizer ──────────────────────────────────────────────────── - -_KO_PARTICLES = re.compile( - r'(와|과|이|가|은|는|을|를|의|에서|으로|에게|부터|까지|처럼|같이|보다|이나|며|에|로|도|만|나|고)$' +# ── _bm25 package import (script entry-point path hack) ────────────────────── +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from _bm25.autotune import ( # noqa: E402 + AUTO_TUNE as _AUTO_TUNE, AUTO_TUNE_ACTIVE as _AUTO_TUNE_ACTIVE, + get_g1_top_k as _get_g1_top_k, get_g2d_top_k as _get_g2d_top_k, ) - -# Conversational stopwords — filtered from QUERIES only (not the corpus). -# These appear in nearly every conversational prompt and make BM25 return -# noise matches on common words instead of real topic terms. -# Kept conservative — only words that are almost never content-bearing in -# a software-engineering commit subject. -_STOPWORDS = frozenset([ - # English function words - "a", "an", "the", "is", "are", "was", "were", "be", "been", "being", - "am", "do", "does", "did", "have", "has", "had", "will", "would", - "could", "should", "may", "might", "can", "to", "of", "in", "on", - "at", "by", "for", "with", "from", "as", "into", "about", - "and", "or", "but", "if", "then", "than", "so", "not", "no", - "i", "you", "we", "he", "she", "it", "they", "me", "my", "your", - "our", "his", "her", "their", "this", "that", "these", "those", - "there", "here", "what", "which", "when", "where", "why", "how", - "who", "whom", "some", "any", "all", "each", "every", "both", - "more", "most", "less", "few", "much", "many", - "just", "only", "very", "too", "also", "even", "still", "yet", - "now", "then", "up", "down", "out", "over", "again", - # Conversational fillers - "ok", "yeah", "yep", "pls", "please", "thanks", "thank", - "hi", "hey", "hello", "want", "like", "think", "need", - "make", "use", "using", "try", "trying", "get", "got", - # Korean fillers (particles already stripped; these are standalone) - "음", "어", "아", "그", "저", "이거", "저거", "그거", -]) - - -# ── Semantic rerank helpers 
(2026-04-24) ──────────────────────────────── -# Three layers: -# 1. bi-encoder rerank via vec-daemon (e5-small, CPU-friendly, ~20ms/candidate) -# 2. Korean-English synonym expansion (zero-cost lexical bridge) -# 3. BGE cross-encoder rerank (GPU, ~50ms for top-20, +15-25%p quality) -# Layer 3 is opt-in via CTX_CROSS_ENCODER=1 env var; loads lazily on first use. - -_VEC_SOCK = Path.home() / ".local/share/claude-vault/vec-daemon.sock" -_VEC_TIMEOUT = 0.8 # seconds — fail fast if daemon is down -_VEC_DISABLED = os.environ.get("CTX_DISABLE_SEMANTIC_RERANK") == "1" - -# ── Auto-tune: read flywheel parameter recommendations (ctx-telemetry tune output) ── -_AUTO_TUNE_PATH = Path.home() / ".claude" / "ctx-auto-tune.json" -_AUTO_TUNE: dict = {} -_AUTO_TUNE_ACTIVE: bool = False -try: - if _AUTO_TUNE_PATH.exists(): - _auto_tune_raw = json.loads(_AUTO_TUNE_PATH.read_text()) - if isinstance(_auto_tune_raw, dict): - _AUTO_TUNE = _auto_tune_raw - _AUTO_TUNE_ACTIVE = True -except Exception: - pass - -# Retrieval score capture: populated by bm25_rank_decisions / dense_rank_decisions -# so callers can read top_score_bm25 / top_score_dense without signature changes. -_last_retrieval_scores: dict = {} - -# bge-daemon: BGE cross-encoder served over Unix socket (same pattern as vec-daemon). -# Hook stays fast because the 7s model load happens ONCE in the daemon, not per -# UserPromptSubmit. Default ON; disable via CTX_CROSS_ENCODER=0 if daemon is down -# and we don't want even the 0.8s connect-timeout cost per prompt. -_BGE_SOCK = Path.home() / ".local/share/claude-vault/bge-daemon.sock" -_BGE_TIMEOUT = 2.0 # seconds — rerank 20 cands typically <80ms, give slack -_USE_CROSS_ENCODER = os.environ.get("CTX_CROSS_ENCODER", "1") != "0" - -# Small Korean-English synonym map for query expansion (Layer 2). -# Keys are case-folded. Additions expand the query token set — BM25 will match -# commits mentioning either side of each pair. 
Focused on CTX domain vocabulary -# that commonly appears in Korean prompts but English commits (and vice versa). -_SYNONYM_EXPANSION = { - "cross-session": ["long-term", "persistent", "inter-session", "장기기억"], - "long-term": ["cross-session", "persistent", "장기", "장기기억"], - "memory": ["recall", "retrieval", "기억"], - "retrieval": ["search", "recall", "fetch", "검색", "조회"], - "search": ["retrieval", "lookup", "검색"], - "hook": ["plugin", "extension", "훅"], - "embed": ["embedding", "vector", "임베딩"], - "embedding": ["embed", "vector", "임베딩"], - "rerank": ["rank", "reorder", "재정렬", "순위"], - "semantic": ["vector", "dense", "의미"], - "context": ["memory", "state", "컨텍스트"], - "prompt": ["query", "question", "프롬프트"], - "improve": ["enhance", "boost", "optimize", "개선", "향상"], - "quality": ["accuracy", "score", "품질"], - "noise": ["garbage", "irrelevant", "노이즈"], - "cluster": ["group", "dedup", "중복"], - "dashboard": ["ui", "visualization", "대시보드"], - "bootstrap": ["install", "setup", "부트스트랩"], - "gpu": ["cuda", "device", "가속"], - "claude": ["anthropic", "llm"], - "korean": ["한국어", "ko", "hangul"], - "기억": ["memory", "recall"], - "검색": ["search", "retrieval"], - "장기기억": ["long-term memory", "cross-session", "persistent"], - "의사결정": ["decision", "choice"], - "훅": ["hook", "plugin"], - "임베딩": ["embedding", "vector"], -} - - -def expand_query_tokens(query_tokens): - """Layer 2: bridge Korean<->English lexical gaps via synonym map. - Returns the original tokens + synonym expansions (capped at 2x length).""" - out = list(query_tokens) - for t in query_tokens: - syns = _SYNONYM_EXPANSION.get(t.lower()) - if syns: - out.extend(syns) - # Dedupe while preserving order - seen = set(); uniq = [] - for t in out: - k = t.lower() - if k not in seen: - seen.add(k); uniq.append(t) - return uniq[:len(query_tokens) * 2 + 5] # cap growth - - -def _bge_rerank(query: str, docs: list): - """Query the running bge-daemon for cross-encoder scores. 
- - Returns list[float] (raw logits, same length as docs) or None on failure. - Caller applies sigmoid + filtering. Fail-fast: 2s timeout keeps the hook - responsive if the daemon is wedged. - """ - if not _USE_CROSS_ENCODER or not _BGE_SOCK.exists(): - return None - try: - import socket as _sk - s = _sk.socket(_sk.AF_UNIX, _sk.SOCK_STREAM) - s.settimeout(_BGE_TIMEOUT) - s.connect(str(_BGE_SOCK)) - payload = (json.dumps({"query": query[:400], - "docs": [str(d)[:400] for d in docs]}) + "\n").encode("utf-8") - s.sendall(payload) - buf = b"" - while b"\n" not in buf: - chunk = s.recv(65536) - if not chunk: - break - buf += chunk - s.close() - resp = json.loads(buf.split(b"\n")[0].decode("utf-8")) - if resp.get("ok"): - return resp.get("scores") - except Exception: - return None - return None - - -def _vec_embed(text: str): - """Query the running vec-daemon for an embedding. Returns list[float] or None. - Uses the same Unix socket protocol as chat-memory.py; 0 if daemon is down.""" - if _VEC_DISABLED or not _VEC_SOCK.exists(): - return None - try: - import socket - s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - s.settimeout(_VEC_TIMEOUT) - s.connect(str(_VEC_SOCK)) - payload = (json.dumps({"q": text[:1000]}) + "\n").encode("utf-8") - s.sendall(payload) - buf = b"" - while b"\n" not in buf: - chunk = s.recv(8192) - if not chunk: - break - buf += chunk - s.close() - line = buf.split(b"\n")[0] - resp = json.loads(line.decode("utf-8")) - if resp.get("ok"): - return resp.get("emb") - except Exception: - return None - return None - - -def _cosine(a, b): - if not a or not b or len(a) != len(b): - return 0.0 - import math - dot = sum(x * y for x, y in zip(a, b)) - # embeddings from vec-daemon are already normalized → dot = cosine - return max(0.0, min(1.0, dot)) - - -def semantic_rerank_filter(candidates, query, top_k, alpha_bm25=0.6, - cosine_min=0.55, bm25_scores=None): - """Rerank a list of candidate items by blended BM25 + cosine semantic. 
- - candidates: list of dicts, each with a 'text' or 'subject' field - query: user query string - top_k: final count - alpha_bm25: weight of BM25 score in blend (1-alpha = semantic weight) - cosine_min: hard floor — items below this cosine get dropped even if BM25 is high - bm25_scores: optional pre-computed BM25 scores (normalized 0-1); if None, - assume candidates are already ordered by BM25 → use rank position - - Fail-safe: if vec-daemon is down, returns candidates[:top_k] (no-op). - - Layer 3 (2026-04-24): prefer BGE cross-encoder when available — it scores - (query, candidate) jointly instead of computing independent embeddings + - cosine. Much stronger semantic judgement on short commit subjects. - Falls back to bi-encoder cosine path if cross-encoder fails to load. - """ - # ── Layer 3: bge-daemon cross-encoder path (strongest semantic signal) ─── - # Calls the resident bge-daemon over Unix socket. Daemon holds BGE weights - # in GPU memory so hook pays ~50ms per rerank, not the 7s cold-load. 
- kept = [] - doc_texts = [] - for i, c in enumerate(candidates): - text = c.get("subject") or c.get("text") or "" - if not text: - continue - kept.append((i, c)) - doc_texts.append(text[:400]) - if doc_texts: - ce_scores = _bge_rerank(query, doc_texts) - if ce_scores is not None and len(ce_scores) == len(kept): - import math - def _sig(x): return 1.0 / (1.0 + math.exp(-float(x))) - rescored = [] - ce_min = 0.35 - for (i, c), s in zip(kept, ce_scores): - ce_norm = _sig(s) - if ce_norm < ce_min: - continue - bm25_norm = (bm25_scores[i] if bm25_scores else (len(candidates) - i) / max(1, len(candidates))) - blend = alpha_bm25 * bm25_norm + (1.0 - alpha_bm25) * ce_norm - rescored.append((blend, ce_norm, c)) - rescored.sort(key=lambda x: -x[0]) - if rescored: - return [c for _, _, c in rescored[:top_k]] - # CE filtered everything → fall back to bi-encoder - - # ── Bi-encoder fallback (original path) ─── - q_emb = _vec_embed(query) - if not q_emb: - return candidates[:top_k] # daemon down → no-op - - rescored = [] - for i, c in enumerate(candidates): - text = c.get("subject") or c.get("text") or "" - if not text: - continue - c_emb = _vec_embed(text[:400]) # short for speed - if not c_emb: - continue - cos = _cosine(q_emb, c_emb) - if cos < cosine_min: - continue # hard drop — semantic dissimilarity overrides BM25 rank - # Normalize BM25 to [0,1] by rank position (top = 1.0, bottom = ~0) - bm25_norm = (bm25_scores[i] if bm25_scores else (len(candidates) - i) / max(1, len(candidates))) - blend = alpha_bm25 * bm25_norm + (1.0 - alpha_bm25) * cos - rescored.append((blend, cos, c)) - rescored.sort(key=lambda x: -x[0]) - return [c for _, _, c in rescored[:top_k]] - - -# Porter stemmer (opt-in via CTX_STEM=1, default ON 2026-04-24 after G1 regression -# showed +0.034 improvement on Recall@7 with zero losses. See -# benchmarks/results/g1_regression_ctx_v2.json). -# Rationale: collapses "logs"/"logging"/"logged" → "log" so queries match stem- -# variants in commit subjects. 
Especially important for conflict-resolution -# (MAB Competency-4) where reversal vocab shifts (e.g. "rerank" → "reranking"). -_USE_STEMMER = os.environ.get("CTX_STEM", "1") != "0" -_STEMMER = None -if _USE_STEMMER: - try: - from nltk.stem.porter import PorterStemmer as _PS - _STEMMER = _PS() - except ImportError: - _STEMMER = None # stemming silently disabled if nltk not installed - - -def tokenize(text: str, drop_stopwords: bool = False): - """Preserve decimal numbers (0.724) and numeric ranges (7-30) as single tokens. - Also strips Korean particles from mixed Korean-ASCII tokens (e.g. 'BM25와' → 'bm25' + 'bm25와') - so that Korean queries match English commit subjects correctly. - - When `drop_stopwords=True` (query-side only), conversational fillers are - removed to prevent BM25 from matching on common words like "i", "to", "how", - "would", etc. Corpus tokenization never drops stopwords — IDF handles those. - - Porter stemmer (2026-04-24): adds stemmed variant for each token so "logs" - matches "logging". Preserves the original token too so exact-match precision - is never lost (dedup handles duplicates). Opt-out via CTX_STEM=0. - """ - raw = re.findall(r'\d+[-\u2013]\d+|\d+\.\d+|\w+', text.lower()) - result = [] - for tok in raw: - if drop_stopwords and tok in _STOPWORDS: - continue - cleaned = _KO_PARTICLES.sub('', tok) - if cleaned and cleaned != tok: - if not (drop_stopwords and cleaned in _STOPWORDS): - result.append(cleaned) - result.append(tok) - # Porter stem — adds a THIRD variant. Dedup at return preserves order - # so original tokens remain ranked; stem is a recall-rescue fallback. 
- if _STEMMER is not None and tok.isalpha() and len(tok) > 3: - stemmed = _STEMMER.stem(tok) - if stemmed != tok: - result.append(stemmed) - return list(dict.fromkeys(result)) - - -# ── G1: Decision Corpus ────────────────────────────────────────── - -_CONV_PREFIXES = ( - "feat:", "fix:", "refactor:", "perf:", "security:", "design:", "test:", - "feat(", "fix(", "refactor(", "perf(", +from _bm25.rerank import ( # noqa: E402 + VEC_SOCK as _VEC_SOCK, VEC_DISABLED as _VEC_DISABLED, + BGE_SOCK as _BGE_SOCK, USE_CROSS_ENCODER as _USE_CROSS_ENCODER, ) -_VERSION_RE = re.compile(r"^v\d+\.\d+") -_DECISION_KEYWORDS = ( - "pivot", "revert", "dead-end", "rejected", "chose", "switched", - "CONVERGED", "failed", "success", "fix", "improvement", - "benchmark", "eval", "decision", "iter", +from _bm25.corpus import get_decision_corpus, _classify_query_type # noqa: E402 +from _bm25.ranker import ( # noqa: E402 + hybrid_rank_decisions, last_retrieval_scores as _ranker_scores, ) -_NOISE_PREFIXES = ("# ", "wip:", "merge ", 'revert "') -_STRICT_VERSION_RE = re.compile(r"^v\d+\.\d+\.\d+") -_OMC_ITER_RE = re.compile(r"^(omc-live|live-inf)\s+iter", re.IGNORECASE) -_EMBEDDED_DECISION_RE = re.compile( - r"\s[-\u2014]\s*(feat|fix|refactor|perf|security|design|implement|add|remove|replace|switch|migrate)", - re.IGNORECASE, +from _bm25.docs_search import build_docs_bm25, hybrid_search_docs # noqa: E402 +from _bm25.code_search import ( # noqa: E402 + extract_keywords, find_db, log_retrieved_nodes, + check_and_trigger_reindex, search_graph_for_prompt, search_files_by_grep, ) -_YYYYMMDD_RE = re.compile(r"^\d{8}\s") # CTX-style: "20260408 G1 temporal..." 
- - -def _is_structural_noise(subject): - s = subject.strip() - if _OMC_ITER_RE.match(s): - return True - if _STRICT_VERSION_RE.match(s): - return not bool(_EMBEDDED_DECISION_RE.search(s)) - return False - - -def _is_decision(subject): - """Detect decision commits: conventional, version-tagged, YYYYMMDD, or keyword.""" - s = subject.strip() - if not s: - return False - sl = s.lower() - if any(sl.startswith(p) for p in _NOISE_PREFIXES): - return False - if any(sl.startswith(p) for p in _CONV_PREFIXES): - return True - if _VERSION_RE.match(s): - return True - if _YYYYMMDD_RE.match(s): # CTX-style date-prefixed commits - return True - return any(kw.lower() in sl for kw in _DECISION_KEYWORDS) - - -# ── query_type classification (for retrieval_event schema v1.1) ────────────── -_TEMPORAL_KW = frozenset([ - "when", "history", "timeline", "progression", "what happened", "progress", - "previously", "before", "after", "last time", "since", "ago", "recent", - "changed", "evolution", "how long", "session", "yesterday", "last week", - "진행", "역사", "이전", "지난", "타임라인", "최근", "변경", "이번", -]) - -def _classify_query_type(prompt: str) -> str: - """Classify prompt into TEMPORAL / KEYWORD / SEMANTIC. 
- - TEMPORAL — query is about history/timeline/progression - KEYWORD — short technical lookup (≤60 chars) or pure symbol/identifier - SEMANTIC — natural language conceptual query (default) - """ - if not prompt: - return "KEYWORD" - pl = prompt.lower() - if any(kw in pl for kw in _TEMPORAL_KW): - return "TEMPORAL" - words = pl.split() - if len(words) <= 6: - return "KEYWORD" - return "SEMANTIC" - - -def get_git_head(project_dir): - try: - r = subprocess.run( - ["git", "rev-parse", "HEAD"], - cwd=project_dir, capture_output=True, text=True, timeout=3, - ) - return r.stdout.strip() if r.returncode == 0 else None - except Exception: - return None - - -def build_decision_corpus(project_dir, n=500): - """Extract all decision commits from git log (no cap).""" - try: - r = subprocess.run( - ["git", "log", f"-{n}", "--format=%H\x1f%s\x1f%ai"], - cwd=project_dir, capture_output=True, text=True, timeout=10, - ) - if r.returncode != 0: - return [] - except Exception: - return [] - - corpus = [] - seen = set() - for line in r.stdout.strip().split("\n"): - if not line.strip(): - continue - parts = line.strip().split("\x1f", 2) - if len(parts) < 2: - continue - commit_hash = parts[0] - subject = parts[1][:120] - date = parts[2][:10] if len(parts) == 3 else "" - - if _is_structural_noise(subject): - continue - key = subject[:60] - if key in seen: - continue - seen.add(key) - - if _is_decision(subject): - # 고우선순위 패턴 → text 중복 삽입으로 BM25 가중치 증폭 - is_milestone = any(p in subject for p in [ - "CONVERGED", "pivot", "완성", "완료", "검증", "수렴", "FAILED", "KILL" - ]) - text = f"{date} {subject}" - if is_milestone: - text = f"{text}\n{text}" # 2배 가중치 - corpus.append({ - "hash": commit_hash, - "subject": subject, - "date": date, - "text": text, - }) - - return corpus - - -def get_decision_corpus(project_dir): - """Return cached corpus or rebuild if git HEAD changed. 
- - Extended (2026-04-26): also pre-embeds corpus items via vec-daemon and caches - embeddings in the same file under an 'emb_head' sentinel. Embeddings allow - dense first-stage retrieval (dense_rank_decisions) without per-query N socket - calls. Falls back gracefully: if vec-daemon is down, items lack 'emb' field - and dense_rank_decisions returns []. - """ - cache_path = Path(project_dir) / ".omc" / "decision_corpus.json" - head = get_git_head(project_dir) - - if cache_path.exists() and head: - try: - cached = json.loads(cache_path.read_text()) - if cached.get("head") == head: - corpus = cached["corpus"] - # Check if embeddings are fresh for this HEAD - if cached.get("emb_head") != head: - n = embed_corpus_items(corpus) - if n > 0: - cache_path.write_text(json.dumps({ - "head": head, "corpus": corpus, "emb_head": head - })) - return corpus - except Exception: - pass - - corpus = build_decision_corpus(project_dir) - if head and corpus: - try: - cache_path.parent.mkdir(exist_ok=True) - embed_corpus_items(corpus) - cache_path.write_text(json.dumps({ - "head": head, "corpus": corpus, "emb_head": head - })) - except Exception: - pass - return corpus - - -def embed_corpus_items(corpus): - """Add 'emb' field to corpus items using vec-daemon. Modifies in-place. - - Only embeds items missing 'emb'. Returns count of newly embedded items. - Fail-safe: if vec-daemon is down, items are left without 'emb' and - dense_rank_decisions will return [] (BM25-only fallback). - """ - embedded = 0 - for item in corpus: - if item.get("emb"): - continue - text = (item.get("subject") or item.get("text") or "")[:400] - if not text: - continue - emb = _vec_embed(text) - if emb: - item["emb"] = emb - embedded += 1 - return embedded - - -def dense_rank_decisions(corpus, query, top_k=20): - """Dense first-stage retrieval: cosine similarity between query embedding - and pre-computed corpus embeddings (from embed_corpus_items). 
- - Returns top-k items by cosine, or [] if vec-daemon unavailable or corpus - has no embeddings (BM25-only fallback). - """ - q_emb = _vec_embed(query) - if not q_emb: - return [] - scored = [] - for item in corpus: - emb = item.get("emb") - if not emb: - continue - cos = _cosine(q_emb, emb) - if cos > 0.0: - scored.append((cos, item)) - if not scored: - return [] - scored.sort(key=lambda x: -x[0]) - _last_retrieval_scores["dense_top"] = float(scored[0][0]) - return [item for _, item in scored[:top_k]] - - -def rrf_merge(list_a, list_b, k_rrf=60): - """Reciprocal Rank Fusion of two ranked lists. - - k_rrf=60: optimal constant per BEIR paper (arXiv:2104.08663) — controls - score distribution across rank positions. - - Uses commit 'hash' as dedup key; falls back to first-20-chars of 'text'. - Returns merged list ordered by RRF score (descending). - """ - scores = {} - hash_to_item = {} - - def _key(item): - return item.get("hash") or (item.get("text") or "")[:20] - - for rank, item in enumerate(list_a, 1): - k = _key(item) - scores[k] = scores.get(k, 0.0) + 1.0 / (k_rrf + rank) - hash_to_item[k] = item - - for rank, item in enumerate(list_b, 1): - k = _key(item) - scores[k] = scores.get(k, 0.0) + 1.0 / (k_rrf + rank) - hash_to_item[k] = item - - merged_keys = sorted(scores.keys(), key=lambda h: -scores[h]) - return [hash_to_item[h] for h in merged_keys] - - -def bm25_rank_decisions(corpus, query, top_k=7, min_score=0.5, - adaptive_floor_ratio=0.35, mmr_jaccard_threshold=0.70, - skip_rerank=False): - """BM25-rank decision corpus against query, return top-k. - - Stopwords are dropped from the query (not the corpus) so conversational - fillers like "i/to/how/would" don't dominate the ranking. - - `min_score`: if the best-matching decision scores below this, return []. - Prevents the "no-topic-match → fallback to most-recent-7" anti-pattern - where zero-score or near-zero queries got ranked purely by git-log order. 
- - `adaptive_floor_ratio` (NEW 2026-04-24): candidates below - top_score * adaptive_floor_ratio are dropped. Eliminates the - "surface-token match" noise where a hit scores just above min_score - but is 3-5× worse than the actual best hit (e.g., 'iter 47/∞: token%' - scoring 1.2 when the real match scores 4.0). - - `mmr_jaccard_threshold` (NEW 2026-04-24): if a candidate's token set has - Jaccard similarity >= threshold with any already-selected item, skip it. - Collapses clustered noise like multiple 'live-infinite iter N/∞' entries - that are near-duplicates — keeps only the best of each cluster. - """ - if not corpus: - return [] - if not HAS_BM25 or not query.strip(): - return [] - - query_tokens = tokenize(query, drop_stopwords=True) - if not query_tokens: - return [] - - # Layer 2 (2026-04-24): synonym expansion to bridge KO↔EN + concept gaps - # (e.g. "cross-session memory" now matches "persistent long-term 장기기억" too). - query_tokens = expand_query_tokens(query_tokens) - - tokenized = [tokenize(c["text"]) for c in corpus] - bm25 = BM25Okapi(tokenized) - scores = bm25.get_scores(query_tokens) - if len(scores) == 0 or float(max(scores)) < min_score: - return [] - - top_score = float(max(scores)) - _last_retrieval_scores["bm25_top"] = top_score - adaptive_floor = max(min_score, top_score * adaptive_floor_ratio) - - ranked_idx = sorted(range(len(corpus)), key=lambda i: scores[i], reverse=True) - - # Cluster signature: normalizes "live-infinite iter N/∞: goal_vM" boilerplate - # so different iter-numbers don't escape MMR dedup (MEMORY.md: "live-inf iter - # N/∞ topic-dedup collapse" known issue). 
- import re as _re - def _cluster_sig(subject: str) -> str: - s = subject.lower() - s = _re.sub(r'\b\d{4,}\b|\b\d+/\d+\b|\b\d+/∞\b|goal_v\d+', '', s) - s = _re.sub(r'iter\s*\d+', 'iter', s) - s = _re.sub(r'[^a-z가-힣\s]', ' ', s) - s = _re.sub(r'\s+', ' ', s).strip() - # First 4 distinctive words form the cluster signature - return ' '.join(s.split()[:4]) - - selected = [] - selected_token_sets = [] - selected_cluster_sigs = set() - for idx in ranked_idx: - if scores[idx] < adaptive_floor: - break - cand_tokens = set(tokenized[idx]) - if not cand_tokens: - continue - cand_sig = _cluster_sig(corpus[idx].get("subject", corpus[idx].get("text", ""))) - # Cluster dedup: skip if any selected item has the same normalized sig - if cand_sig and cand_sig in selected_cluster_sigs: - continue - # MMR-lite: skip if too similar to already-selected items - is_near_dup = False - for prev_tokens in selected_token_sets: - union = cand_tokens | prev_tokens - if not union: - continue - jaccard = len(cand_tokens & prev_tokens) / len(union) - if jaccard >= mmr_jaccard_threshold: - is_near_dup = True - break - if is_near_dup: - continue - selected.append(corpus[idx]) - selected_token_sets.append(cand_tokens) - if cand_sig: - selected_cluster_sigs.add(cand_sig) - # Keep 2x the target so semantic rerank has room to re-order/filter - if len(selected) >= top_k * 2: - break - # Layer 1 (2026-04-24): lowered gate — rerank fires for 60%+ of queries now, - # was ~30% with `> top_k` (many queries returned exactly top_k candidates). - if not skip_rerank and len(selected) >= top_k + 2: - selected = semantic_rerank_filter(selected, query, top_k=top_k) - return selected[:top_k] - - -def hybrid_rank_decisions(corpus, query, top_k=7): - """Hybrid BM25+dense retrieval with RRF merge — SOTA method per MAB/LongMemEval. - - Pipeline (2026-04-26): - 1. BM25 top-(top_k*2) with MMR/cluster dedup, NO semantic rerank yet - 2. Dense top-(top_k*2) using pre-embedded corpus via vec-daemon cosine - 3. 
RRF merge (k=60) — union of both candidate pools - 4. Semantic rerank (BGE cross-encoder → vec-daemon bi-encoder fallback) - - Advantage over BM25-only: recovers nodes that BM25 misses entirely (zero score) - but are semantically close to the query (e.g. synonyms, paraphrases, concept drift). - - Fail-safe: if dense_rank_decisions() returns [] (vec-daemon down or no embeddings), - falls back to BM25-only + semantic rerank (existing behavior). - """ - # Step 1: BM25 candidates (skip rerank here — we'll do it after RRF) - bm25_cands = bm25_rank_decisions( - corpus, query, top_k=top_k * 2, - skip_rerank=True - ) - if not bm25_cands: - return [] - - # Step 2: Dense candidates - dense_cands = dense_rank_decisions(corpus, query, top_k=top_k * 2) - - if not dense_cands: - # Dense unavailable — fall back to BM25 with rerank - if len(bm25_cands) >= top_k + 2: - bm25_cands = semantic_rerank_filter(bm25_cands, query, top_k=top_k) - return bm25_cands[:top_k] - - # Step 3: RRF merge - merged = rrf_merge(bm25_cands, dense_cands, k_rrf=60) - - # Step 4: Semantic rerank on merged pool - if len(merged) >= top_k + 2: - merged = semantic_rerank_filter(merged, query, top_k=top_k) - - return merged[:top_k] - - -# ── G2: Docs BM25 ──────────────────────────────────────────────── - -def _extra_doc_files(project_dir): - """Return extra files to include in the docs index (project-agnostic).""" - # MEMORY.md: ~/.claude/projects/{slug}/memory/MEMORY.md - # Claude uses leading-dash slug: /home/foo/bar → -home-foo-bar - slug = project_dir.replace("/", "-") - memory_md = os.path.expanduser(f"~/.claude/projects/{slug}/memory/MEMORY.md") - candidates = [ - os.path.join(project_dir, "CLAUDE.md"), - os.path.join(project_dir, "README.md"), - memory_md, - ] - return [p for p in candidates if os.path.exists(p)] - - -def chunk_document(filename, content): - """Split by ## headers; each chunk = 'filename § header\\nbody'.""" - chunks = [] - parts = re.split(r"\n(?=## )", content) - for part in parts: - 
part = part.strip() - if not part: - continue - lines = part.split("\n", 1) - header = re.sub(r"^#+\s*", "", lines[0].strip()) - body = lines[1].strip() if len(lines) > 1 else "" - text = f"{filename} § {header}\n{body}" - if len(text) > 50: - chunks.append(text[:2500]) - return chunks - - -def build_docs_bm25(project_dir): - """Build BM25 index over docs/research/*.md + CLAUDE.md + MEMORY.md. - Strategy: full-doc (no chunking) — A/B test 2026-04-11 confirms +9.1% recall@5 - vs header-chunked approach (0.758 vs 0.667 on 33 paraphrase pairs). - Full-doc wins on temporal/open-set/perf queries where answers span multiple sections. - """ - all_units = [] - docs_dir = Path(project_dir) / "docs" / "research" - if docs_dir.exists(): - for md_file in sorted(docs_dir.glob("*.md")): - try: - text = f"{md_file.name}\n{md_file.read_text()}" - if len(text) > 50: - all_units.append(text) - except Exception: - pass - - for fpath in _extra_doc_files(project_dir): - try: - p = Path(fpath) - text = f"{p.name}\n{p.read_text()}" - if len(text) > 50: - all_units.append(text) - except Exception: - pass - - if not all_units or not HAS_BM25: - return None, [] - tokenized = [tokenize(u) for u in all_units] - return BM25Okapi(tokenized), all_units - - - -# Korean→English expansion for G2-DOCS BM25 path (iter 44). -# Docs corpus is English; Korean queries must be expanded to match. -# These are CTX/ML domain terms that appear frequently in research docs. 
-_KO_EN_DOCS = { - "하이브리드": "hybrid", "밀집": "dense", "검색": "search,retrieve", - "재색인": "reindex", "인용": "citation", "거짓": "false", - "양성": "positive", "시멘틱": "semantic", "지연": "latency", - "시간": "time,latency", "수준": "tier,level", - "벡터": "vector,embedding", "마이그레이션": "migration", - "임베딩": "embedding", "벤치마크": "benchmark,eval", - "메모리": "memory", "코드베이스": "codebase", - "데이터베이스": "database", "오래된": "stale,staleness", - "측정": "measure,probe", "비율": "rate,ratio", - "성능": "performance,latency", "업그레이드": "upgrade", - "노드": "node", "병합": "merge", "구현": "implementation", - "분석": "analysis,evaluation", "아키텍처": "architecture", - "평가": "eval,evaluate,benchmark", "프레임워크": "framework", - "알고리즘": "algorithm", "최적화": "optimize,optimization", - "자동": "auto,automatic", "색인": "index", "인덱스": "index", -} - - -def _expand_ko_en_docs(tokens): - """Expand Korean tokens via _KO_EN_DOCS for G2-DOCS BM25 queries.""" - expanded = list(tokens) - for t in tokens: - mapping = _KO_EN_DOCS.get(t) - if mapping: - expanded.extend(mapping.split(",")) - return list(dict.fromkeys(expanded)) - - -def bm25_search_docs(project_dir, query, top_k=5): - """Return top-k docs most relevant to query (full-doc BM25, no chunking). - Query-side stopword filter prevents conversational fillers from dominating. - Korean queries are expanded via _KO_EN_DOCS before scoring (iter 44). 
- """ - if not query.strip(): - return [] - bm25, units = build_docs_bm25(project_dir) - if not bm25: - return [] - query_tokens = tokenize(query, drop_stopwords=True) - query_tokens = _expand_ko_en_docs(query_tokens) # Korean→English expansion - if not query_tokens: - return [] - scores = bm25.get_scores(query_tokens) - ranked = sorted(range(len(units)), key=lambda i: scores[i], reverse=True) - # threshold=1.0: full-doc scores for relevant queries are 3.0-6.0; 0.0 = no overlap - # adaptive floor (2026-04-24): also drop anything below 35% of top score - top_score = float(max(scores)) if len(scores) else 0.0 - floor = max(1.0, top_score * 0.35) - bm_filtered = [units[i] for i in ranked[:top_k * 2] if scores[i] >= floor] - # Semantic rerank (2026-04-24 iter 2): dedupes BM25-surface hits from different meanings - if len(bm_filtered) > top_k: - # Each unit is "filename\ncontent"; wrap as dict for the reranker - cand_dicts = [{"subject": u.split("\n", 1)[0], "text": u[:400]} for u in bm_filtered] - reranked_dicts = semantic_rerank_filter(cand_dicts, query, top_k=top_k) - # Map back to original units by subject (filename) - subject_to_unit = {u.split("\n", 1)[0]: u for u in bm_filtered} - return [subject_to_unit[d["subject"]] for d in reranked_dicts if d["subject"] in subject_to_unit] - return bm_filtered[:top_k] - - -# ── G2-DOCS: Hybrid BM25+Dense Search ─────────────────────────── - -_docs_emb_cache_state = {} # in-memory: {"key": str, "units_emb": [...]} - - -def _docs_cache_key(units): - """Stable cache key based on doc filenames (sorted join → simple hash).""" - filenames = sorted(u.split("\n", 1)[0] for u in units) - key_str = "|".join(filenames) - # stdlib-only fingerprint: sum of char ords mod 10^10 - return str(sum(ord(c) * (i + 1) for i, c in enumerate(key_str)) % (10 ** 10)) - - -def embed_docs_units(units, cache_path): - """Pre-embed docs corpus. Returns list of dicts: - {"hash": filename, "text": unit_string, "emb": list_or_[]}. 
- - Caches to cache_path; invalidates when doc set changes. - Fail-safe: items without embedding skip dense but still contribute via BM25. - """ - key = _docs_cache_key(units) - - if _docs_emb_cache_state.get("key") == key: - return _docs_emb_cache_state["units_emb"] - - if cache_path.exists(): - try: - cached = json.loads(cache_path.read_text()) - if cached.get("key") == key: - _docs_emb_cache_state.update(cached) - return cached["units_emb"] - except Exception: - pass - - # Embed each unit (filename + first 400 chars as subject) - units_emb = [] - for u in units: - filename = u.split("\n", 1)[0] - preview = u[:400] - emb = _vec_embed(preview) - units_emb.append({"hash": filename, "text": u, "emb": emb or []}) - - try: - cache_path.parent.mkdir(parents=True, exist_ok=True) - cache_path.write_text(json.dumps({"key": key, "units_emb": units_emb})) - except Exception: - pass - - _docs_emb_cache_state.update({"key": key, "units_emb": units_emb}) - return units_emb - - -def dense_rank_docs(units_emb, query, top_k=10): - """Dense first-stage retrieval for docs corpus. - - units_emb: list of {"hash": filename, "text": unit_str, "emb": list} - Returns top_k dicts ranked by cosine similarity, or [] if vec-daemon down. - """ - q_emb = _vec_embed(query) - if not q_emb: - return [] - scored = [] - for item in units_emb: - emb = item.get("emb") - if not emb: - continue - cos = _cosine(q_emb, emb) - if cos > 0.0: - scored.append((cos, item)) - if not scored: - return [] - scored.sort(key=lambda x: -x[0]) - _last_retrieval_scores["dense_top"] = float(scored[0][0]) - return [item for _, item in scored[:top_k]] - - -def hybrid_search_docs(project_dir, query, top_k=5): - """Hybrid BM25+dense RRF search over docs/research/*.md corpus. - - Same pipeline as hybrid_rank_decisions() for G1: - 1. BM25 top-(top_k*2) candidates (threshold filtered) - 2. Dense top-(top_k*2) via pre-embedded corpus (vec-daemon cosine) - 3. RRF merge (k=60) - 4. 
Semantic rerank (BGE/vec-daemon) on merged pool - - Fail-safe: dense unavailable → BM25+semantic rerank (existing behavior). - Returns list of unit strings — same format as bm25_search_docs(). - """ - bm25, units = build_docs_bm25(project_dir) - if not bm25 or not units or not query.strip(): - return [] - - query_tokens = tokenize(query, drop_stopwords=True) - query_tokens = _expand_ko_en_docs(query_tokens) # Korean→English expansion (iter 44) - if not query_tokens: - return [] - - # Step 1: BM25 candidates - scores = bm25.get_scores(query_tokens) - top_score = float(max(scores)) if len(scores) else 0.0 - _last_retrieval_scores["bm25_top"] = top_score - if top_score < 1.0: - return [] - floor = max(1.0, top_score * 0.35) - ranked = sorted(range(len(units)), key=lambda i: scores[i], reverse=True) - bm25_filtered = [units[i] for i in ranked[:top_k * 2] if scores[i] >= floor] - if not bm25_filtered: - return [] - - bm25_dicts = [{"hash": u.split("\n", 1)[0], "text": u} for u in bm25_filtered] - - # Step 2: Dense candidates (pre-embedded corpus, 1 vec-daemon call for query) - cache_path = Path(project_dir) / ".omc" / "docs_corpus_emb.json" - units_emb = embed_docs_units(units, cache_path) - dense_dicts = dense_rank_docs(units_emb, query, top_k=top_k * 2) - - if not dense_dicts: - # Fallback: BM25 + semantic rerank - if len(bm25_filtered) > top_k: - cand_dicts = [{"subject": u.split("\n", 1)[0], "text": u[:400]} - for u in bm25_filtered] - reranked = semantic_rerank_filter(cand_dicts, query, top_k=top_k) - subj_map = {u.split("\n", 1)[0]: u for u in bm25_filtered} - return [subj_map[d["subject"]] for d in reranked if d["subject"] in subj_map] - return bm25_filtered[:top_k] - - # Step 3: RRF merge - merged = rrf_merge(bm25_dicts, dense_dicts, k_rrf=60) - - # Step 4: Semantic rerank on merged pool - if len(merged) >= top_k + 2: - reranked = semantic_rerank_filter(merged, query, top_k=top_k) - return [item.get("text", "") for item in reranked if item.get("text")] - - 
return [item.get("text", "") for item in merged[:top_k] if item.get("text")] - - -# ── G2: Code File Discovery ────────────────────────────────────── - -_STOP_WORDS = { - "the","a","an","is","are","was","were","be","been","have","has","had", - "do","does","did","will","would","could","should","may","might","can", - "to","of","in","for","on","with","at","by","from","as","into", - "it","this","that","i","you","he","she","we","they","me", - "and","or","but","not","no","if","then","else","when","where","how","what", - "해줘","해","바람","좀","것","수","있","없","하다","되다","이","그","저","뭐","어떻게", - "기능","작업","관련","파일","코드","문서","수정","추가","변경","확인","돌려봐", - "올려","실행","해봐","분석","개선","확인해", -} -_KO_EN = { - "검색": "search,retrieve,find", "엔진": "engine,retriever", - "벤치마크": "benchmark,eval", "평가": "eval,evaluate", - "트리거": "trigger", "분류": "classify,classifier", - "밀도": "dense,density", "테스트": "test", - "결과": "result", "스코어": "score", - "그래프": "graph", "다운스트림": "downstream", - "외부": "external,reeval", "정확도": "accuracy,precision", - "이메일": "email,mail", "발송": "send,outreach", - "대시보드": "dashboard,admin", "구독": "subscription,subscribe", - "인증": "auth,authenticate", "로그인": "login,signin", - "사용자": "user,member", "데이터베이스": "database,schema", - "함수": "function,handler", "컴포넌트": "component", - "페이지": "page,route", "설정": "config,settings", - "환경": "env,environment", "서버": "server,backend", - "실험": "experiment,trial", "배포": "deploy,deployment", - "오류": "error,exception", "버그": "bug,error", - "성능": "performance,latency", "최적화": "optimize,cache", - "알림": "notification,alert", "권한": "permission,auth", - "훅": "hook", "메모리": "memory", "인덱스": "index", -} -_CODE_EXT = { - ".py", ".js", ".ts", ".tsx", ".jsx", ".go", ".rs", ".java", - ".sh", ".bash", ".yaml", ".yml", ".toml", ".sql", ".css", ".html", - ".c", ".cpp", ".h", ".rb", ".php", ".swift", ".kt", -} -_SKIP_PREFIXES = (".omc/", "docs/", "benchmarks/results/", "tests/fixtures/") - - -def extract_keywords(prompt): - """Extract meaningful keywords 
from prompt; expand Korean→English.""" - words = re.findall(r'[a-zA-Z_][a-zA-Z0-9_]{2,}|[가-힣]{2,}', prompt) - keywords = [] - for w in words: - if w.lower() in _STOP_WORDS or len(w) < 2: - continue - if re.match(r'[가-힣]', w) and w in _KO_EN: - keywords.extend(_KO_EN[w].split(",")) - else: - keywords.append(w) - return keywords[:8] - - -def find_db(project_dir): - """Locate codebase-memory-mcp SQLite DB for this project.""" - cache_dir = os.path.expanduser("~/.cache/codebase-memory-mcp") - if not os.path.isdir(cache_dir): - return None - slug = project_dir.replace("/", "-").lstrip("-") - db_path = os.path.join(cache_dir, f"{slug}.db") - if os.path.exists(db_path): - return db_path - for f in os.listdir(cache_dir): - if f.endswith(".db") and os.path.basename(project_dir).lower() in f.lower(): - return os.path.join(cache_dir, f) - return None - - -_REINDEX_LOCK = os.path.expanduser("~/.cache/codebase-memory-mcp/.reindex_in_progress") -_STALE_THRESHOLD_HOURS = 24 - -# ── Citation Probe (iter 40) ────────────────────────────────────────────────── -# Logs retrieved nodes per turn to .omc/retrieval_log.jsonl. -# A separate analysis script (benchmarks/eval/citation_probe.py) cross-references -# these logs with vault.db chat history to compute actual citation rate per node type. -# Goal: measure what fraction of retrieved G1/G2 nodes Claude actually cites in responses. - -def log_retrieved_nodes(project_dir, session_id, prompt, block, items): - """ - Append a retrieval event to .omc/retrieval_log.jsonl. 
- - Args: - project_dir: project root path - session_id: Claude session ID (from input_data) - prompt: user prompt (first 120 chars stored) - block: "g1_decisions" | "g2_docs" | "g2_prefetch" | "g2_hooks" - items: list of dicts, each with at minimum {"id": str, "text": str} - g1: {"id": hash, "text": subject, "date": date} - g2_docs: {"id": filename, "text": unit_preview} - g2_prefetch: {"id": fpath, "text": f"{label}:{name}"} - """ - if not items: - return - try: - import time as _t - log_path = os.path.join(project_dir, ".omc", "retrieval_log.jsonl") - entry = { - "ts": _t.time(), - "session_id": session_id, - "prompt_prefix": prompt[:120], - "block": block, - "items": items[:10], # cap at 10 per block - } - with open(log_path, "a", encoding="utf-8") as f: - f.write(json.dumps(entry, ensure_ascii=False) + "\n") - except Exception: - pass # citation probe is non-critical — never break the main hook - - -def check_and_trigger_reindex(project_dir, db_path): - """ - Check if codebase-memory-mcp DB is stale (>24h). If so, spawn an incremental - reindex in the background (non-blocking). Returns a warning string if stale, - or None if fresh. - - Uses a lock file to prevent multiple concurrent reindex launches. 
- Tool: codebase-memory-mcp cli index_repository '{"repo_path":"...", "mode":"fast"}' - """ - try: - import time as _t_mod - age_hours = (_t_mod.time() - os.path.getmtime(db_path)) / 3600 - except OSError: - return None - - if age_hours < _STALE_THRESHOLD_HOURS: - return None # fresh — no action needed - - age_str = f"{age_hours:.0f}h" if age_hours < 48 else f"{age_hours/24:.1f}d" - - # Check if reindex already running (lock file < 10 min old) - if os.path.exists(_REINDEX_LOCK): - try: - import time as _t_mod - lock_age = (_t_mod.time() - os.path.getmtime(_REINDEX_LOCK)) / 60 - if lock_age < 10: - return f"⚠ G2-CODE DB stale ({age_str}) — reindex already running" - except OSError: - pass - - # Spawn background reindex - try: - import json as _json - args = _json.dumps({"repo_path": project_dir, "mode": "fast"}) - cmd = ["codebase-memory-mcp", "cli", "index_repository", args] - subprocess.Popen( - cmd, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - start_new_session=True, # detach from hook process group - ) - # Touch lock file - open(_REINDEX_LOCK, "w").close() - return f"⚠ G2-CODE DB stale ({age_str}) — auto-reindex triggered (fast mode, background)" - except Exception: - return f"⚠ G2-CODE DB stale ({age_str}) — run: codebase-memory-mcp cli index_repository to reindex" - - -def search_graph_for_prompt(db_path, keywords, limit=5): - """Query codebase graph nodes matching keywords.""" - if not keywords: - return [] - try: - import sqlite3 - db = sqlite3.connect(db_path) - results, seen = [], set() - for kw in keywords: - rows = db.execute( - "SELECT DISTINCT label, name, file_path FROM nodes " - "WHERE name LIKE ? AND label IN ('Function','Method','Class') " - "ORDER BY length(name) ASC LIMIT ?", - (f"%{kw}%", 3), - ).fetchall() - for r in rows: - key = (r[1], r[2]) - if key not in seen: - seen.add(key) - results.append(r) - if len(results) < limit: - frows = db.execute( - "SELECT DISTINCT label, name, file_path FROM nodes " - "WHERE file_path LIKE ? 
AND label IN ('Module','File') " - "ORDER BY length(file_path) ASC LIMIT ?", - (f"%{kw}%", 2), - ).fetchall() - for r in frows: - key = (r[1], r[2]) - if key not in seen: - seen.add(key) - results.append(r) - db.close() - return results[:limit] - except Exception: - return [] - - -def search_files_by_grep(project_dir, keywords, limit=5): - """Fallback: git grep -c to rank files by keyword match count.""" - long_kws = [k for k in keywords if len(k) >= 4 and not re.match(r'[가-힣]', k)] - if not long_kws: - return [] - try: - pattern = "|".join(re.escape(k) for k in long_kws[:4]) - r = subprocess.run( - ["git", "grep", "-c", "-E", "-i", "--", pattern], - cwd=project_dir, capture_output=True, text=True, timeout=3, - ) - if r.returncode != 0: - return [] - scored = [] - for line in r.stdout.strip().split("\n"): - if not line.strip(): - continue - try: - fpath, count = line.rsplit(":", 1) - scored.append((int(count), fpath.strip())) - except ValueError: - continue - scored.sort(key=lambda x: -x[0]) - files = [f for _, f in scored] - code = [ - f for f in files - if any(f.endswith(ext) for ext in _CODE_EXT) - and not any(f.startswith(p) for p in _SKIP_PREFIXES) - ] - return code[:limit] - except Exception: - return [] - - -# ── G2: Hooks File Discovery ───────────────────────────────────── - -_HOOKS_DIR = Path.home() / ".claude" / "hooks" -_HOOKS_TRIGGER_KWS = frozenset({ - # English - "hook", "hooks", "bm25-memory", "bm25_memory", "git-memory", "git_memory", - "auto-index", "auto_index", "g2-augment", "g2_augment", - "userPromptSubmit", "sessionstart", "posttooluse", - # Korean - "훅", "후크", -}) - - -def _build_hook_doc(py_path: Path) -> str: - """Extract file name + docstring + function/class signatures from a hook file.""" - try: - src = py_path.read_text(errors="replace") - except Exception: - return "" - lines = src.split("\n") - header_lines = [] - # Collect: module docstring (first triple-quoted block) + def/class lines - in_docstring = False - docstring_done = False 
- for line in lines[:80]: - stripped = line.strip() - if not docstring_done: - if stripped.startswith('"""') or stripped.startswith("'''"): - in_docstring = not in_docstring - header_lines.append(stripped[:200]) - if stripped.count('"""') >= 2 or stripped.count("'''") >= 2: - in_docstring = False - docstring_done = True - continue - if in_docstring: - header_lines.append(stripped[:200]) - if '"""' in stripped or "'''" in stripped: - in_docstring = False - docstring_done = True - continue - else: - docstring_done = True - if stripped.startswith("def ") or stripped.startswith("class "): - header_lines.append(stripped[:120]) - return f"{py_path.name}\n" + "\n".join(header_lines) - - -def search_hooks_files(query: str, limit: int = 3): - """BM25-search ~/.claude/hooks/*.py for hook function/filename matches.""" - if not _HOOKS_DIR.exists() or not HAS_BM25: - return [] - py_files = sorted(_HOOKS_DIR.glob("*.py")) - if not py_files: - return [] - docs = [(p, _build_hook_doc(p)) for p in py_files] - docs = [(p, d) for p, d in docs if d] - if not docs: - return [] - tokenized = [tokenize(d) for _, d in docs] - bm25 = BM25Okapi(tokenized) - scores = bm25.get_scores(tokenize(query)) - ranked = sorted(range(len(docs)), key=lambda i: scores[i], reverse=True) - return [(docs[i][0], scores[i]) for i in ranked[:limit] if scores[i] > 0] - - -def _has_hooks_keywords(prompt: str) -> bool: - """Return True if prompt mentions hook-related terms.""" - low = prompt.lower() - return any(kw in low for kw in _HOOKS_TRIGGER_KWS) - - -# ── Rich Mode: World Model ──────────────────────────────────────── - -def get_world_model(project_dir): - """Load dead-ends and facts from .omc/world-model.json (--rich mode).""" - wm_path = Path(project_dir) / ".omc" / "world-model.json" - if not wm_path.exists(): - return [], [] - try: - wm = json.loads(wm_path.read_text()) - except Exception: - return [], [] - raw_de = wm.get("dead_ends", []) - if isinstance(raw_de, dict): - raw_de = [] - dead_ends = [ - 
f" x {de.get('goal','')[:60]} -- {de.get('reason','')[:80]}" - for de in raw_de[-5:] - ] - facts = [] - for fact in wm.get("known_facts", []): - if isinstance(fact, dict): - facts.append(f" * {fact['fact'][:80]}") - elif isinstance(fact, str) and not any( - fact.startswith(p) for p in ("paper:", "README:", "uncertain:") - ): - facts.append(f" * {fact[:80]}") - return dead_ends, facts[-8:] - - -# ── Session Decisions ───────────────────────────────────────────── - -def get_session_decisions(project_dir): - """Read .omc/session-decisions.md for uncommitted decisions.""" - p = Path(project_dir) / ".omc" / "session-decisions.md" - if not p.exists(): - return [] - try: - lines = p.read_text().strip().split("\n") - return [l.strip() for l in lines if l.strip().startswith(">")][-5:] - except Exception: - return [] - - -# ── Pending Decisions (from previous session Stop hook) ────────── - -def consume_pending_decisions(project_dir: str): - """ - 이전 세션 stop-decision-capture.py가 남긴 .pending-decisions.json 읽고 삭제. - Returns list of formatted strings for additionalContext injection, or []. 
- """ - slug = project_dir.replace("/", "-") - pending_path = Path.home() / ".claude" / "projects" / slug / "memory" / ".pending-decisions.json" - if not pending_path.exists(): - return [] +from _bm25.hooks_search import search_hooks_files, _has_hooks_keywords # noqa: E402 +from _bm25.session import ( # noqa: E402 + get_world_model, get_session_decisions, consume_pending_decisions, +) +from _bm25.injection import write_injection_record # noqa: E402 +from _bm25.output import build_header_lines, emit_output # noqa: E402 - try: - payload = json.loads(pending_path.read_text(encoding="utf-8")) - decisions = payload.get("decisions", []) - captured_at = payload.get("captured_at", "")[:16] # "2026-04-14T12:34" - except Exception: - try: - pending_path.unlink(missing_ok=True) - except Exception: - pass - return [] +RICH = "--rich" in sys.argv - # 읽은 즉시 삭제 (1회성 전달) - try: - pending_path.unlink(missing_ok=True) - except Exception: - pass +# _last_retrieval_scores: alias to ranker's module-level dict so orchestrator +# can clear/read it without changing call sites. +_last_retrieval_scores = _ranker_scores - if not decisions: - return [] - lines = [f"[PENDING MEMORY] 이전 세션({captured_at}) 결정 후보 — MEMORY.md 업데이트 필요 여부 판단:"] - for d in decisions: - tag = d.get("tag", "") - ctx = d.get("context", "") - lines.append(f" [{tag}] {ctx}") - lines.append("→ 위 항목 중 MEMORY.md에 없는 중요한 결정이 있으면 즉시 Edit tool로 기록하세요.") - return lines +_HNAME = "bm25-memory" +# ── Telemetry gate — evaluated once at module load ─────────────────────────── +# Checking os.environ + Path.home() on every _log_event call adds latency even +# when telemetry is disabled. Cache the result at import time so disabled path +# is a single bool check with no I/O. 
+_TELEMETRY_ENABLED: bool = ( + os.environ.get("CTX_TELEMETRY") == "1" + or (Path.home() / ".claude" / "ctx-telemetry.enabled").exists() +) +_log_event_impl = None # lazy import; set on first enabled call -# ── Main ────────────────────────────────────────────────────────── def _log_event(event_type, payload): - """Opt-in telemetry wrapper — silent no-op if gate off. Never breaks hook path.""" + """Opt-in telemetry wrapper — zero-cost early return when gate is off. + Automatically injects hook=_HNAME so callers don't repeat it.""" + if not _TELEMETRY_ENABLED: + return + global _log_event_impl try: - sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) - from _ctx_telemetry import log_event - log_event(event_type, payload) + if _log_event_impl is None: + sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + from _ctx_telemetry import log_event as _impl + _log_event_impl = _impl + merged = {"hook": _HNAME, **(payload or {})} + _log_event_impl(event_type, merged) except Exception: pass @@ -1428,8 +111,22 @@ def main(): except Exception: pass lines = [] - _blocks_fired = [] # for final hook_invoked telemetry summary + _blocks_fired = [] # for final hook_complete telemetry summary + _fallback_reasons: list[str] = [] # accumulated fallback tags for hook_complete _retrieval_meta = {"ts": _time.time(), "blocks": {}} # retrieval_event telemetry + _query_type: str = "" + + # Classify query type early (used in hook_complete and prompt_received) + try: + _query_type = _classify_query_type(prompt) + except Exception: + _query_type = "unknown" + + # Optional: prompt_received event (lightweight, emitted unconditionally) + _log_event("prompt_received", { + "query_type": _query_type, + "prompt_len": len(prompt) if prompt else 0, + }) # 0a. 
Pending decisions from previous session (Stop hook → queue file) pending = consume_pending_decisions(project_dir) @@ -1442,6 +139,12 @@ def main(): lines.append("[SESSION NOTES (미커밋 판단)]") lines.extend(session_notes) + # ── Fallback detection: check daemon availability before G1 ───────────── + if _VEC_DISABLED or not _VEC_SOCK.exists(): + _fallback_reasons.append("vec_daemon_down") + if not _BGE_SOCK.exists(): + _fallback_reasons.append("bge_daemon_down") + # 1. G1: Hybrid BM25+dense RRF over decision corpus (2026-04-26) # Uses hybrid_rank_decisions() when vec-daemon is up (BM25+dense→RRF→rerank). # Falls back to bm25_rank_decisions() if dense unavailable — explicit coverage. @@ -1449,28 +152,21 @@ def main(): _t_g1 = _time.perf_counter() corpus = get_decision_corpus(project_dir) g1_header = "" + _g1_count = 0 + _g1_top_lexical: float | None = None + _g1_dense_top: float | None = None if corpus: - # Auto-tune: adjust top_k based on flywheel recommendations - _g1_top_k = 7 - if _AUTO_TUNE: - _qtype_now = _classify_query_type(prompt) - _temporal_gap = _AUTO_TUNE.get("temporal_utility_gap", 0) - # If TEMPORAL utility is 10pp below KEYWORD, reduce top_k to inject only best matches - if _qtype_now == "TEMPORAL" and _temporal_gap > 0.10: - _g1_top_k = 5 # more selective for low-utility temporal queries - # Project-type profile adjustments (Stage 3 local loop) - _proj_type = _AUTO_TUNE.get("project_type_hint", "") - _proj_conf = _AUTO_TUNE.get("project_type_confidence", "LOW") - if _proj_conf in ("HIGH", "MEDIUM"): - if _proj_type == "python_ml": - # ML projects: training/model decisions span longer history - _g1_top_k = min(_g1_top_k + 1, 10) - elif _proj_type == "nextjs_react": - # React: component decisions are keyword-specific, fewer suffice - _g1_top_k = max(_g1_top_k - 1, 4) + _g1_top_k = _get_g1_top_k(prompt, _AUTO_TUNE) _last_retrieval_scores.clear() relevant = hybrid_rank_decisions(corpus, prompt, top_k=_g1_top_k) if relevant: + _g1_count = len(relevant) + # 
Capture scores for hook_complete + if "bm25_top" in _last_retrieval_scores: + _g1_top_lexical = round(_last_retrieval_scores["bm25_top"], 4) + if "dense_top" in _last_retrieval_scores: + _g1_dense_top = round(_last_retrieval_scores["dense_top"], 4) + # Build forced display header (mechanically injected, not advisory) first_subj = relevant[0]["subject"][:70] rest_count = len(relevant) - 1 @@ -1485,7 +181,7 @@ def main(): prefix = f" > [{date}] " if date else " > " lines.append(f"{prefix}{subj}") _log_event("block_fired", { - "hook": "bm25-memory", "block": "g1_decisions", + "block": "g1_decisions", "count": len(relevant), "duration_ms": int((_time.perf_counter() - _t_g1) * 1000), }) @@ -1497,11 +193,21 @@ def main(): "duration_ms": int((_time.perf_counter() - _t_g1) * 1000), "query_type": _classify_query_type(prompt), } - if "bm25_top" in _last_retrieval_scores: - _g1_meta["top_score_bm25"] = round(_last_retrieval_scores["bm25_top"], 4) - if "dense_top" in _last_retrieval_scores: - _g1_meta["top_score_dense"] = round(_last_retrieval_scores["dense_top"], 4) + if _g1_top_lexical is not None: + _g1_meta["top_score_bm25"] = _g1_top_lexical + if _g1_dense_top is not None: + _g1_meta["top_score_dense"] = _g1_dense_top _retrieval_meta["blocks"]["g1_decisions"] = _g1_meta + # Stage-level event + _g1_event: dict = { + "g1_count": _g1_count, + "duration_ms": int((_time.perf_counter() - _t_g1) * 1000), + } + if _g1_top_lexical is not None: + _g1_event["g1_top_score_bm25"] = _g1_top_lexical + if _g1_dense_top is not None: + _g1_event["g1_top_score_dense"] = _g1_dense_top + _log_event("g1_done", _g1_event) # Citation probe: log G1 retrieved nodes log_retrieved_nodes(project_dir, _session_id, prompt, "g1_decisions", [ {"id": c.get("hash", c["subject"][:20]), "text": c["subject"], "date": c.get("date", "")} @@ -1511,29 +217,15 @@ def main(): # 2. 
G2: BM25 over project docs g2_files = [] g2_keywords = [] + _g2_docs_count = 0 if prompt: _t_g2d = _time.perf_counter() - # Auto-tune: adjust G2-DOCS top_k based on flywheel recommendations - _g2d_top_k = 5 - if _AUTO_TUNE: - _qtype_now2 = _classify_query_type(prompt) - _g2_temporal_gap = _AUTO_TUNE.get("temporal_utility_gap", 0) - if _qtype_now2 == "TEMPORAL" and _g2_temporal_gap > 0.10: - _g2d_top_k = 3 # more selective for low-utility temporal doc queries - # Project-type profile adjustments (Stage 3 local loop) - _proj_type2 = _AUTO_TUNE.get("project_type_hint", "") - _proj_conf2 = _AUTO_TUNE.get("project_type_confidence", "LOW") - if _proj_conf2 in ("HIGH", "MEDIUM"): - if _proj_type2 == "nextjs_react": - # Next.js: more framework docs per query (more component/API docs) - _g2d_top_k = min(_g2d_top_k + 1, 8) - elif _proj_type2 == "rust_systems": - # Rust: docs are precise, fewer higher-quality docs preferred - _g2d_top_k = max(_g2d_top_k - 1, 3) + _g2d_top_k = _get_g2d_top_k(prompt, _AUTO_TUNE) _last_retrieval_scores.pop("bm25_top", None) _last_retrieval_scores.pop("dense_top", None) doc_chunks = hybrid_search_docs(project_dir, prompt, top_k=_g2d_top_k) if doc_chunks: + _g2_docs_count = len(doc_chunks) lines.append("[G2-DOCS] (BM25+dense RRF relevant research docs)") for chunk in doc_chunks: chunk_lines = chunk.strip().split("\n") @@ -1553,7 +245,7 @@ def main(): if snippet: lines.append(f" {snippet}") _log_event("block_fired", { - "hook": "bm25-memory", "block": "g2_docs", + "block": "g2_docs", "count": len(doc_chunks), "duration_ms": int((_time.perf_counter() - _t_g2d) * 1000), }) @@ -1566,11 +258,23 @@ def main(): "duration_ms": int((_time.perf_counter() - _t_g2d) * 1000), "query_type": _classify_query_type(prompt), } + _g2d_top_score: float | None = None if "bm25_top" in _last_retrieval_scores: _g2d_meta["top_score_bm25"] = round(_last_retrieval_scores["bm25_top"], 4) + _g2d_top_score = _g2d_meta["top_score_bm25"] if "dense_top" in _last_retrieval_scores: 
_g2d_meta["top_score_dense"] = round(_last_retrieval_scores["dense_top"], 4) + if _g2d_top_score is None: + _g2d_top_score = _g2d_meta["top_score_dense"] _retrieval_meta["blocks"]["g2_docs"] = _g2d_meta + # Stage-level event + _g2d_event: dict = { + "g2_docs_count": _g2_docs_count, + "duration_ms": int((_time.perf_counter() - _t_g2d) * 1000), + } + if _g2d_top_score is not None: + _g2d_event["top_score"] = _g2d_top_score + _log_event("g2_docs_done", _g2d_event) # Citation probe: log G2-DOCS retrieved nodes log_retrieved_nodes(project_dir, _session_id, prompt, "g2_docs", [ {"id": chunk.strip().split("\n")[0].split(" §")[0].strip(), "text": chunk.strip().split("\n")[0][:80]} @@ -1578,6 +282,7 @@ def main(): ]) # 3. G2: Code file discovery (graph → grep fallback) + _g2_code_count = 0 if prompt: keywords = extract_keywords(prompt) g2_keywords = keywords[:3] @@ -1589,8 +294,10 @@ def main(): stale_warn = check_and_trigger_reindex(project_dir, db_path) if stale_warn: lines.append(stale_warn) + _fallback_reasons.append("mcp_db_stale") graph_results = search_graph_for_prompt(db_path, keywords) if graph_results: + _g2_code_count = len(graph_results) lines.append(f"[G2-PREFETCH] Related code for '{' '.join(keywords[:3])}':") seen_files = set() for label, name, fpath in graph_results: @@ -1599,40 +306,57 @@ def main(): if seen_files: lines.append(f" Start with: {', '.join(sorted(seen_files)[:3])}") _log_event("block_fired", { - "hook": "bm25-memory", "block": "g2_prefetch", + "block": "g2_prefetch", "count": len(graph_results), "duration_ms": int((_time.perf_counter() - _t_g2p) * 1000), }) _blocks_fired.append("g2_prefetch") + _log_event("g2_code_done", { + "g2_code_count": _g2_code_count, + "duration_ms": int((_time.perf_counter() - _t_g2p) * 1000), + }) else: # Fallback: git grep + _fallback_reasons.append("mcp_db_missing") files = search_files_by_grep(project_dir, keywords) if files: + _g2_code_count = len(files) lines.append(f"[G2-GREP] Files matching '{' 
'.join(keywords[:3])}' (grep):") for f in files: lines.append(f" {f}") lines.append(f" Start with: {', '.join(files[:3])}") _log_event("block_fired", { - "hook": "bm25-memory", "block": "g2_grep", + "block": "g2_grep", "count": len(files), "duration_ms": int((_time.perf_counter() - _t_g2p) * 1000), }) _blocks_fired.append("g2_grep") + _log_event("g2_code_done", { + "g2_code_count": _g2_code_count, + "fallback_reason": "grep_fallback", + "duration_ms": int((_time.perf_counter() - _t_g2p) * 1000), + }) # 3b. G2: Hooks file discovery (when hook-related terms in prompt) + _g2_hooks_count = 0 if prompt and _has_hooks_keywords(prompt): _t_g2h = _time.perf_counter() hook_results = search_hooks_files(prompt) if hook_results: + _g2_hooks_count = len(hook_results) lines.append(f"[G2-HOOKS] Hook files matching '{prompt[:40]}':") for hp, score in hook_results: lines.append(f" {hp} (score={score:.1f})") _log_event("block_fired", { - "hook": "bm25-memory", "block": "g2_hooks", + "block": "g2_hooks", "count": len(hook_results), "duration_ms": int((_time.perf_counter() - _t_g2h) * 1000), }) _blocks_fired.append("g2_hooks") + _log_event("g2_hooks_done", { + "g2_hooks_count": _g2_hooks_count, + "duration_ms": int((_time.perf_counter() - _t_g2h) * 1000), + }) # 4. 
World model (--rich) if RICH: @@ -1645,192 +369,42 @@ def main(): lines.extend(facts) if lines: - # Prepend forced display header (mechanically enforced, replaces CLAUDE.md advisory) - header_lines = [] - if g1_header: - header_lines.append(g1_header) - if g2_files or g2_keywords: - files_str = ", ".join(f"`{f}`" for f in g2_files[:3]) if g2_files else "(docs BM25)" - kw_str = " ".join(g2_keywords[:3]) if g2_keywords else "" - via_str = f' — found via "{kw_str}"' if kw_str else "" - header_lines.append(f"> **G2** (space search): {files_str}{via_str}") - # Daemon degradation warnings — shown only when socket is absent - _daemon_warns = [] - if not _VEC_DISABLED and not _VEC_SOCK.exists(): - _daemon_warns.append("vec-daemon down — BM25-only mode (semantic rerank disabled)") - if _USE_CROSS_ENCODER and not _BGE_SOCK.exists(): - _daemon_warns.append("bge-daemon down — cross-encoder rerank disabled") - if _daemon_warns: - header_lines.append("> **⚠ Semantic layer**: " + " | ".join(_daemon_warns)) - # Auto-tune active badge — shows flywheel is running - if _AUTO_TUNE_ACTIVE: - n_rec = _AUTO_TUNE.get("based_on_n", "?") - prefer_hybrid = _AUTO_TUNE.get("prefer_hybrid_G1", False) - temporal_gap = _AUTO_TUNE.get("temporal_utility_gap") - proj_hint = _AUTO_TUNE.get("project_type_hint") - proj_conf = _AUTO_TUNE.get("project_type_confidence", "LOW") - parts = [f"n={n_rec}"] - if prefer_hybrid: - parts.append("hybrid✓") - if temporal_gap and temporal_gap > 0.05: - parts.append(f"temporal-gap={temporal_gap*100:.0f}pp") - if proj_hint and proj_hint != "multi_lang" and proj_conf in ("HIGH", "MEDIUM"): - parts.append(proj_hint) - header_lines.append(f"> **CTX auto-tune** [{', '.join(parts)}] — run `ctx-telemetry tune` to refresh") - if header_lines: - lines = header_lines + [""] + lines - - output = { - "hookSpecificOutput": { - "hookEventName": "UserPromptSubmit", - "additionalContext": "\n".join(lines), - } - } - json.dump(output, sys.stdout) - sys.stdout.flush() - if 
header_lines: - print("\n".join(header_lines), file=sys.stderr) - sys.stderr.flush() + header_lines = build_header_lines( + g1_header, g2_files, g2_keywords, + _VEC_SOCK, _VEC_DISABLED, _BGE_SOCK, _USE_CROSS_ENCODER, + _AUTO_TUNE, _AUTO_TUNE_ACTIVE, + ) + emit_output(lines, header_lines) # Final summary event: one record per hook invocation (outside `if lines:`) + # Always emitted — this is the primary metric record. + _hook_complete: dict = { + "latency_ms": int((_time.perf_counter() - _t_start) * 1000), + "exit_code": 0, + "query_type": _query_type, + "g1_count": _g1_count, + "g2_docs_count": _g2_docs_count, + "g2_code_count": _g2_code_count, + "g2_hooks_count": _g2_hooks_count, + "blocks_fired": ",".join(_blocks_fired) if _blocks_fired else "", + "fallback_reasons": ",".join(_fallback_reasons) if _fallback_reasons else "", + } + if _g1_top_lexical is not None: + _hook_complete["g1_top_score_bm25"] = _g1_top_lexical + if _g1_dense_top is not None: + _hook_complete["g1_top_score_dense"] = _g1_dense_top + _log_event("hook_complete", _hook_complete) + # Also emit legacy hook_invoked for dashboard backward compat _log_event("hook_invoked", { - "hook": "bm25-memory", "duration_ms": int((_time.perf_counter() - _t_start) * 1000), "prompt_len": len(prompt) if prompt else 0, }) # ── P1: record what we injected for utility-rate measurement ───── - # Stop hook reads this + the latest assistant turn + substring-matches - # each item's distinctive tokens. Not stored when dashboard-internal. - if os.environ.get("CTX_DASHBOARD_INTERNAL") != "1": - try: - # Preview = first 120 chars, newlines stripped (same privacy surface - # as vault.db which already stores full prompts; this just makes the - # dashboard see new prompts *before* vault.db incremental fires on Stop). - preview = (prompt or "")[:120].replace("\n", " ").replace("\r", " ") - # Full prompt stored too so the dashboard's node-details pane can - # show the whole message before vault.db catches up. 
- prompt_full_str = (prompt or "").replace("\r", "") - # Derive the project basename from CLAUDE_PROJECT_DIR (fallback to cwd) - try: - _proj = os.environ.get("CLAUDE_PROJECT_DIR") or os.getcwd() - _project_name = os.path.basename(_proj.rstrip("/")) if _proj else None - except Exception: - _project_name = None - injection = { - "ts": _time.time(), - "prompt_len": len(prompt) if prompt else 0, - "prompt_preview": preview, - "prompt_full": prompt_full_str, - "project": _project_name, - "items": [], - } - # Collect distinctive substrings from emitted blocks. - # Each item is (block, signature) — signature is a 4-20 char - # distinctive substring the assistant's response can echo. - # Meta/filler words from commit subjects that never represent a topic. - # Drops CTX's internal taxonomy (live-infinite, iter, goal_vN) + conventional - # commit prefixes + common English verbs — anything that would generate - # false-positive matches against unrelated responses. - _META_WORDS = frozenset([ - "live-infinite", "live-inf", "omc-live", "iter", "live", - "goal_v1", "goal_v2", "goal_v3", "goal", - "feat", "fix", "refactor", "perf", "docs", "test", "chore", - "success", "section", "update", "add", "remove", "change", - "fixed", "added", "removed", "completed", - ]) - # Header-row detector for "> **G1/G2**" and similar markdown headers - _is_header_line = lambda st: st.startswith("> **") and "** (" in st - - def _extract_content_tokens(subject: str, n: int = 5) -> list: - """Pick up to N distinctive content tokens from a commit subject. - Filters meta words, pure digits, punctuation-only fragments. - Prefers longer words (more specific = better substring hit rate).""" - candidates = [] - for w in subject.split(): - w_clean = w.strip(".,()[]{}:;!?\"'").lower() - if len(w_clean) < 4: - continue - if w_clean in _META_WORDS: - continue - if w_clean.replace("/", "").replace(".", "").replace("-", "").isdigit(): - continue # 20260402, 58/∞, etc. 
- # Keep case of original for better citation-style match - candidates.append(w.strip(".,()[]{}:;!?\"'")) - # Dedup preserving order, sort by length desc for specificity - seen = set() - uniq = [t for t in candidates if not (t.lower() in seen or seen.add(t.lower()))] - uniq.sort(key=lambda t: -len(t)) - return uniq[:n] - - for line in lines: - s = line.strip() - # Skip markdown headers like "> **G1** (time memory): ..." — they are - # not items, they're section labels that would leak into signatures. - if _is_header_line(s): - continue - # G1 decisions: "> [YYYY-MM-DD] subject" — capture date for age-based wow trigger - if s.startswith("> [") and "]" in s: - close_idx = s.index("]") - date_str = s[3:close_idx] - subj = s[close_idx + 1:].strip() - tokens = _extract_content_tokens(subj, n=5) - if tokens: - item = { - "block": "g1_decisions", - "tokens": tokens, - "subject": subj[:200], # preserved for semantic scoring - } - if len(date_str) == 10 and date_str[4] == "-" and date_str[7] == "-": - item["date"] = date_str - injection["items"].append(item) - # G2-DOCS entries: " > filename.md" → filename AS signature AND - # also extract date-token + topic words from filename for more hit - # surface (e.g. "20260411-g1-generalization-validation.md" also - # matches on "generalization" / "validation"). 
- elif s.startswith("> ") and (".md" in s or s.endswith(".py")): - fname = s.lstrip("> ").strip().split(" §")[0].split()[0] - if fname: - # filename + its stem words as tokens - stem = fname.rsplit(".", 1)[0] - parts = [p for p in stem.replace("-", " ").replace("_", " ").split() - if len(p) >= 4 and not p.isdigit()] - tokens = [fname] + parts[:4] - # Subject for semantic: the filename's natural-language form - subject = " ".join(parts) if parts else fname - injection["items"].append({ - "block": "g2_docs", "tokens": tokens, "subject": subject[:200] - }) - # G2-PREFETCH: symbol names (function/class) + their path - elif ": " in s and "@" in s and any(k in s for k in ("Function:", "Class:", "Method:", "Module:", "File:")): - try: - name = s.split(":", 1)[1].split("@")[0].strip() - path = s.split("@", 1)[1].strip() if "@" in s else "" - path_base = path.rsplit("/", 1)[-1] if path else "" - tokens = [t for t in [name, path_base] if t and len(t) >= 4] - if tokens: - injection["items"].append({ - "block": "g2_prefetch", - "tokens": tokens, - "subject": f"{name} in {path}"[:200], - }) - except Exception: - pass - Path(os.path.expanduser("~/.claude/last-injection.json")).write_text( - json.dumps(injection) - ) - # Write retrieval metadata for utility-rate.py → retrieval_event schema - _retrieval_meta["vec_daemon_up"] = _VEC_SOCK.exists() and not _VEC_DISABLED - _retrieval_meta["bge_daemon_up"] = _BGE_SOCK.exists() and bool( - os.environ.get("CTX_CROSS_ENCODER", "1") != "0" - ) - _retrieval_meta["query_char_count"] = len(prompt) if prompt else 0 - _retrieval_meta["session_id"] = _session_id or "" - Path(os.path.expanduser("~/.claude/last-retrieval-meta.json")).write_text( - json.dumps(_retrieval_meta) - ) - except Exception: - pass + write_injection_record( + prompt, lines, _retrieval_meta, + _VEC_SOCK, _VEC_DISABLED, _BGE_SOCK, _session_id, + ) if __name__ == "__main__": diff --git a/src/hooks/chat-memory.py b/src/hooks/chat-memory.py index 84bca7c..fa45023 100755 --- 
a/src/hooks/chat-memory.py +++ b/src/hooks/chat-memory.py @@ -13,12 +13,22 @@ import re import socket import sqlite3 -import sqlite_vec import struct import sys +try: + import sqlite_vec + HAS_SQLITE_VEC = True +except ImportError: + sqlite_vec = None # type: ignore[assignment] + HAS_SQLITE_VEC = False + print("⚠ sqlite_vec missing — vec retrieval disabled", file=sys.stderr) + VAULT_DB = os.path.expanduser("~/.local/share/claude-vault/vault.db") VEC_SOCK = os.path.expanduser("~/.local/share/claude-vault/vec-daemon.sock") +# Windows fallback: AF_UNIX missing on MSVC-built CPython → TCP loopback. +VEC_USE_TCP = not hasattr(socket, "AF_UNIX") +VEC_PORT = int(os.environ.get("CTX_VEC_PORT", "29501")) MAX_RESULTS = 3 MAX_CHARS_PER_MSG = 400 MIN_KEYWORD_LEN = 3 @@ -88,12 +98,17 @@ def cwd_to_project(cwd: str) -> str: def get_query_embedding(query: str) -> list[float] | None: """Get query embedding from vec-daemon via Unix socket. Returns None if unavailable.""" - if not os.path.exists(VEC_SOCK): - return None try: - sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - sock.settimeout(VEC_TIMEOUT) - sock.connect(VEC_SOCK) + if VEC_USE_TCP: + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.settimeout(VEC_TIMEOUT) + sock.connect(("127.0.0.1", VEC_PORT)) + else: + if not os.path.exists(VEC_SOCK): + return None + sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + sock.settimeout(VEC_TIMEOUT) + sock.connect(VEC_SOCK) req = json.dumps({"q": query}) + "\n" sock.sendall(req.encode("utf-8")) buf = b"" @@ -118,6 +133,8 @@ def query_vault_vector( exclude_session_id: str | None = None, ) -> list[tuple]: """KNN vector search using sqlite-vec. 
Returns (msg_id, cosine_dist, role, content, ts, project).""" + if not HAS_SQLITE_VEC: + return [] if not query_emb or not os.path.exists(VAULT_DB): return [] try: diff --git a/src/hooks/utility-rate.py b/src/hooks/utility-rate.py index f8ce94c..fa4297d 100644 --- a/src/hooks/utility-rate.py +++ b/src/hooks/utility-rate.py @@ -43,6 +43,9 @@ # ── T1: Semantic similarity via vec-daemon ─────────────────────────── VEC_SOCK = HOME / ".local" / "share" / "claude-vault" / "vec-daemon.sock" +# Windows fallback: AF_UNIX missing on MSVC-built CPython → TCP loopback. +VEC_USE_TCP = not hasattr(socket, "AF_UNIX") +VEC_PORT = int(os.environ.get("CTX_VEC_PORT", "29501")) SEMANTIC_THRESHOLD = 0.85 # e5-small calibration (empirical, 2026-04-20): # related pairs: cos ∈ [0.84, 0.90] (item subject vs on-topic response chunk) @@ -52,12 +55,19 @@ def _embed(text: str, timeout: float = 0.8) -> list | None: """Query vec-daemon for an embedding. Returns None on any failure.""" - if not VEC_SOCK.exists() or not text: + if not text: return None try: - s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - s.settimeout(timeout) - s.connect(str(VEC_SOCK)) + if VEC_USE_TCP: + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.settimeout(timeout) + s.connect(("127.0.0.1", VEC_PORT)) + else: + if not VEC_SOCK.exists(): + return None + s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + s.settimeout(timeout) + s.connect(str(VEC_SOCK)) s.sendall((json.dumps({"q": text[:1000]}) + "\n").encode("utf-8")) buf = b"" while b"\n" not in buf: diff --git a/src/hooks/vec-daemon.py b/src/hooks/vec-daemon.py index 1243949..8dd2722 100644 --- a/src/hooks/vec-daemon.py +++ b/src/hooks/vec-daemon.py @@ -21,6 +21,10 @@ STOP_FILE = Path.home() / ".local/share/claude-vault/vec-daemon.stop" MODEL_NAME = "intfloat/multilingual-e5-small" +# Windows fallback: AF_UNIX is not exposed by MSVC-built CPython. Use TCP loopback. 
+USE_TCP = not hasattr(socket, "AF_UNIX") +VEC_PORT = int(os.environ.get("CTX_VEC_PORT", "29501")) + if "--stop" in sys.argv: STOP_FILE.write_text("stop") print("Stop file written.") @@ -33,8 +37,8 @@ os.kill(existing_pid, 0) print(f"[vec-daemon] Already running (PID {existing_pid}). Exiting.") sys.exit(0) - except (ProcessLookupError, ValueError): - pass # stale PID file, continue + except (ProcessLookupError, ValueError, OSError): + pass # stale PID file (or Windows os.kill not supported), continue def load_model(): from sentence_transformers import SentenceTransformer @@ -78,18 +82,27 @@ def main(): model = load_model() print(f"[vec-daemon] Model ready in {time.time()-t0:.1f}s", flush=True) - # Clean up stale socket - if SOCKET_PATH.exists(): - SOCKET_PATH.unlink() - # Write PID PID_FILE.write_text(str(os.getpid())) - srv = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - srv.bind(str(SOCKET_PATH)) + if USE_TCP: + srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + # Skip SO_REUSEADDR on Windows: semantics differ (allows multiple + # bind to same port → port hijacking risk). Linux/macOS keep TIME_WAIT + # rebinding behavior. 
+ if sys.platform != "win32": + srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + srv.bind(("127.0.0.1", VEC_PORT)) + listen_target = f"127.0.0.1:{VEC_PORT}" + else: + if SOCKET_PATH.exists(): + SOCKET_PATH.unlink() + srv = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + srv.bind(str(SOCKET_PATH)) + listen_target = str(SOCKET_PATH) srv.listen(5) srv.settimeout(1.0) # 1s accept timeout for stop-check loop - print(f"[vec-daemon] Listening on {SOCKET_PATH}", flush=True) + print(f"[vec-daemon] Listening on {listen_target}", flush=True) while True: if STOP_FILE.exists(): @@ -106,7 +119,8 @@ def main(): print(f"[vec-daemon] Accept error: {e}", flush=True) srv.close() - SOCKET_PATH.unlink(missing_ok=True) + if not USE_TCP: + SOCKET_PATH.unlink(missing_ok=True) PID_FILE.unlink(missing_ok=True) print("[vec-daemon] Stopped.") diff --git a/src/retrieval/adaptive_trigger.py b/src/retrieval/adaptive_trigger.py index dafce15..d84df38 100644 --- a/src/retrieval/adaptive_trigger.py +++ b/src/retrieval/adaptive_trigger.py @@ -22,6 +22,17 @@ from src.retrieval.full_context import RetrievalResult, estimate_tokens from src.trigger.trigger_classifier import TriggerClassifier, TriggerType +# Canonical tokenizer shared with production hook (Task C unification). 
+# tokenize() — corpus-side (drop_stopwords=False) and query-side (True) +# expand_query_tokens() — synonym expansion (KO↔EN, CTX domain vocab) +# score_corpus_bm25() — generic BM25 scorer returning raw numpy score array +try: + from src.hooks._bm25.tokenizer import tokenize as _bm25_tokenize, expand_query_tokens as _bm25_expand + from src.hooks._bm25.ranker import score_corpus_bm25 as _bm25_score + _HAS_UNIFIED_TOKENIZER = True +except ImportError: + _HAS_UNIFIED_TOKENIZER = False + # Directories to exclude from indexing (venvs, build artifacts, VCS, caches) _EXCLUDED_DIRS = frozenset({ 'venv', '.venv', 'env', '.env', @@ -148,10 +159,16 @@ def _index(self) -> None: # Build import graph self._index_imports(rel_path, content) - # Tokenize for BM25 (unigrams + selective bigrams for multi-word concepts) - expanded = re.sub(r'([a-z])([A-Z])', r'\1 \2', content) - expanded = expanded.replace("_", " ") - tokens = re.findall(r'[a-zA-Z_][a-zA-Z0-9_]{1,}', expanded.lower()) + # Tokenize for BM25 — use canonical production tokenizer when available. + # Fallback keeps original regex for environments without _bm25 package. + if _HAS_UNIFIED_TOKENIZER: + # Production tokenizer: Korean particle strip + Porter stem + decimal preserve + tokens = _bm25_tokenize(content, drop_stopwords=False) + else: + expanded = re.sub(r'([a-z])([A-Z])', r'\1 \2', content) + expanded = expanded.replace("_", " ") + tokens = re.findall(r'[a-zA-Z_][a-zA-Z0-9_]{1,}', expanded.lower()) + # Selective bigrams for multi-word concept matching (both paths) bigrams = [ f"{tokens[i]}_{tokens[i+1]}" for i in range(len(tokens) - 1) @@ -492,8 +509,12 @@ def _symbol_retrieve(self, query_id: str, query_text: str, symbol: str, k: int) # Raw content search finds files mentioning the symbol (e.g. in docstrings) but gives # no ordering signal. BM25 on the full query text provides a reliable reranking. 
if self.bm25 is not None and all(v == 0.5 for v in matched_files.values()): - full_exp = re.sub(r'([a-z])([A-Z])', r'\1 \2', query_text).replace("_", " ") - full_tokens = re.findall(r'[a-zA-Z_][a-zA-Z0-9_]{1,}', full_exp.lower()) + if _HAS_UNIFIED_TOKENIZER: + full_tokens = _bm25_tokenize(query_text, drop_stopwords=True) + full_tokens = _bm25_expand(full_tokens) + else: + full_exp = re.sub(r'([a-z])([A-Z])', r'\1 \2', query_text).replace("_", " ") + full_tokens = re.findall(r'[a-zA-Z_][a-zA-Z0-9_]{1,}', full_exp.lower()) if full_tokens: bm25_scores = self.bm25.get_scores(full_tokens) bm25_max = float(np.max(bm25_scores)) if bm25_scores.max() > 0 else 1.0 @@ -532,12 +553,21 @@ def _concept_retrieve(self, query_id: str, query_text: str, concept: str, k: int # Full-query tokens provide recall when concept extraction loses information # (e.g. long docstrings, COIR-style natural-language-to-code queries). if self.bm25 is not None: - concept_expanded = re.sub(r'([a-z])([A-Z])', r'\1 \2', concept).replace("_", " ") - concept_tokens = re.findall(r'[a-zA-Z_][a-zA-Z0-9_]{1,}', concept_expanded.lower()) - # Fallback to full query_text tokens if concept is empty or trivial - if not concept_tokens or all(len(t) <= 2 for t in concept_tokens): - full_exp = re.sub(r'([a-z])([A-Z])', r'\1 \2', query_text).replace("_", " ") - concept_tokens = re.findall(r'[a-zA-Z_][a-zA-Z0-9_]{1,}', full_exp.lower()) + if _HAS_UNIFIED_TOKENIZER: + # Canonical tokenizer: Korean strip + stem + synonym expansion + concept_tokens = _bm25_tokenize(concept, drop_stopwords=True) + concept_tokens = _bm25_expand(concept_tokens) + # Fallback to full query_text tokens if concept is empty or trivial + if not concept_tokens or all(len(t) <= 2 for t in concept_tokens): + concept_tokens = _bm25_tokenize(query_text, drop_stopwords=True) + concept_tokens = _bm25_expand(concept_tokens) + else: + concept_expanded = re.sub(r'([a-z])([A-Z])', r'\1 \2', concept).replace("_", " ") + concept_tokens = 
re.findall(r'[a-zA-Z_][a-zA-Z0-9_]{1,}', concept_expanded.lower()) + # Fallback to full query_text tokens if concept is empty or trivial + if not concept_tokens or all(len(t) <= 2 for t in concept_tokens): + full_exp = re.sub(r'([a-z])([A-Z])', r'\1 \2', query_text).replace("_", " ") + concept_tokens = re.findall(r'[a-zA-Z_][a-zA-Z0-9_]{1,}', full_exp.lower()) # Add bigrams to concept query (matches bigrams in BM25 corpus index) concept_bigrams = [ @@ -552,8 +582,12 @@ def _concept_retrieve(self, query_id: str, query_text: str, concept: str, k: int concept_max = float(np.max(concept_scores)) if concept_scores.max() > 0 else 1.0 # Full-query BM25 — always computed for hybrid blend - full_exp = re.sub(r'([a-z])([A-Z])', r'\1 \2', query_text).replace("_", " ") - full_tokens = re.findall(r'[a-zA-Z_][a-zA-Z0-9_]{1,}', full_exp.lower()) + if _HAS_UNIFIED_TOKENIZER: + full_tokens = _bm25_tokenize(query_text, drop_stopwords=True) + full_tokens = _bm25_expand(full_tokens) + else: + full_exp = re.sub(r'([a-z])([A-Z])', r'\1 \2', query_text).replace("_", " ") + full_tokens = re.findall(r'[a-zA-Z_][a-zA-Z0-9_]{1,}', full_exp.lower()) full_bigrams = [ f"{full_tokens[i]}_{full_tokens[i+1]}" for i in range(len(full_tokens) - 1) @@ -879,8 +913,12 @@ def _implicit_retrieve(self, query_id: str, query_text: str, context_ref: str, k # where import traversal finds no internal deps (external libs not in corpus). 
bm25_scores_arr: "np.ndarray | None" = None if self.bm25 is not None: - expanded = re.sub(r'([a-z])([A-Z])', r'\1 \2', query_text).replace("_", " ") - query_tokens = re.findall(r'[a-zA-Z_][a-zA-Z0-9_]{1,}', expanded.lower()) + if _HAS_UNIFIED_TOKENIZER: + query_tokens = _bm25_tokenize(query_text, drop_stopwords=True) + query_tokens = _bm25_expand(query_tokens) + else: + expanded = re.sub(r'([a-z])([A-Z])', r'\1 \2', query_text).replace("_", " ") + query_tokens = re.findall(r'[a-zA-Z_][a-zA-Z0-9_]{1,}', expanded.lower()) bigrams = [ f"{query_tokens[i]}_{query_tokens[i+1]}" for i in range(len(query_tokens) - 1) @@ -1038,8 +1076,13 @@ def _tfidf_retrieve(self, query_id: str, query_text: str, k: int) -> RetrievalRe strategy="adaptive_trigger", ) - expanded = re.sub(r'([a-z])([A-Z])', r'\1 \2', query_text).replace("_", " ") - query_tokens = re.findall(r'[a-zA-Z_][a-zA-Z0-9_]{1,}', expanded.lower()) + if _HAS_UNIFIED_TOKENIZER: + # Canonical production tokenizer: stopword drop + stem + synonym expansion + query_tokens = _bm25_tokenize(query_text, drop_stopwords=True) + query_tokens = _bm25_expand(query_tokens) + else: + expanded = re.sub(r'([a-z])([A-Z])', r'\1 \2', query_text).replace("_", " ") + query_tokens = re.findall(r'[a-zA-Z_][a-zA-Z0-9_]{1,}', expanded.lower()) bigrams = [ f"{query_tokens[i]}_{query_tokens[i+1]}" for i in range(len(query_tokens) - 1) diff --git a/src/retrieval/bm25_retriever.py b/src/retrieval/bm25_retriever.py index 01bbf74..dde9338 100644 --- a/src/retrieval/bm25_retriever.py +++ b/src/retrieval/bm25_retriever.py @@ -1,22 +1,28 @@ """ BM25 sparse keyword retrieval strategy. -Uses the rank_bm25 library for keyword-based file retrieval. +Uses the canonical _bm25 core for keyword-based file retrieval. 
""" import os import re from typing import Dict, List -from rank_bm25 import BM25Okapi +from src.hooks._bm25.ranker import score_corpus_bm25 from src.retrieval.full_context import RetrievalResult, estimate_tokens def _tokenize(text: str) -> List[str]: - """Simple tokenizer: split on non-alphanumeric, lowercase, filter short tokens.""" - tokens = re.findall(r'[a-zA-Z_][a-zA-Z0-9_]*', text.lower()) - return [t for t in tokens if len(t) > 1] + """Identifier-focused tokenizer: alphanumeric/underscore tokens, len > 1. + + Intentionally NOT the canonical _bm25.tokenize (PR-1 out-of-scope): + canonical applies dict.fromkeys() dedup, which collapses repeated + identifier occurrences and flattens BM25 TF scoring on code corpora. + Code search needs raw identifier-frequency counts. + """ + raw = re.findall(r'[a-zA-Z_][a-zA-Z0-9_]*', text.lower()) + return [t for t in raw if len(t) > 1] class BM25Retriever: @@ -28,7 +34,7 @@ def __init__(self, codebase_dir: str): self.file_paths: List[str] = [] self.file_tokens: List[List[str]] = [] self.total_tokens = 0 - self.bm25 = None + self._has_index = False self._index() def _index(self) -> None: @@ -46,11 +52,11 @@ def _index(self) -> None: self.total_tokens += estimate_tokens(content) if self.file_tokens: - self.bm25 = BM25Okapi(self.file_tokens) + self._has_index = True def retrieve(self, query_id: str, query_text: str, k: int = 10) -> RetrievalResult: """Retrieve top-k files using BM25 scoring.""" - if self.bm25 is None: + if not self._has_index: return RetrievalResult( query_id=query_id, retrieved_files=[], @@ -61,7 +67,17 @@ def retrieve(self, query_id: str, query_text: str, k: int = 10) -> RetrievalResu ) query_tokens = _tokenize(query_text) - raw_scores = self.bm25.get_scores(query_tokens) + raw_scores = score_corpus_bm25(self.file_tokens, query_tokens) + + if raw_scores is None: + return RetrievalResult( + query_id=query_id, + retrieved_files=[], + scores={}, + tokens_used=0, + total_tokens=self.total_tokens, + 
strategy="bm25", + ) # Rank files by score scored_files = sorted( diff --git a/tests/golden/bm25_memory_outputs.jsonl b/tests/golden/bm25_memory_outputs.jsonl new file mode 100644 index 0000000..e1b2fff --- /dev/null +++ b/tests/golden/bm25_memory_outputs.jsonl @@ -0,0 +1,26 @@ +{"id": "single_keyword_bm25_location", "category": "keyword_single", "stdin": {"prompt": "BM25 어디 있지?", "session_id": "test", "transcript_path": "/tmp/test-transcript"}, "argv": ["--rich"], "env": {"HOME": "/tmp/ctx_golden_home", "CLAUDE_PROJECT_DIR": "/Users/d9ng/privateProject/_research/_util/CTX", "CTX_DISABLE_SEMANTIC_RERANK": "1", "CTX_CROSS_ENCODER": "0", "CTX_TELEMETRY": "", "CTX_DASHBOARD_INTERNAL": "1"}, "expected_stdout": "{\"hookSpecificOutput\": {\"hookEventName\": \"UserPromptSubmit\", \"additionalContext\": \"> **G2** (space search): (docs BM25) \\u2014 found via \\\"BM25 \\uc5b4\\ub514 \\uc788\\uc9c0\\\"\\n\\n[SESSION NOTES (\\ubbf8\\ucee4\\ubc0b \\ud310\\ub2e8)]\\n> 2026-04-08 14:08: G1.5 \\uad6c\\ud604 \\uc644\\ub8cc: /g1 \\uc2a4\\ud0ac(\\uc218\\ub3d9) + [G1] \\ub9c8\\ucee4 \\uc790\\ub3d9\\ucea1\\ucc98 + git-memory.py SESSION NOTES \\uc8fc\\uc785. 
topic-dedup\\uc740 temporal retention\\uc774 \\uc544\\ub2cc topic diversity\\ub97c \\ucd5c\\uc801\\ud654\\ud568 (age=15 \\uc2e4\\uce21)\\n[G2-GREP] Files matching 'BM25 \\uc5b4\\ub514 \\uc788\\uc9c0' (grep):\\n src/retrieval/adaptive_trigger.py\\n benchmarks/eval/aggregated_stat_report.py\\n benchmarks/eval/g2_docs_ab_test.py\\n benchmarks/eval/g1_docs_bm25_eval.py\\n src/cli/telemetry.py\\n Start with: src/retrieval/adaptive_trigger.py, benchmarks/eval/aggregated_stat_report.py, benchmarks/eval/g2_docs_ab_test.py\\n[KNOWN FACTS]\\n * MAB N=50 Wilson CI: none 0.00 [0.00,0.07] / ctx 0.40 [0.28,0.54] / ctx_v2 0.58 [\\n * LongMemEval (REAL) N=10: none=0.10 / ctx=0.10 / ctx_v2=0.30 / ctx_v3=0.30 / chro\\n * Final paper tables ready: MAB synthetic + MAB N=50+CI + LongMemEval real + PUAC \\n * PAPER HEADLINE: claudemem_faithful (LLM-summarize \\u2192 Chroma) = 0.80 on MAB N=10 v\\n * HEADLINE #2: MAB N=50 with Wilson CI \\u2014 ctx_v3 = 0.880 [0.762, 0.944] BEATS claud\\n * BUG FIXED 2026-04-25: retrieve_ctx_v3 + retrieve_claudemem_faithful used id(hays\\n * CORRECTED MAB N=50 Wilson CI: ctx 0.40 / ctx_v2 0.58 / chroma 0.78 / faithful 0.\\n * McNemar p-values N=50: ctx_v2 vs ctx p=0.049 SIGNIFICANT. 
ctx_v3 vs faithful p=0\"}}", "expected_exit_code": 0, "stdout_normalize": null, "elapsed_ms_observed": 48, "determinism_verified": true, "determinism_note": "two consecutive runs produced identical stdout+exitcode"} +{"id": "single_keyword_vec_daemon", "category": "keyword_single", "stdin": {"prompt": "vec-daemon 코드 위치", "session_id": "test", "transcript_path": "/tmp/test-transcript"}, "argv": ["--rich"], "env": {"HOME": "/tmp/ctx_golden_home", "CLAUDE_PROJECT_DIR": "/Users/d9ng/privateProject/_research/_util/CTX", "CTX_DISABLE_SEMANTIC_RERANK": "1", "CTX_CROSS_ENCODER": "0", "CTX_TELEMETRY": "", "CTX_DASHBOARD_INTERNAL": "1"}, "expected_stdout": "{\"hookSpecificOutput\": {\"hookEventName\": \"UserPromptSubmit\", \"additionalContext\": \"> **G2** (space search): (docs BM25) \\u2014 found via \\\"vec daemon \\uc704\\uce58\\\"\\n\\n[SESSION NOTES (\\ubbf8\\ucee4\\ubc0b \\ud310\\ub2e8)]\\n> 2026-04-08 14:08: G1.5 \\uad6c\\ud604 \\uc644\\ub8cc: /g1 \\uc2a4\\ud0ac(\\uc218\\ub3d9) + [G1] \\ub9c8\\ucee4 \\uc790\\ub3d9\\ucea1\\ucc98 + git-memory.py SESSION NOTES \\uc8fc\\uc785. 
topic-dedup\\uc740 temporal retention\\uc774 \\uc544\\ub2cc topic diversity\\ub97c \\ucd5c\\uc801\\ud654\\ud568 (age=15 \\uc2e4\\uce21)\\n[G2-GREP] Files matching 'vec daemon \\uc704\\uce58' (grep):\\n tests/unit/test_install_cli.py\\n src/cli/install.py\\n src/hooks/bge-daemon.py\\n src/hooks/_bm25/rerank.py\\n src/hooks/vec-daemon.py\\n Start with: tests/unit/test_install_cli.py, src/cli/install.py, src/hooks/bge-daemon.py\\n[KNOWN FACTS]\\n * MAB N=50 Wilson CI: none 0.00 [0.00,0.07] / ctx 0.40 [0.28,0.54] / ctx_v2 0.58 [\\n * LongMemEval (REAL) N=10: none=0.10 / ctx=0.10 / ctx_v2=0.30 / ctx_v3=0.30 / chro\\n * Final paper tables ready: MAB synthetic + MAB N=50+CI + LongMemEval real + PUAC \\n * PAPER HEADLINE: claudemem_faithful (LLM-summarize \\u2192 Chroma) = 0.80 on MAB N=10 v\\n * HEADLINE #2: MAB N=50 with Wilson CI \\u2014 ctx_v3 = 0.880 [0.762, 0.944] BEATS claud\\n * BUG FIXED 2026-04-25: retrieve_ctx_v3 + retrieve_claudemem_faithful used id(hays\\n * CORRECTED MAB N=50 Wilson CI: ctx 0.40 / ctx_v2 0.58 / chroma 0.78 / faithful 0.\\n * McNemar p-values N=50: ctx_v2 vs ctx p=0.049 SIGNIFICANT. 
ctx_v3 vs faithful p=0\"}}", "expected_exit_code": 0, "stdout_normalize": null, "elapsed_ms_observed": 46, "determinism_verified": true, "determinism_note": "two consecutive runs produced identical stdout+exitcode"} +{"id": "single_keyword_rank_bm25", "category": "keyword_single", "stdin": {"prompt": "rank_bm25", "session_id": "test", "transcript_path": "/tmp/test-transcript"}, "argv": ["--rich"], "env": {"HOME": "/tmp/ctx_golden_home", "CLAUDE_PROJECT_DIR": "/Users/d9ng/privateProject/_research/_util/CTX", "CTX_DISABLE_SEMANTIC_RERANK": "1", "CTX_CROSS_ENCODER": "0", "CTX_TELEMETRY": "", "CTX_DASHBOARD_INTERNAL": "1"}, "expected_stdout": "{\"hookSpecificOutput\": {\"hookEventName\": \"UserPromptSubmit\", \"additionalContext\": \"> **G2** (space search): (docs BM25) \\u2014 found via \\\"rank_bm25\\\"\\n\\n[SESSION NOTES (\\ubbf8\\ucee4\\ubc0b \\ud310\\ub2e8)]\\n> 2026-04-08 14:08: G1.5 \\uad6c\\ud604 \\uc644\\ub8cc: /g1 \\uc2a4\\ud0ac(\\uc218\\ub3d9) + [G1] \\ub9c8\\ucee4 \\uc790\\ub3d9\\ucea1\\ucc98 + git-memory.py SESSION NOTES \\uc8fc\\uc785. 
topic-dedup\\uc740 temporal retention\\uc774 \\uc544\\ub2cc topic diversity\\ub97c \\ucd5c\\uc801\\ud654\\ud568 (age=15 \\uc2e4\\uce21)\\n[G2-GREP] Files matching 'rank_bm25' (grep):\\n benchmarks/eval/claude_sonnet_ctx_eval.py\\n benchmarks/eval/nemotron_ctx_eval_v3.py\\n benchmarks/eval/nemotron_ctx_eval_v4.py\\n benchmarks/eval/doc_retrieval_eval_v2.py\\n benchmarks/eval/nemotron_ctx_eval.py\\n Start with: benchmarks/eval/claude_sonnet_ctx_eval.py, benchmarks/eval/nemotron_ctx_eval_v3.py, benchmarks/eval/nemotron_ctx_eval_v4.py\\n[KNOWN FACTS]\\n * MAB N=50 Wilson CI: none 0.00 [0.00,0.07] / ctx 0.40 [0.28,0.54] / ctx_v2 0.58 [\\n * LongMemEval (REAL) N=10: none=0.10 / ctx=0.10 / ctx_v2=0.30 / ctx_v3=0.30 / chro\\n * Final paper tables ready: MAB synthetic + MAB N=50+CI + LongMemEval real + PUAC \\n * PAPER HEADLINE: claudemem_faithful (LLM-summarize \\u2192 Chroma) = 0.80 on MAB N=10 v\\n * HEADLINE #2: MAB N=50 with Wilson CI \\u2014 ctx_v3 = 0.880 [0.762, 0.944] BEATS claud\\n * BUG FIXED 2026-04-25: retrieve_ctx_v3 + retrieve_claudemem_faithful used id(hays\\n * CORRECTED MAB N=50 Wilson CI: ctx 0.40 / ctx_v2 0.58 / chroma 0.78 / faithful 0.\\n * McNemar p-values N=50: ctx_v2 vs ctx p=0.049 SIGNIFICANT. 
ctx_v3 vs faithful p=0\"}}", "expected_exit_code": 0, "stdout_normalize": null, "elapsed_ms_observed": 46, "determinism_verified": true, "determinism_note": "two consecutive runs produced identical stdout+exitcode"} +{"id": "korean_paraphrase_decision_mem", "category": "korean_paraphrase", "stdin": {"prompt": "의사결정 기억은 어떻게 관리되나요?", "session_id": "test", "transcript_path": "/tmp/test-transcript"}, "argv": ["--rich"], "env": {"HOME": "/tmp/ctx_golden_home", "CLAUDE_PROJECT_DIR": "/Users/d9ng/privateProject/_research/_util/CTX", "CTX_DISABLE_SEMANTIC_RERANK": "1", "CTX_CROSS_ENCODER": "0", "CTX_TELEMETRY": "", "CTX_DASHBOARD_INTERNAL": "1"}, "expected_stdout": "{\"hookSpecificOutput\": {\"hookEventName\": \"UserPromptSubmit\", \"additionalContext\": \"> **G2** (space search): (docs BM25) \\u2014 found via \\\"\\uc758\\uc0ac\\uacb0\\uc815 \\uae30\\uc5b5\\uc740 \\uad00\\ub9ac\\ub418\\ub098\\uc694\\\"\\n\\n[SESSION NOTES (\\ubbf8\\ucee4\\ubc0b \\ud310\\ub2e8)]\\n> 2026-04-08 14:08: G1.5 \\uad6c\\ud604 \\uc644\\ub8cc: /g1 \\uc2a4\\ud0ac(\\uc218\\ub3d9) + [G1] \\ub9c8\\ucee4 \\uc790\\ub3d9\\ucea1\\ucc98 + git-memory.py SESSION NOTES \\uc8fc\\uc785. topic-dedup\\uc740 temporal retention\\uc774 \\uc544\\ub2cc topic diversity\\ub97c \\ucd5c\\uc801\\ud654\\ud568 (age=15 \\uc2e4\\uce21)\\n[KNOWN FACTS]\\n * MAB N=50 Wilson CI: none 0.00 [0.00,0.07] / ctx 0.40 [0.28,0.54] / ctx_v2 0.58 [\\n * LongMemEval (REAL) N=10: none=0.10 / ctx=0.10 / ctx_v2=0.30 / ctx_v3=0.30 / chro\\n * Final paper tables ready: MAB synthetic + MAB N=50+CI + LongMemEval real + PUAC \\n * PAPER HEADLINE: claudemem_faithful (LLM-summarize \\u2192 Chroma) = 0.80 on MAB N=10 v\\n * HEADLINE #2: MAB N=50 with Wilson CI \\u2014 ctx_v3 = 0.880 [0.762, 0.944] BEATS claud\\n * BUG FIXED 2026-04-25: retrieve_ctx_v3 + retrieve_claudemem_faithful used id(hays\\n * CORRECTED MAB N=50 Wilson CI: ctx 0.40 / ctx_v2 0.58 / chroma 0.78 / faithful 0.\\n * McNemar p-values N=50: ctx_v2 vs ctx p=0.049 SIGNIFICANT. 
ctx_v3 vs faithful p=0\"}}", "expected_exit_code": 0, "stdout_normalize": null, "elapsed_ms_observed": 37, "determinism_verified": true, "determinism_note": "two consecutive runs produced identical stdout+exitcode"} +{"id": "korean_paraphrase_rerank_logic", "category": "korean_paraphrase", "stdin": {"prompt": "검색 결과 재랭킹 로직 보고 싶습니다", "session_id": "test", "transcript_path": "/tmp/test-transcript"}, "argv": ["--rich"], "env": {"HOME": "/tmp/ctx_golden_home", "CLAUDE_PROJECT_DIR": "/Users/d9ng/privateProject/_research/_util/CTX", "CTX_DISABLE_SEMANTIC_RERANK": "1", "CTX_CROSS_ENCODER": "0", "CTX_TELEMETRY": "", "CTX_DASHBOARD_INTERNAL": "1"}, "expected_stdout": "{\"hookSpecificOutput\": {\"hookEventName\": \"UserPromptSubmit\", \"additionalContext\": \"> **G2** (space search): (docs BM25) \\u2014 found via \\\"search retrieve find\\\"\\n\\n[SESSION NOTES (\\ubbf8\\ucee4\\ubc0b \\ud310\\ub2e8)]\\n> 2026-04-08 14:08: G1.5 \\uad6c\\ud604 \\uc644\\ub8cc: /g1 \\uc2a4\\ud0ac(\\uc218\\ub3d9) + [G1] \\ub9c8\\ucee4 \\uc790\\ub3d9\\ucea1\\ucc98 + git-memory.py SESSION NOTES \\uc8fc\\uc785. 
topic-dedup\\uc740 temporal retention\\uc774 \\uc544\\ub2cc topic diversity\\ub97c \\ucd5c\\uc801\\ud654\\ud568 (age=15 \\uc2e4\\uce21)\\n[G2-GREP] Files matching 'search retrieve find' (grep):\\n benchmarks/eval/hook_comparison_eval.py\\n src/retrieval/adaptive_trigger.py\\n src/evaluator/benchmark_runner.py\\n src/evaluator/coir_evaluator.py\\n src/evaluator/repobench_evaluator.py\\n Start with: benchmarks/eval/hook_comparison_eval.py, src/retrieval/adaptive_trigger.py, src/evaluator/benchmark_runner.py\\n[KNOWN FACTS]\\n * MAB N=50 Wilson CI: none 0.00 [0.00,0.07] / ctx 0.40 [0.28,0.54] / ctx_v2 0.58 [\\n * LongMemEval (REAL) N=10: none=0.10 / ctx=0.10 / ctx_v2=0.30 / ctx_v3=0.30 / chro\\n * Final paper tables ready: MAB synthetic + MAB N=50+CI + LongMemEval real + PUAC \\n * PAPER HEADLINE: claudemem_faithful (LLM-summarize \\u2192 Chroma) = 0.80 on MAB N=10 v\\n * HEADLINE #2: MAB N=50 with Wilson CI \\u2014 ctx_v3 = 0.880 [0.762, 0.944] BEATS claud\\n * BUG FIXED 2026-04-25: retrieve_ctx_v3 + retrieve_claudemem_faithful used id(hays\\n * CORRECTED MAB N=50 Wilson CI: ctx 0.40 / ctx_v2 0.58 / chroma 0.78 / faithful 0.\\n * McNemar p-values N=50: ctx_v2 vs ctx p=0.049 SIGNIFICANT. 
ctx_v3 vs faithful p=0\"}}", "expected_exit_code": 0, "stdout_normalize": null, "elapsed_ms_observed": 87, "determinism_verified": true, "determinism_note": "two consecutive runs produced identical stdout+exitcode"} +{"id": "english_code_rank_decisions", "category": "english_code", "stdin": {"prompt": "where is the rank_decisions function defined", "session_id": "test", "transcript_path": "/tmp/test-transcript"}, "argv": ["--rich"], "env": {"HOME": "/tmp/ctx_golden_home", "CLAUDE_PROJECT_DIR": "/Users/d9ng/privateProject/_research/_util/CTX", "CTX_DISABLE_SEMANTIC_RERANK": "1", "CTX_CROSS_ENCODER": "0", "CTX_TELEMETRY": "", "CTX_DASHBOARD_INTERNAL": "1"}, "expected_stdout": "{\"hookSpecificOutput\": {\"hookEventName\": \"UserPromptSubmit\", \"additionalContext\": \"> **G2** (space search): (docs BM25) \\u2014 found via \\\"rank_decisions function defined\\\"\\n\\n[SESSION NOTES (\\ubbf8\\ucee4\\ubc0b \\ud310\\ub2e8)]\\n> 2026-04-08 14:08: G1.5 \\uad6c\\ud604 \\uc644\\ub8cc: /g1 \\uc2a4\\ud0ac(\\uc218\\ub3d9) + [G1] \\ub9c8\\ucee4 \\uc790\\ub3d9\\ucea1\\ucc98 + git-memory.py SESSION NOTES \\uc8fc\\uc785. 
topic-dedup\\uc740 temporal retention\\uc774 \\uc544\\ub2cc topic diversity\\ub97c \\ucd5c\\uc801\\ud654\\ud568 (age=15 \\uc2e4\\uce21)\\n[G2-GREP] Files matching 'rank_decisions function defined' (grep):\\n hf_space_dashboard/static/app.js\\n src/evaluator/llm_quality.py\\n src/data/dataset_generator.py\\n run_llm_eval_opensource.py\\n run_llm_eval.py\\n Start with: hf_space_dashboard/static/app.js, src/evaluator/llm_quality.py, src/data/dataset_generator.py\\n[KNOWN FACTS]\\n * MAB N=50 Wilson CI: none 0.00 [0.00,0.07] / ctx 0.40 [0.28,0.54] / ctx_v2 0.58 [\\n * LongMemEval (REAL) N=10: none=0.10 / ctx=0.10 / ctx_v2=0.30 / ctx_v3=0.30 / chro\\n * Final paper tables ready: MAB synthetic + MAB N=50+CI + LongMemEval real + PUAC \\n * PAPER HEADLINE: claudemem_faithful (LLM-summarize \\u2192 Chroma) = 0.80 on MAB N=10 v\\n * HEADLINE #2: MAB N=50 with Wilson CI \\u2014 ctx_v3 = 0.880 [0.762, 0.944] BEATS claud\\n * BUG FIXED 2026-04-25: retrieve_ctx_v3 + retrieve_claudemem_faithful used id(hays\\n * CORRECTED MAB N=50 Wilson CI: ctx 0.40 / ctx_v2 0.58 / chroma 0.78 / faithful 0.\\n * McNemar p-values N=50: ctx_v2 vs ctx p=0.049 SIGNIFICANT. 
ctx_v3 vs faithful p=0\"}}", "expected_exit_code": 0, "stdout_normalize": null, "elapsed_ms_observed": 78, "determinism_verified": true, "determinism_note": "two consecutive runs produced identical stdout+exitcode"} +{"id": "english_code_docs_loader", "category": "english_code", "stdin": {"prompt": "show me the docs corpus loader", "session_id": "test", "transcript_path": "/tmp/test-transcript"}, "argv": ["--rich"], "env": {"HOME": "/tmp/ctx_golden_home", "CLAUDE_PROJECT_DIR": "/Users/d9ng/privateProject/_research/_util/CTX", "CTX_DISABLE_SEMANTIC_RERANK": "1", "CTX_CROSS_ENCODER": "0", "CTX_TELEMETRY": "", "CTX_DASHBOARD_INTERNAL": "1"}, "expected_stdout": "{\"hookSpecificOutput\": {\"hookEventName\": \"UserPromptSubmit\", \"additionalContext\": \"> **G2** (space search): (docs BM25) \\u2014 found via \\\"show docs corpus\\\"\\n\\n[SESSION NOTES (\\ubbf8\\ucee4\\ubc0b \\ud310\\ub2e8)]\\n> 2026-04-08 14:08: G1.5 \\uad6c\\ud604 \\uc644\\ub8cc: /g1 \\uc2a4\\ud0ac(\\uc218\\ub3d9) + [G1] \\ub9c8\\ucee4 \\uc790\\ub3d9\\ucea1\\ucc98 + git-memory.py SESSION NOTES \\uc8fc\\uc785. 
topic-dedup\\uc740 temporal retention\\uc774 \\uc544\\ub2cc topic diversity\\ub97c \\ucd5c\\uc801\\ud654\\ud568 (age=15 \\uc2e4\\uce21)\\n[G2-GREP] Files matching 'show docs corpus' (grep):\\n src/evaluator/coir_evaluator.py\\n benchmarks/eval/doc_retrieval_eval_v2.py\\n src/retrieval/adaptive_trigger.py\\n benchmarks/eval/g2_docs_eval.py\\n src/hooks/_bm25/docs_search.py\\n Start with: src/evaluator/coir_evaluator.py, benchmarks/eval/doc_retrieval_eval_v2.py, src/retrieval/adaptive_trigger.py\\n[KNOWN FACTS]\\n * MAB N=50 Wilson CI: none 0.00 [0.00,0.07] / ctx 0.40 [0.28,0.54] / ctx_v2 0.58 [\\n * LongMemEval (REAL) N=10: none=0.10 / ctx=0.10 / ctx_v2=0.30 / ctx_v3=0.30 / chro\\n * Final paper tables ready: MAB synthetic + MAB N=50+CI + LongMemEval real + PUAC \\n * PAPER HEADLINE: claudemem_faithful (LLM-summarize \\u2192 Chroma) = 0.80 on MAB N=10 v\\n * HEADLINE #2: MAB N=50 with Wilson CI \\u2014 ctx_v3 = 0.880 [0.762, 0.944] BEATS claud\\n * BUG FIXED 2026-04-25: retrieve_ctx_v3 + retrieve_claudemem_faithful used id(hays\\n * CORRECTED MAB N=50 Wilson CI: ctx 0.40 / ctx_v2 0.58 / chroma 0.78 / faithful 0.\\n * McNemar p-values N=50: ctx_v2 vs ctx p=0.049 SIGNIFICANT. 
ctx_v3 vs faithful p=0\"}}", "expected_exit_code": 0, "stdout_normalize": null, "elapsed_ms_observed": 86, "determinism_verified": true, "determinism_note": "two consecutive runs produced identical stdout+exitcode"} +{"id": "noctx_bypass_hello", "category": "avoidance", "stdin": {"prompt": "[noctx] just say hello", "session_id": "test", "transcript_path": "/tmp/test-transcript"}, "argv": [], "env": {"HOME": "/tmp/ctx_golden_home", "CLAUDE_PROJECT_DIR": "/Users/d9ng/privateProject/_research/_util/CTX", "CTX_DISABLE_SEMANTIC_RERANK": "1", "CTX_CROSS_ENCODER": "0", "CTX_TELEMETRY": "", "CTX_DASHBOARD_INTERNAL": "1"}, "expected_stdout": "{\"hookSpecificOutput\": {\"hookEventName\": \"UserPromptSubmit\", \"additionalContext\": \"> **G2** (space search): (docs BM25) \\u2014 found via \\\"noctx just say\\\"\\n\\n[SESSION NOTES (\\ubbf8\\ucee4\\ubc0b \\ud310\\ub2e8)]\\n> 2026-04-08 14:08: G1.5 \\uad6c\\ud604 \\uc644\\ub8cc: /g1 \\uc2a4\\ud0ac(\\uc218\\ub3d9) + [G1] \\ub9c8\\ucee4 \\uc790\\ub3d9\\ucea1\\ucc98 + git-memory.py SESSION NOTES \\uc8fc\\uc785. 
topic-dedup\\uc740 temporal retention\\uc774 \\uc544\\ub2cc topic diversity\\ub97c \\ucd5c\\uc801\\ud654\\ud568 (age=15 \\uc2e4\\uce21)\\n[G2-GREP] Files matching 'noctx just say' (grep):\\n benchmarks/eval/g1_docs_bm25_eval.py\\n benchmarks/eval/g1_docs_memory_eval.py\\n hf_space_dashboard/static/styles.css\\n benchmarks/eval/doc_retrieval_eval_v2.py\\n hf_space_dashboard/static/app.js\\n Start with: benchmarks/eval/g1_docs_bm25_eval.py, benchmarks/eval/g1_docs_memory_eval.py, hf_space_dashboard/static/styles.css\"}}", "expected_exit_code": 0, "stdout_normalize": null, "elapsed_ms_observed": 72, "determinism_verified": true, "determinism_note": "two consecutive runs produced identical stdout+exitcode"} +{"id": "avoidance_fix_typo", "category": "avoidance", "stdin": {"prompt": "fix: typo in README", "session_id": "test", "transcript_path": "/tmp/test-transcript"}, "argv": [], "env": {"HOME": "/tmp/ctx_golden_home", "CLAUDE_PROJECT_DIR": "/Users/d9ng/privateProject/_research/_util/CTX", "CTX_DISABLE_SEMANTIC_RERANK": "1", "CTX_CROSS_ENCODER": "0", "CTX_TELEMETRY": "", "CTX_DASHBOARD_INTERNAL": "1"}, "expected_stdout": "{\"hookSpecificOutput\": {\"hookEventName\": \"UserPromptSubmit\", \"additionalContext\": \"> **G2** (space search): (docs BM25) \\u2014 found via \\\"fix typo README\\\"\\n\\n[SESSION NOTES (\\ubbf8\\ucee4\\ubc0b \\ud310\\ub2e8)]\\n> 2026-04-08 14:08: G1.5 \\uad6c\\ud604 \\uc644\\ub8cc: /g1 \\uc2a4\\ud0ac(\\uc218\\ub3d9) + [G1] \\ub9c8\\ucee4 \\uc790\\ub3d9\\ucea1\\ucc98 + git-memory.py SESSION NOTES \\uc8fc\\uc785. 
topic-dedup\\uc740 temporal retention\\uc774 \\uc544\\ub2cc topic diversity\\ub97c \\ucd5c\\uc801\\ud654\\ud568 (age=15 \\uc2e4\\uce21)\\n[G2-GREP] Files matching 'fix typo README' (grep):\\n src/hooks/_bm25/docs_search.py\\n benchmarks/eval/project_understanding_g1_eval.py\\n src/retrieval/adaptive_trigger.py\\n benchmarks/eval/expand_mab_templates.py\\n benchmarks/eval/retrieve_ctx_v2.py\\n Start with: src/hooks/_bm25/docs_search.py, benchmarks/eval/project_understanding_g1_eval.py, src/retrieval/adaptive_trigger.py\"}}", "expected_exit_code": 0, "stdout_normalize": null, "elapsed_ms_observed": 66, "determinism_verified": true, "determinism_note": "two consecutive runs produced identical stdout+exitcode"} +{"id": "empty_prompt", "category": "empty_short", "stdin": {"prompt": "", "session_id": "test", "transcript_path": "/tmp/test-transcript"}, "argv": ["--rich"], "env": {"HOME": "/tmp/ctx_golden_home", "CLAUDE_PROJECT_DIR": "/Users/d9ng/privateProject/_research/_util/CTX", "CTX_DISABLE_SEMANTIC_RERANK": "1", "CTX_CROSS_ENCODER": "0", "CTX_TELEMETRY": "", "CTX_DASHBOARD_INTERNAL": "1"}, "expected_stdout": "{\"hookSpecificOutput\": {\"hookEventName\": \"UserPromptSubmit\", \"additionalContext\": \"[SESSION NOTES (\\ubbf8\\ucee4\\ubc0b \\ud310\\ub2e8)]\\n> 2026-04-08 14:08: G1.5 \\uad6c\\ud604 \\uc644\\ub8cc: /g1 \\uc2a4\\ud0ac(\\uc218\\ub3d9) + [G1] \\ub9c8\\ucee4 \\uc790\\ub3d9\\ucea1\\ucc98 + git-memory.py SESSION NOTES \\uc8fc\\uc785. 
topic-dedup\\uc740 temporal retention\\uc774 \\uc544\\ub2cc topic diversity\\ub97c \\ucd5c\\uc801\\ud654\\ud568 (age=15 \\uc2e4\\uce21)\\n[KNOWN FACTS]\\n * MAB N=50 Wilson CI: none 0.00 [0.00,0.07] / ctx 0.40 [0.28,0.54] / ctx_v2 0.58 [\\n * LongMemEval (REAL) N=10: none=0.10 / ctx=0.10 / ctx_v2=0.30 / ctx_v3=0.30 / chro\\n * Final paper tables ready: MAB synthetic + MAB N=50+CI + LongMemEval real + PUAC \\n * PAPER HEADLINE: claudemem_faithful (LLM-summarize \\u2192 Chroma) = 0.80 on MAB N=10 v\\n * HEADLINE #2: MAB N=50 with Wilson CI \\u2014 ctx_v3 = 0.880 [0.762, 0.944] BEATS claud\\n * BUG FIXED 2026-04-25: retrieve_ctx_v3 + retrieve_claudemem_faithful used id(hays\\n * CORRECTED MAB N=50 Wilson CI: ctx 0.40 / ctx_v2 0.58 / chroma 0.78 / faithful 0.\\n * McNemar p-values N=50: ctx_v2 vs ctx p=0.049 SIGNIFICANT. ctx_v3 vs faithful p=0\"}}", "expected_exit_code": 0, "stdout_normalize": null, "elapsed_ms_observed": 35, "determinism_verified": true, "determinism_note": "two consecutive runs produced identical stdout+exitcode"} +{"id": "short_question_mark", "category": "empty_short", "stdin": {"prompt": "?", "session_id": "test", "transcript_path": "/tmp/test-transcript"}, "argv": ["--rich"], "env": {"HOME": "/tmp/ctx_golden_home", "CLAUDE_PROJECT_DIR": "/Users/d9ng/privateProject/_research/_util/CTX", "CTX_DISABLE_SEMANTIC_RERANK": "1", "CTX_CROSS_ENCODER": "0", "CTX_TELEMETRY": "", "CTX_DASHBOARD_INTERNAL": "1"}, "expected_stdout": "{\"hookSpecificOutput\": {\"hookEventName\": \"UserPromptSubmit\", \"additionalContext\": \"[SESSION NOTES (\\ubbf8\\ucee4\\ubc0b \\ud310\\ub2e8)]\\n> 2026-04-08 14:08: G1.5 \\uad6c\\ud604 \\uc644\\ub8cc: /g1 \\uc2a4\\ud0ac(\\uc218\\ub3d9) + [G1] \\ub9c8\\ucee4 \\uc790\\ub3d9\\ucea1\\ucc98 + git-memory.py SESSION NOTES \\uc8fc\\uc785. 
topic-dedup\\uc740 temporal retention\\uc774 \\uc544\\ub2cc topic diversity\\ub97c \\ucd5c\\uc801\\ud654\\ud568 (age=15 \\uc2e4\\uce21)\\n[KNOWN FACTS]\\n * MAB N=50 Wilson CI: none 0.00 [0.00,0.07] / ctx 0.40 [0.28,0.54] / ctx_v2 0.58 [\\n * LongMemEval (REAL) N=10: none=0.10 / ctx=0.10 / ctx_v2=0.30 / ctx_v3=0.30 / chro\\n * Final paper tables ready: MAB synthetic + MAB N=50+CI + LongMemEval real + PUAC \\n * PAPER HEADLINE: claudemem_faithful (LLM-summarize \\u2192 Chroma) = 0.80 on MAB N=10 v\\n * HEADLINE #2: MAB N=50 with Wilson CI \\u2014 ctx_v3 = 0.880 [0.762, 0.944] BEATS claud\\n * BUG FIXED 2026-04-25: retrieve_ctx_v3 + retrieve_claudemem_faithful used id(hays\\n * CORRECTED MAB N=50 Wilson CI: ctx 0.40 / ctx_v2 0.58 / chroma 0.78 / faithful 0.\\n * McNemar p-values N=50: ctx_v2 vs ctx p=0.049 SIGNIFICANT. ctx_v3 vs faithful p=0\"}}", "expected_exit_code": 0, "stdout_normalize": null, "elapsed_ms_observed": 37, "determinism_verified": true, "determinism_note": "two consecutive runs produced identical stdout+exitcode"} +{"id": "short_korean_laugh", "category": "empty_short", "stdin": {"prompt": "ㅋ", "session_id": "test", "transcript_path": "/tmp/test-transcript"}, "argv": ["--rich"], "env": {"HOME": "/tmp/ctx_golden_home", "CLAUDE_PROJECT_DIR": "/Users/d9ng/privateProject/_research/_util/CTX", "CTX_DISABLE_SEMANTIC_RERANK": "1", "CTX_CROSS_ENCODER": "0", "CTX_TELEMETRY": "", "CTX_DASHBOARD_INTERNAL": "1"}, "expected_stdout": "{\"hookSpecificOutput\": {\"hookEventName\": \"UserPromptSubmit\", \"additionalContext\": \"[SESSION NOTES (\\ubbf8\\ucee4\\ubc0b \\ud310\\ub2e8)]\\n> 2026-04-08 14:08: G1.5 \\uad6c\\ud604 \\uc644\\ub8cc: /g1 \\uc2a4\\ud0ac(\\uc218\\ub3d9) + [G1] \\ub9c8\\ucee4 \\uc790\\ub3d9\\ucea1\\ucc98 + git-memory.py SESSION NOTES \\uc8fc\\uc785. 
topic-dedup\\uc740 temporal retention\\uc774 \\uc544\\ub2cc topic diversity\\ub97c \\ucd5c\\uc801\\ud654\\ud568 (age=15 \\uc2e4\\uce21)\\n[KNOWN FACTS]\\n * MAB N=50 Wilson CI: none 0.00 [0.00,0.07] / ctx 0.40 [0.28,0.54] / ctx_v2 0.58 [\\n * LongMemEval (REAL) N=10: none=0.10 / ctx=0.10 / ctx_v2=0.30 / ctx_v3=0.30 / chro\\n * Final paper tables ready: MAB synthetic + MAB N=50+CI + LongMemEval real + PUAC \\n * PAPER HEADLINE: claudemem_faithful (LLM-summarize \\u2192 Chroma) = 0.80 on MAB N=10 v\\n * HEADLINE #2: MAB N=50 with Wilson CI \\u2014 ctx_v3 = 0.880 [0.762, 0.944] BEATS claud\\n * BUG FIXED 2026-04-25: retrieve_ctx_v3 + retrieve_claudemem_faithful used id(hays\\n * CORRECTED MAB N=50 Wilson CI: ctx 0.40 / ctx_v2 0.58 / chroma 0.78 / faithful 0.\\n * McNemar p-values N=50: ctx_v2 vs ctx p=0.049 SIGNIFICANT. ctx_v3 vs faithful p=0\"}}", "expected_exit_code": 0, "stdout_normalize": null, "elapsed_ms_observed": 37, "determinism_verified": true, "determinism_note": "two consecutive runs produced identical stdout+exitcode"} +{"id": "hooks_keyword_tokenizer", "category": "hooks_keyword", "stdin": {"prompt": "bm25-memory.py 의 토큰화 부분 알려줘", "session_id": "test", "transcript_path": "/tmp/test-transcript"}, "argv": ["--rich"], "env": {"HOME": "/tmp/ctx_golden_home", "CLAUDE_PROJECT_DIR": "/Users/d9ng/privateProject/_research/_util/CTX", "CTX_DISABLE_SEMANTIC_RERANK": "1", "CTX_CROSS_ENCODER": "0", "CTX_TELEMETRY": "", "CTX_DASHBOARD_INTERNAL": "1"}, "expected_stdout": "{\"hookSpecificOutput\": {\"hookEventName\": \"UserPromptSubmit\", \"additionalContext\": \"> **G2** (space search): (docs BM25) \\u2014 found via \\\"bm25 memory \\ud1a0\\ud070\\ud654\\\"\\n\\n[SESSION NOTES (\\ubbf8\\ucee4\\ubc0b \\ud310\\ub2e8)]\\n> 2026-04-08 14:08: G1.5 \\uad6c\\ud604 \\uc644\\ub8cc: /g1 \\uc2a4\\ud0ac(\\uc218\\ub3d9) + [G1] \\ub9c8\\ucee4 \\uc790\\ub3d9\\ucea1\\ucc98 + git-memory.py SESSION NOTES \\uc8fc\\uc785. 
topic-dedup\\uc740 temporal retention\\uc774 \\uc544\\ub2cc topic diversity\\ub97c \\ucd5c\\uc801\\ud654\\ud568 (age=15 \\uc2e4\\uce21)\\n[G2-GREP] Files matching 'bm25 memory \\ud1a0\\ud070\\ud654' (grep):\\n src/retrieval/adaptive_trigger.py\\n benchmarks/eval/aggregated_stat_report.py\\n benchmarks/eval/g2_docs_ab_test.py\\n benchmarks/eval/g1_docs_bm25_eval.py\\n src/hooks/chat-memory.py\\n Start with: src/retrieval/adaptive_trigger.py, benchmarks/eval/aggregated_stat_report.py, benchmarks/eval/g2_docs_ab_test.py\\n[KNOWN FACTS]\\n * MAB N=50 Wilson CI: none 0.00 [0.00,0.07] / ctx 0.40 [0.28,0.54] / ctx_v2 0.58 [\\n * LongMemEval (REAL) N=10: none=0.10 / ctx=0.10 / ctx_v2=0.30 / ctx_v3=0.30 / chro\\n * Final paper tables ready: MAB synthetic + MAB N=50+CI + LongMemEval real + PUAC \\n * PAPER HEADLINE: claudemem_faithful (LLM-summarize \\u2192 Chroma) = 0.80 on MAB N=10 v\\n * HEADLINE #2: MAB N=50 with Wilson CI \\u2014 ctx_v3 = 0.880 [0.762, 0.944] BEATS claud\\n * BUG FIXED 2026-04-25: retrieve_ctx_v3 + retrieve_claudemem_faithful used id(hays\\n * CORRECTED MAB N=50 Wilson CI: ctx 0.40 / ctx_v2 0.58 / chroma 0.78 / faithful 0.\\n * McNemar p-values N=50: ctx_v2 vs ctx p=0.049 SIGNIFICANT. 
ctx_v3 vs faithful p=0\"}}", "expected_exit_code": 0, "stdout_normalize": null, "elapsed_ms_observed": 66, "determinism_verified": true, "determinism_note": "two consecutive runs produced identical stdout+exitcode"} +{"id": "hooks_keyword_hook_search", "category": "hooks_keyword", "stdin": {"prompt": "hook 파일들 어디에 있어?", "session_id": "test", "transcript_path": "/tmp/test-transcript"}, "argv": ["--rich"], "env": {"HOME": "/tmp/ctx_golden_home", "CLAUDE_PROJECT_DIR": "/Users/d9ng/privateProject/_research/_util/CTX", "CTX_DISABLE_SEMANTIC_RERANK": "1", "CTX_CROSS_ENCODER": "0", "CTX_TELEMETRY": "", "CTX_DASHBOARD_INTERNAL": "1"}, "expected_stdout": "{\"hookSpecificOutput\": {\"hookEventName\": \"UserPromptSubmit\", \"additionalContext\": \"> **G2** (space search): (docs BM25) \\u2014 found via \\\"hook \\ud30c\\uc77c\\ub4e4 \\uc5b4\\ub514\\uc5d0\\\"\\n\\n[SESSION NOTES (\\ubbf8\\ucee4\\ubc0b \\ud310\\ub2e8)]\\n> 2026-04-08 14:08: G1.5 \\uad6c\\ud604 \\uc644\\ub8cc: /g1 \\uc2a4\\ud0ac(\\uc218\\ub3d9) + [G1] \\ub9c8\\ucee4 \\uc790\\ub3d9\\ucea1\\ucc98 + git-memory.py SESSION NOTES \\uc8fc\\uc785. 
topic-dedup\\uc740 temporal retention\\uc774 \\uc544\\ub2cc topic diversity\\ub97c \\ucd5c\\uc801\\ud654\\ud568 (age=15 \\uc2e4\\uce21)\\n[G2-GREP] Files matching 'hook \\ud30c\\uc77c\\ub4e4 \\uc5b4\\ub514\\uc5d0' (grep):\\n tests/unit/test_install_cli.py\\n src/cli/install.py\\n tests/unit/test_uninstall_cleanup.py\\n tests/unit/test_settings_patcher.py\\n tests/unit/test_bm25_memory_telemetry.py\\n Start with: tests/unit/test_install_cli.py, src/cli/install.py, tests/unit/test_uninstall_cleanup.py\\n[KNOWN FACTS]\\n * MAB N=50 Wilson CI: none 0.00 [0.00,0.07] / ctx 0.40 [0.28,0.54] / ctx_v2 0.58 [\\n * LongMemEval (REAL) N=10: none=0.10 / ctx=0.10 / ctx_v2=0.30 / ctx_v3=0.30 / chro\\n * Final paper tables ready: MAB synthetic + MAB N=50+CI + LongMemEval real + PUAC \\n * PAPER HEADLINE: claudemem_faithful (LLM-summarize \\u2192 Chroma) = 0.80 on MAB N=10 v\\n * HEADLINE #2: MAB N=50 with Wilson CI \\u2014 ctx_v3 = 0.880 [0.762, 0.944] BEATS claud\\n * BUG FIXED 2026-04-25: retrieve_ctx_v3 + retrieve_claudemem_faithful used id(hays\\n * CORRECTED MAB N=50 Wilson CI: ctx 0.40 / ctx_v2 0.58 / chroma 0.78 / faithful 0.\\n * McNemar p-values N=50: ctx_v2 vs ctx p=0.049 SIGNIFICANT. 
ctx_v3 vs faithful p=0\"}}", "expected_exit_code": 0, "stdout_normalize": null, "elapsed_ms_observed": 48, "determinism_verified": true, "determinism_note": "two consecutive runs produced identical stdout+exitcode"} +{"id": "single_keyword_bm25_location_bm25path", "category": "keyword_single", "stdin": {"prompt": "BM25 어디 있지?", "session_id": "test", "transcript_path": "/tmp/test-transcript"}, "argv": ["--rich"], "env": {"HOME": "/tmp/ctx_golden_home_bm25", "CLAUDE_PROJECT_DIR": "/Users/d9ng/privateProject/_research/_util/CTX", "CTX_DISABLE_SEMANTIC_RERANK": "1", "CTX_CROSS_ENCODER": "0", "CTX_TELEMETRY": "", "CTX_DASHBOARD_INTERNAL": "1"}, "python_bin": ".venv-golden/bin/python", "expected_stdout": "{\"hookSpecificOutput\": {\"hookEventName\": \"UserPromptSubmit\", \"additionalContext\": \"> **G1** (time memory): \\\"chore(refactor): add BM25-path golden fixtures for bm25-memory\\\" and 6 more\\n> **G2** (space search): `20260326-ctx-goal1-goal2-final.md`, `20260327-ctx-nemotron-g1g2-comparison.md`, `README.md` \\u2014 found via \\\"BM25 \\uc5b4\\ub514 \\uc788\\uc9c0\\\"\\n\\n[SESSION NOTES (\\ubbf8\\ucee4\\ubc0b \\ud310\\ub2e8)]\\n> 2026-04-08 14:08: G1.5 \\uad6c\\ud604 \\uc644\\ub8cc: /g1 \\uc2a4\\ud0ac(\\uc218\\ub3d9) + [G1] \\ub9c8\\ucee4 \\uc790\\ub3d9\\ucea1\\ucc98 + git-memory.py SESSION NOTES \\uc8fc\\uc785. 
topic-dedup\\uc740 temporal retention\\uc774 \\uc544\\ub2cc topic diversity\\ub97c \\ucd5c\\uc801\\ud654\\ud568 (age=15 \\uc2e4\\uce21)\\n[RECENT DECISIONS] (BM25: top 7 of 220)\\n > [2026-05-05] chore(refactor): add BM25-path golden fixtures for bm25-memory\\n > [2026-04-09] 20260409 bm25-memory: G1+G2 BM25 hook (recall 0.169\\u21920.881)\\n > [2026-03-30] live-infinite iter 3/\\u221e: BM25 stem \\uac15\\ud654 \\uc2e4\\ud5d8 (neutral)\\n > [2026-05-05] chore(refactor): capture golden fixtures for bm25-memory pre-decomposition\\n > [2026-04-27] feat: flywheel turns \\u2014 auto-tune BM25 params from local telemetry\\n > [2026-03-27] refactor: restore optimal BM25 blend ratio in rank_ctx_doc (norm*0.9)\\n > [2026-04-03] 20260403 COIR full corpus: BM25 Hit@5=0.640 on 280K docs\\n[G2-DOCS] (BM25+dense RRF relevant research docs)\\n > 20260326-ctx-goal1-goal2-final.md\\n **Date**: 2026-03-26\\n > 20260327-ctx-nemotron-g1g2-comparison.md\\n **Date**: 2026-03-27\\n > README.md\\n \\uc6d0\\ubcf8 [jaytoone/CTX](https://github.com/jaytoone/CTX) \\ub97c production-level \\ub85c \\ub9ac\\ud329\\ud1a0\\ub9c1/\\ubcf4\\uac15\\ud55c fork.\\n > CLAUDE.md\\n CTX = **Claude Code\\uc758 \\uc790\\ub3d9 context \\uc8fc\\uc785 \\uc2dc\\uc2a4\\ud15c**.\\n[G2-GREP] Files matching 'BM25 \\uc5b4\\ub514 \\uc788\\uc9c0' (grep):\\n src/retrieval/adaptive_trigger.py\\n benchmarks/eval/aggregated_stat_report.py\\n benchmarks/eval/g2_docs_ab_test.py\\n benchmarks/eval/g1_docs_bm25_eval.py\\n src/cli/telemetry.py\\n Start with: src/retrieval/adaptive_trigger.py, benchmarks/eval/aggregated_stat_report.py, benchmarks/eval/g2_docs_ab_test.py\\n[KNOWN FACTS]\\n * MAB N=50 Wilson CI: none 0.00 [0.00,0.07] / ctx 0.40 [0.28,0.54] / ctx_v2 0.58 [\\n * LongMemEval (REAL) N=10: none=0.10 / ctx=0.10 / ctx_v2=0.30 / ctx_v3=0.30 / chro\\n * Final paper tables ready: MAB synthetic + MAB N=50+CI + LongMemEval real + PUAC \\n * PAPER HEADLINE: claudemem_faithful (LLM-summarize \\u2192 Chroma) = 0.80 on MAB N=10 
v\\n * HEADLINE #2: MAB N=50 with Wilson CI \\u2014 ctx_v3 = 0.880 [0.762, 0.944] BEATS claud\\n * BUG FIXED 2026-04-25: retrieve_ctx_v3 + retrieve_claudemem_faithful used id(hays\\n * CORRECTED MAB N=50 Wilson CI: ctx 0.40 / ctx_v2 0.58 / chroma 0.78 / faithful 0.\\n * McNemar p-values N=50: ctx_v2 vs ctx p=0.049 SIGNIFICANT. ctx_v3 vs faithful p=0\"}}", "expected_exit_code": 0, "stdout_normalize": null, "elapsed_ms_observed": 316, "determinism_verified": true, "determinism_note": "two consecutive runs produced identical stdout+exitcode"} +{"id": "single_keyword_vec_daemon_bm25path", "category": "keyword_single", "stdin": {"prompt": "vec-daemon 코드 위치", "session_id": "test", "transcript_path": "/tmp/test-transcript"}, "argv": ["--rich"], "env": {"HOME": "/tmp/ctx_golden_home_bm25", "CLAUDE_PROJECT_DIR": "/Users/d9ng/privateProject/_research/_util/CTX", "CTX_DISABLE_SEMANTIC_RERANK": "1", "CTX_CROSS_ENCODER": "0", "CTX_TELEMETRY": "", "CTX_DASHBOARD_INTERNAL": "1"}, "python_bin": ".venv-golden/bin/python", "expected_stdout": "{\"hookSpecificOutput\": {\"hookEventName\": \"UserPromptSubmit\", \"additionalContext\": \"> **G1** (time memory): \\\"feat: auto-start bge-daemon on SessionStart (alongside vec-daemon)\\\" and 4 more\\n> **G2** (space search): `20260411-g1-g2-architecture-improvements.md`, `QUICK_REFERENCE.md`, `20260410-claude-vault-research.md` \\u2014 found via \\\"vec daemon \\uc704\\uce58\\\"\\n\\n[SESSION NOTES (\\ubbf8\\ucee4\\ubc0b \\ud310\\ub2e8)]\\n> 2026-04-08 14:08: G1.5 \\uad6c\\ud604 \\uc644\\ub8cc: /g1 \\uc2a4\\ud0ac(\\uc218\\ub3d9) + [G1] \\ub9c8\\ucee4 \\uc790\\ub3d9\\ucea1\\ucc98 + git-memory.py SESSION NOTES \\uc8fc\\uc785. 
topic-dedup\\uc740 temporal retention\\uc774 \\uc544\\ub2cc topic diversity\\ub97c \\ucd5c\\uc801\\ud654\\ud568 (age=15 \\uc2e4\\uce21)\\n[RECENT DECISIONS] (BM25: top 5 of 220)\\n > [2026-04-27] feat: auto-start bge-daemon on SessionStart (alongside vec-daemon)\\n > [2026-04-27] feat: ship vec-daemon + bge-daemon in wheel + feature comparison DOCX\\n > [2026-05-04] fix: vec-daemon venv isolation in plugin Setup + SessionStart + bump 0.3.12\\n > [2026-04-27] feat: daemon degradation warnings + dependency guard\\n > [2026-05-04] fix: bge-daemon opt-in via CTX_BGE_ENABLE=1 + bump 0.3.13\\n[G2-DOCS] (BM25+dense RRF relevant research docs)\\n > 20260411-g1-g2-architecture-improvements.md\\n **Date**: 2026-04-11 | **Type**: Implementation + empirical benchmark\\n > QUICK_REFERENCE.md\\n | MCP | \\ud604\\uc7ac \\uc0c1\\ud669 | \\uc790\\ub3d9\\ud654 \\ub300\\uc0c1 | \\uc6b0\\uc120\\uc21c\\uc704 |\\n > 20260410-claude-vault-research.md\\n **Date**: 2026-04-10 **Skill**: expert-research-v2\\n > 20260411-chat-memory-threshold-principled.md\\n **Date**: 2026-04-11 **Type**: Empirical analysis **Scope**: project (CTX)\\n > 20260410-vault-vector-migration-and-benchmark.md\\n **Date**: 2026-04-10 **Scope**: claude-vault chat memory \\uc2dc\\uc2a4\\ud15c\\n[G2-GREP] Files matching 'vec daemon \\uc704\\uce58' (grep):\\n tests/unit/test_install_cli.py\\n src/cli/install.py\\n src/hooks/bge-daemon.py\\n src/hooks/_bm25/rerank.py\\n src/hooks/vec-daemon.py\\n Start with: tests/unit/test_install_cli.py, src/cli/install.py, src/hooks/bge-daemon.py\\n[KNOWN FACTS]\\n * MAB N=50 Wilson CI: none 0.00 [0.00,0.07] / ctx 0.40 [0.28,0.54] / ctx_v2 0.58 [\\n * LongMemEval (REAL) N=10: none=0.10 / ctx=0.10 / ctx_v2=0.30 / ctx_v3=0.30 / chro\\n * Final paper tables ready: MAB synthetic + MAB N=50+CI + LongMemEval real + PUAC \\n * PAPER HEADLINE: claudemem_faithful (LLM-summarize \\u2192 Chroma) = 0.80 on MAB N=10 v\\n * HEADLINE #2: MAB N=50 with Wilson CI \\u2014 ctx_v3 = 0.880 [0.762, 0.944] 
BEATS claud\\n * BUG FIXED 2026-04-25: retrieve_ctx_v3 + retrieve_claudemem_faithful used id(hays\\n * CORRECTED MAB N=50 Wilson CI: ctx 0.40 / ctx_v2 0.58 / chroma 0.78 / faithful 0.\\n * McNemar p-values N=50: ctx_v2 vs ctx p=0.049 SIGNIFICANT. ctx_v3 vs faithful p=0\"}}", "expected_exit_code": 0, "stdout_normalize": null, "elapsed_ms_observed": 242, "determinism_verified": true, "determinism_note": "two consecutive runs produced identical stdout+exitcode"} +{"id": "single_keyword_rank_bm25_bm25path", "category": "keyword_single", "stdin": {"prompt": "rank_bm25", "session_id": "test", "transcript_path": "/tmp/test-transcript"}, "argv": ["--rich"], "env": {"HOME": "/tmp/ctx_golden_home_bm25", "CLAUDE_PROJECT_DIR": "/Users/d9ng/privateProject/_research/_util/CTX", "CTX_DISABLE_SEMANTIC_RERANK": "1", "CTX_CROSS_ENCODER": "0", "CTX_TELEMETRY": "", "CTX_DASHBOARD_INTERNAL": "1"}, "python_bin": ".venv-golden/bin/python", "expected_stdout": "{\"hookSpecificOutput\": {\"hookEventName\": \"UserPromptSubmit\", \"additionalContext\": \"> **G2** (space search): `20260426-g2-docs-hybrid-dense-retrieval.md`, `20260328-ctx-real-codebase-g2-eval.md`, `20260426-mab-longmemeval-validity-for-ctx.md` \\u2014 found via \\\"rank_bm25\\\"\\n\\n[SESSION NOTES (\\ubbf8\\ucee4\\ubc0b \\ud310\\ub2e8)]\\n> 2026-04-08 14:08: G1.5 \\uad6c\\ud604 \\uc644\\ub8cc: /g1 \\uc2a4\\ud0ac(\\uc218\\ub3d9) + [G1] \\ub9c8\\ucee4 \\uc790\\ub3d9\\ucea1\\ucc98 + git-memory.py SESSION NOTES \\uc8fc\\uc785. 
topic-dedup\\uc740 temporal retention\\uc774 \\uc544\\ub2cc topic diversity\\ub97c \\ucd5c\\uc801\\ud654\\ud568 (age=15 \\uc2e4\\uce21)\\n[G2-DOCS] (BM25+dense RRF relevant research docs)\\n > 20260426-g2-docs-hybrid-dense-retrieval.md\\n **Date**: 2026-04-26 **Iteration**: 36\\n > 20260328-ctx-real-codebase-g2-eval.md\\n **Date**: 2026-03-28 **Type**: Real Codebase Ablation (G2 only) **Backend**: MiniMax M2.5\\n > 20260426-mab-longmemeval-validity-for-ctx.md\\n **Date**: 2026-04-26 **Skill**: expert-research-v2\\n > 20260327-ctx-real-project-self-eval.md\\n **Date**: 2026-03-27 **Type**: Empirical Validation\\n > 20260328-ctx-downstream-nemotron-eval.md\\n **Date**: 2026-03-28\\n[G2-GREP] Files matching 'rank_bm25' (grep):\\n benchmarks/eval/claude_sonnet_ctx_eval.py\\n benchmarks/eval/nemotron_ctx_eval_v3.py\\n benchmarks/eval/nemotron_ctx_eval_v4.py\\n benchmarks/eval/doc_retrieval_eval_v2.py\\n benchmarks/eval/nemotron_ctx_eval.py\\n Start with: benchmarks/eval/claude_sonnet_ctx_eval.py, benchmarks/eval/nemotron_ctx_eval_v3.py, benchmarks/eval/nemotron_ctx_eval_v4.py\\n[KNOWN FACTS]\\n * MAB N=50 Wilson CI: none 0.00 [0.00,0.07] / ctx 0.40 [0.28,0.54] / ctx_v2 0.58 [\\n * LongMemEval (REAL) N=10: none=0.10 / ctx=0.10 / ctx_v2=0.30 / ctx_v3=0.30 / chro\\n * Final paper tables ready: MAB synthetic + MAB N=50+CI + LongMemEval real + PUAC \\n * PAPER HEADLINE: claudemem_faithful (LLM-summarize \\u2192 Chroma) = 0.80 on MAB N=10 v\\n * HEADLINE #2: MAB N=50 with Wilson CI \\u2014 ctx_v3 = 0.880 [0.762, 0.944] BEATS claud\\n * BUG FIXED 2026-04-25: retrieve_ctx_v3 + retrieve_claudemem_faithful used id(hays\\n * CORRECTED MAB N=50 Wilson CI: ctx 0.40 / ctx_v2 0.58 / chroma 0.78 / faithful 0.\\n * McNemar p-values N=50: ctx_v2 vs ctx p=0.049 SIGNIFICANT. 
ctx_v3 vs faithful p=0\"}}", "expected_exit_code": 0, "stdout_normalize": null, "elapsed_ms_observed": 230, "determinism_verified": true, "determinism_note": "two consecutive runs produced identical stdout+exitcode"} +{"id": "korean_paraphrase_decision_mem_bm25path", "category": "korean_paraphrase", "stdin": {"prompt": "의사결정 기억은 어떻게 관리되나요?", "session_id": "test", "transcript_path": "/tmp/test-transcript"}, "argv": ["--rich"], "env": {"HOME": "/tmp/ctx_golden_home_bm25", "CLAUDE_PROJECT_DIR": "/Users/d9ng/privateProject/_research/_util/CTX", "CTX_DISABLE_SEMANTIC_RERANK": "1", "CTX_CROSS_ENCODER": "0", "CTX_TELEMETRY": "", "CTX_DASHBOARD_INTERNAL": "1"}, "python_bin": ".venv-golden/bin/python", "expected_stdout": "{\"hookSpecificOutput\": {\"hookEventName\": \"UserPromptSubmit\", \"additionalContext\": \"> **G1** (time memory): \\\"20260407 git-memory: universal decision detection (feat:/fix:/v-versio\\\" and 6 more\\n> **G2** (space search): `20260409-g1g2-critique-and-verification.md`, `README.md`, `20260410-claude-vault-research.md` \\u2014 found via \\\"\\uc758\\uc0ac\\uacb0\\uc815 \\uae30\\uc5b5\\uc740 \\uad00\\ub9ac\\ub418\\ub098\\uc694\\\"\\n\\n[SESSION NOTES (\\ubbf8\\ucee4\\ubc0b \\ud310\\ub2e8)]\\n> 2026-04-08 14:08: G1.5 \\uad6c\\ud604 \\uc644\\ub8cc: /g1 \\uc2a4\\ud0ac(\\uc218\\ub3d9) + [G1] \\ub9c8\\ucee4 \\uc790\\ub3d9\\ucea1\\ucc98 + git-memory.py SESSION NOTES \\uc8fc\\uc785. 
topic-dedup\\uc740 temporal retention\\uc774 \\uc544\\ub2cc topic diversity\\ub97c \\ucd5c\\uc801\\ud654\\ud568 (age=15 \\uc2e4\\uce21)\\n[RECENT DECISIONS] (BM25: top 7 of 220)\\n > [2026-04-07] 20260407 git-memory: universal decision detection (feat:/fix:/v-version patterns)\\n > [2026-04-09] 20260409 bm25-memory: G1+G2 BM25 hook (recall 0.169\\u21920.881)\\n > [2026-04-09] 20260409 1751 bm25-memory: G2-DOCS threshold fix + G1 recall \\uac80\\uc99d \\uc644\\ub8cc - BM25 threshold 0\\u21923.0: \\ud55c\\uad6d\\uc5b4 \\ud1a0\\ud070 \\uac70\\uc9d3\\uc591\\uc131 \\uc81c\\uac70 + G2-DOCS \\n > [2026-04-11] live iter 1/5: hook \\uc544\\ud0a4\\ud14d\\ucc98 \\uc2dc\\uac04/\\uacf5\\uac04 \\uae30\\uc5b5 \\uc0c1\\ud55c\\uc120 \\uc2e4\\uc99d \\uce21\\uc815 \\uc644\\ub8cc\\n > [2026-05-05] chore(refactor): add BM25-path golden fixtures for bm25-memory\\n > [2026-05-02] feat: chat-memory vault exclusion + bump 0.3.10 \\u2192 0.3.11\\n > [2026-05-05] chore(refactor): capture golden fixtures for bm25-memory pre-decomposition\\n[G2-DOCS] (BM25+dense RRF relevant research docs)\\n > 20260409-g1g2-critique-and-verification.md\\n **Date**: 2026-04-09\\n > README.md\\n \\uc6d0\\ubcf8 [jaytoone/CTX](https://github.com/jaytoone/CTX) \\ub97c production-level \\ub85c \\ub9ac\\ud329\\ud1a0\\ub9c1/\\ubcf4\\uac15\\ud55c fork.\\n > 20260410-claude-vault-research.md\\n **Date**: 2026-04-10 **Skill**: expert-research-v2\\n > 20260328-ctx-real-codebase-g2-eval.md\\n **Date**: 2026-03-28 **Type**: Real Codebase Ablation (G2 only) **Backend**: MiniMax M2.5\\n > 20260408-g1-longterm-memory-evaluation-framework.md\\n **Date**: 2026-04-08 **Skill**: expert-research-v2 **Type**: Methodology Research\\n[KNOWN FACTS]\\n * MAB N=50 Wilson CI: none 0.00 [0.00,0.07] / ctx 0.40 [0.28,0.54] / ctx_v2 0.58 [\\n * LongMemEval (REAL) N=10: none=0.10 / ctx=0.10 / ctx_v2=0.30 / ctx_v3=0.30 / chro\\n * Final paper tables ready: MAB synthetic + MAB N=50+CI + LongMemEval real + PUAC \\n * PAPER HEADLINE: 
claudemem_faithful (LLM-summarize \\u2192 Chroma) = 0.80 on MAB N=10 v\\n * HEADLINE #2: MAB N=50 with Wilson CI \\u2014 ctx_v3 = 0.880 [0.762, 0.944] BEATS claud\\n * BUG FIXED 2026-04-25: retrieve_ctx_v3 + retrieve_claudemem_faithful used id(hays\\n * CORRECTED MAB N=50 Wilson CI: ctx 0.40 / ctx_v2 0.58 / chroma 0.78 / faithful 0.\\n * McNemar p-values N=50: ctx_v2 vs ctx p=0.049 SIGNIFICANT. ctx_v3 vs faithful p=0\"}}", "expected_exit_code": 0, "stdout_normalize": null, "elapsed_ms_observed": 232, "determinism_verified": true, "determinism_note": "two consecutive runs produced identical stdout+exitcode"} +{"id": "korean_paraphrase_rerank_logic_bm25path", "category": "korean_paraphrase", "stdin": {"prompt": "검색 결과 재랭킹 로직 보고 싶습니다", "session_id": "test", "transcript_path": "/tmp/test-transcript"}, "argv": ["--rich"], "env": {"HOME": "/tmp/ctx_golden_home_bm25", "CLAUDE_PROJECT_DIR": "/Users/d9ng/privateProject/_research/_util/CTX", "CTX_DISABLE_SEMANTIC_RERANK": "1", "CTX_CROSS_ENCODER": "0", "CTX_TELEMETRY": "", "CTX_DASHBOARD_INTERNAL": "1"}, "python_bin": ".venv-golden/bin/python", "expected_stdout": "{\"hookSpecificOutput\": {\"hookEventName\": \"UserPromptSubmit\", \"additionalContext\": \"> **G1** (time memory): \\\"20260405 G2 prefetch benchmark: 30% -> 65% after ko-en mapping + filep\\\" and 5 more\\n> **G2** (space search): `QUICK_REFERENCE.md`, `20260327-ctx-paper-numbers-critique.md`, `20260328-ctx-downstream-nemotron-eval-v2.md` \\u2014 found via \\\"search retrieve find\\\"\\n\\n[SESSION NOTES (\\ubbf8\\ucee4\\ubc0b \\ud310\\ub2e8)]\\n> 2026-04-08 14:08: G1.5 \\uad6c\\ud604 \\uc644\\ub8cc: /g1 \\uc2a4\\ud0ac(\\uc218\\ub3d9) + [G1] \\ub9c8\\ucee4 \\uc790\\ub3d9\\ucea1\\ucc98 + git-memory.py SESSION NOTES \\uc8fc\\uc785. 
topic-dedup\\uc740 temporal retention\\uc774 \\uc544\\ub2cc topic diversity\\ub97c \\ucd5c\\uc801\\ud654\\ud568 (age=15 \\uc2e4\\uce21)\\n[RECENT DECISIONS] (BM25: top 6 of 220)\\n > [2026-04-06] 20260405 G2 prefetch benchmark: 30% -> 65% after ko-en mapping + filepath search\\n > [2026-03-25] Add document retrieval support to CTX hook + eval\\n > [2026-04-27] fix: G2-DOCS candidates=None \\u2192 actual corpus size (retrieval coverage gap)\\n > [2026-04-02] 20260402 1730 live-infinite iter 12/\\u221e: G1 redefined \\u2014 zero-storage instant retrieval\\n > [2026-04-27] feat: Stage 3 local loop \\u2014 profile-aware retrieval + auto-tune badge (v0.3.3)\\n > [2026-04-09] 20260409 PageIndex + BM25 docs eval: G1 long-term memory via research doc retrieval\\n[G2-DOCS] (BM25+dense RRF relevant research docs)\\n > QUICK_REFERENCE.md\\n | MCP | \\ud604\\uc7ac \\uc0c1\\ud669 | \\uc790\\ub3d9\\ud654 \\ub300\\uc0c1 | \\uc6b0\\uc120\\uc21c\\uc704 |\\n > 20260327-ctx-paper-numbers-critique.md\\n **Date**: 2026-03-27 **Skill**: expert-research-v2\\n > 20260328-ctx-downstream-nemotron-eval-v2.md\\n **Date**: 2026-03-28\\n > 20260327-ctx-nemotron-g1g2-comparison.md\\n **Date**: 2026-03-27\\n > MCP_AUTOMATION_RESEARCH.md\\n **\\uc0dd\\uc131\\uc77c**: 2026-03-25\\n[G2-GREP] Files matching 'search retrieve find' (grep):\\n benchmarks/eval/hook_comparison_eval.py\\n src/retrieval/adaptive_trigger.py\\n src/evaluator/benchmark_runner.py\\n src/evaluator/coir_evaluator.py\\n src/evaluator/repobench_evaluator.py\\n Start with: benchmarks/eval/hook_comparison_eval.py, src/retrieval/adaptive_trigger.py, src/evaluator/benchmark_runner.py\\n[KNOWN FACTS]\\n * MAB N=50 Wilson CI: none 0.00 [0.00,0.07] / ctx 0.40 [0.28,0.54] / ctx_v2 0.58 [\\n * LongMemEval (REAL) N=10: none=0.10 / ctx=0.10 / ctx_v2=0.30 / ctx_v3=0.30 / chro\\n * Final paper tables ready: MAB synthetic + MAB N=50+CI + LongMemEval real + PUAC \\n * PAPER HEADLINE: claudemem_faithful (LLM-summarize \\u2192 Chroma) = 0.80 on MAB 
N=10 v\\n * HEADLINE #2: MAB N=50 with Wilson CI \\u2014 ctx_v3 = 0.880 [0.762, 0.944] BEATS claud\\n * BUG FIXED 2026-04-25: retrieve_ctx_v3 + retrieve_claudemem_faithful used id(hays\\n * CORRECTED MAB N=50 Wilson CI: ctx 0.40 / ctx_v2 0.58 / chroma 0.78 / faithful 0.\\n * McNemar p-values N=50: ctx_v2 vs ctx p=0.049 SIGNIFICANT. ctx_v3 vs faithful p=0\"}}", "expected_exit_code": 0, "stdout_normalize": null, "elapsed_ms_observed": 294, "determinism_verified": true, "determinism_note": "two consecutive runs produced identical stdout+exitcode"} +{"id": "english_code_rank_decisions_bm25path", "category": "english_code", "stdin": {"prompt": "where is the rank_decisions function defined", "session_id": "test", "transcript_path": "/tmp/test-transcript"}, "argv": ["--rich"], "env": {"HOME": "/tmp/ctx_golden_home_bm25", "CLAUDE_PROJECT_DIR": "/Users/d9ng/privateProject/_research/_util/CTX", "CTX_DISABLE_SEMANTIC_RERANK": "1", "CTX_CROSS_ENCODER": "0", "CTX_TELEMETRY": "", "CTX_DASHBOARD_INTERNAL": "1"}, "python_bin": ".venv-golden/bin/python", "expected_stdout": "{\"hookSpecificOutput\": {\"hookEventName\": \"UserPromptSubmit\", \"additionalContext\": \"> **G1** (time memory): \\\"20260402 1800 omc-live iter 1: SYMBOL_PATTERN fix \\u2014 PascalCase after f\\\" and 0 more\\n> **G2** (space search): `20260426-mab-longmemeval-validity-for-ctx.md`, `20260421-ctx-monetization-session-summary.md`, `20260330-ctx-academic-critique-web-grounded.md` \\u2014 found via \\\"rank_decisions function defined\\\"\\n\\n[SESSION NOTES (\\ubbf8\\ucee4\\ubc0b \\ud310\\ub2e8)]\\n> 2026-04-08 14:08: G1.5 \\uad6c\\ud604 \\uc644\\ub8cc: /g1 \\uc2a4\\ud0ac(\\uc218\\ub3d9) + [G1] \\ub9c8\\ucee4 \\uc790\\ub3d9\\ucea1\\ucc98 + git-memory.py SESSION NOTES \\uc8fc\\uc785. 
topic-dedup\\uc740 temporal retention\\uc774 \\uc544\\ub2cc topic diversity\\ub97c \\ucd5c\\uc801\\ud654\\ud568 (age=15 \\uc2e4\\uce21)\\n[RECENT DECISIONS] (BM25: top 1 of 220)\\n > [2026-04-02] 20260402 1800 omc-live iter 1: SYMBOL_PATTERN fix \\u2014 PascalCase after function/class keyword\\n[G2-DOCS] (BM25+dense RRF relevant research docs)\\n > 20260426-mab-longmemeval-validity-for-ctx.md\\n **Date**: 2026-04-26 **Skill**: expert-research-v2\\n > 20260421-ctx-monetization-session-summary.md\\n **Session goal**: Turn the CTX dashboard into a monetizable product surface \\u2014 optimize loading, eval proof score against\\n > 20260330-ctx-academic-critique-web-grounded.md\\n **Date**: 2026-03-30 **Skill**: expert-research-v2\\n[G2-GREP] Files matching 'rank_decisions function defined' (grep):\\n hf_space_dashboard/static/app.js\\n src/evaluator/llm_quality.py\\n src/data/dataset_generator.py\\n run_llm_eval_opensource.py\\n run_llm_eval.py\\n Start with: hf_space_dashboard/static/app.js, src/evaluator/llm_quality.py, src/data/dataset_generator.py\\n[KNOWN FACTS]\\n * MAB N=50 Wilson CI: none 0.00 [0.00,0.07] / ctx 0.40 [0.28,0.54] / ctx_v2 0.58 [\\n * LongMemEval (REAL) N=10: none=0.10 / ctx=0.10 / ctx_v2=0.30 / ctx_v3=0.30 / chro\\n * Final paper tables ready: MAB synthetic + MAB N=50+CI + LongMemEval real + PUAC \\n * PAPER HEADLINE: claudemem_faithful (LLM-summarize \\u2192 Chroma) = 0.80 on MAB N=10 v\\n * HEADLINE #2: MAB N=50 with Wilson CI \\u2014 ctx_v3 = 0.880 [0.762, 0.944] BEATS claud\\n * BUG FIXED 2026-04-25: retrieve_ctx_v3 + retrieve_claudemem_faithful used id(hays\\n * CORRECTED MAB N=50 Wilson CI: ctx 0.40 / ctx_v2 0.58 / chroma 0.78 / faithful 0.\\n * McNemar p-values N=50: ctx_v2 vs ctx p=0.049 SIGNIFICANT. 
ctx_v3 vs faithful p=0\"}}", "expected_exit_code": 0, "stdout_normalize": null, "elapsed_ms_observed": 284, "determinism_verified": true, "determinism_note": "two consecutive runs produced identical stdout+exitcode"} +{"id": "english_code_docs_loader_bm25path", "category": "english_code", "stdin": {"prompt": "show me the docs corpus loader", "session_id": "test", "transcript_path": "/tmp/test-transcript"}, "argv": ["--rich"], "env": {"HOME": "/tmp/ctx_golden_home_bm25", "CLAUDE_PROJECT_DIR": "/Users/d9ng/privateProject/_research/_util/CTX", "CTX_DISABLE_SEMANTIC_RERANK": "1", "CTX_CROSS_ENCODER": "0", "CTX_TELEMETRY": "", "CTX_DASHBOARD_INTERNAL": "1"}, "python_bin": ".venv-golden/bin/python", "expected_stdout": "{\"hookSpecificOutput\": {\"hookEventName\": \"UserPromptSubmit\", \"additionalContext\": \"> **G1** (time memory): \\\"fix: G2-DOCS candidates=None \\u2192 actual corpus size (retrieval coverage \\\" and 6 more\\n> **G2** (space search): `20260427-dashboard-semantic-appeal-spec.md`, `20260402-omc-live-infinite-progress-report.md`, `20260407-g1-temporal-eval-results.md` \\u2014 found via \\\"show docs corpus\\\"\\n\\n[SESSION NOTES (\\ubbf8\\ucee4\\ubc0b \\ud310\\ub2e8)]\\n> 2026-04-08 14:08: G1.5 \\uad6c\\ud604 \\uc644\\ub8cc: /g1 \\uc2a4\\ud0ac(\\uc218\\ub3d9) + [G1] \\ub9c8\\ucee4 \\uc790\\ub3d9\\ucea1\\ucc98 + git-memory.py SESSION NOTES \\uc8fc\\uc785. 
topic-dedup\\uc740 temporal retention\\uc774 \\uc544\\ub2cc topic diversity\\ub97c \\ucd5c\\uc801\\ud654\\ud568 (age=15 \\uc2e4\\uce21)\\n[RECENT DECISIONS] (BM25: top 7 of 220)\\n > [2026-04-27] fix: G2-DOCS candidates=None \\u2192 actual corpus size (retrieval coverage gap)\\n > [2026-04-03] 20260403 COIR full corpus: BM25 Hit@5=0.640 on 280K docs\\n > [2026-04-03] 20260403 live-inf iter 6/\\u221e: docs fully separated from BM25 corpus + file_paths\\n > [2026-05-05] chore(refactor): refresh BM25-path fixtures after golden commit added to G1 corpus\\n > [2026-04-03] 20260403 COIR standard benchmark: BM25 Hit@5=0.780 on CodeSearchNet Python (24.9K corpus)\\n > [2026-04-11] docs: 20260411 g1 arch improvements Obsidian wikilink sync\\n > [2026-03-27] docs: CTX downstream LLM eval report + DOC_INDEX \\uc5c5\\ub370\\uc774\\ud2b8\\n[G2-DOCS] (BM25+dense RRF relevant research docs)\\n > 20260427-dashboard-semantic-appeal-spec.md\\n **Date**: 2026-04-27 **Type**: Design Research + Implementation Spec\\n > 20260402-omc-live-infinite-progress-report.md\\n **Date**: 2026-04-02\\n > 20260407-g1-temporal-eval-results.md\\n **Date**: 2026-04-07 **Type**: Empirical measurement\\n > 20260408-g1-longterm-eval-initial-results.md\\n **Date**: 2026-04-08\\n > 20260417-ctx-semantic-search-upgrade-sota.md\\n **Date**: 2026-04-17 **Skill**: expert-research-v2\\n[G2-GREP] Files matching 'show docs corpus' (grep):\\n src/evaluator/coir_evaluator.py\\n benchmarks/eval/doc_retrieval_eval_v2.py\\n src/retrieval/adaptive_trigger.py\\n benchmarks/eval/g2_docs_eval.py\\n src/hooks/_bm25/docs_search.py\\n Start with: src/evaluator/coir_evaluator.py, benchmarks/eval/doc_retrieval_eval_v2.py, src/retrieval/adaptive_trigger.py\\n[KNOWN FACTS]\\n * MAB N=50 Wilson CI: none 0.00 [0.00,0.07] / ctx 0.40 [0.28,0.54] / ctx_v2 0.58 [\\n * LongMemEval (REAL) N=10: none=0.10 / ctx=0.10 / ctx_v2=0.30 / ctx_v3=0.30 / chro\\n * Final paper tables ready: MAB synthetic + MAB N=50+CI + LongMemEval real + PUAC \\n 
* PAPER HEADLINE: claudemem_faithful (LLM-summarize \\u2192 Chroma) = 0.80 on MAB N=10 v\\n * HEADLINE #2: MAB N=50 with Wilson CI \\u2014 ctx_v3 = 0.880 [0.762, 0.944] BEATS claud\\n * BUG FIXED 2026-04-25: retrieve_ctx_v3 + retrieve_claudemem_faithful used id(hays\\n * CORRECTED MAB N=50 Wilson CI: ctx 0.40 / ctx_v2 0.58 / chroma 0.78 / faithful 0.\\n * McNemar p-values N=50: ctx_v2 vs ctx p=0.049 SIGNIFICANT. ctx_v3 vs faithful p=0\"}}", "expected_exit_code": 0, "stdout_normalize": null, "elapsed_ms_observed": 286, "determinism_verified": true, "determinism_note": "two consecutive runs produced identical stdout+exitcode"} +{"id": "noctx_bypass_hello_bm25path", "category": "avoidance", "stdin": {"prompt": "[noctx] just say hello", "session_id": "test", "transcript_path": "/tmp/test-transcript"}, "argv": [], "env": {"HOME": "/tmp/ctx_golden_home_bm25", "CLAUDE_PROJECT_DIR": "/Users/d9ng/privateProject/_research/_util/CTX", "CTX_DISABLE_SEMANTIC_RERANK": "1", "CTX_CROSS_ENCODER": "0", "CTX_TELEMETRY": "", "CTX_DASHBOARD_INTERNAL": "1"}, "python_bin": ".venv-golden/bin/python", "expected_stdout": "{\"hookSpecificOutput\": {\"hookEventName\": \"UserPromptSubmit\", \"additionalContext\": \"> **G2** (space search): `README.md`, `20260419-ctx-report-visibility-research.md`, `20260427-dashboard-ui-multilens-eval.md` \\u2014 found via \\\"noctx just say\\\"\\n\\n[SESSION NOTES (\\ubbf8\\ucee4\\ubc0b \\ud310\\ub2e8)]\\n> 2026-04-08 14:08: G1.5 \\uad6c\\ud604 \\uc644\\ub8cc: /g1 \\uc2a4\\ud0ac(\\uc218\\ub3d9) + [G1] \\ub9c8\\ucee4 \\uc790\\ub3d9\\ucea1\\ucc98 + git-memory.py SESSION NOTES \\uc8fc\\uc785. 
topic-dedup\\uc740 temporal retention\\uc774 \\uc544\\ub2cc topic diversity\\ub97c \\ucd5c\\uc801\\ud654\\ud568 (age=15 \\uc2e4\\uce21)\\n[G2-DOCS] (BM25+dense RRF relevant research docs)\\n > README.md\\n \\uc6d0\\ubcf8 [jaytoone/CTX](https://github.com/jaytoone/CTX) \\ub97c production-level \\ub85c \\ub9ac\\ud329\\ud1a0\\ub9c1/\\ubcf4\\uac15\\ud55c fork.\\n > 20260419-ctx-report-visibility-research.md\\n **Date**: 2026-04-19 **Skill**: expert-research-v2\\n > 20260427-dashboard-ui-multilens-eval.md\\n **Date**: 2026-04-27 **Type**: Design Evaluation + Implementation Spec\\n > 20260424-memory-retrieval-benchmark-landscape.md\\n **Date**: 2026-04-24\\n[G2-GREP] Files matching 'noctx just say' (grep):\\n benchmarks/eval/g1_docs_bm25_eval.py\\n benchmarks/eval/g1_docs_memory_eval.py\\n hf_space_dashboard/static/styles.css\\n benchmarks/eval/doc_retrieval_eval_v2.py\\n hf_space_dashboard/static/app.js\\n Start with: benchmarks/eval/g1_docs_bm25_eval.py, benchmarks/eval/g1_docs_memory_eval.py, hf_space_dashboard/static/styles.css\"}}", "expected_exit_code": 0, "stdout_normalize": null, "elapsed_ms_observed": 269, "determinism_verified": true, "determinism_note": "two consecutive runs produced identical stdout+exitcode"} +{"id": "avoidance_fix_typo_bm25path", "category": "avoidance", "stdin": {"prompt": "fix: typo in README", "session_id": "test", "transcript_path": "/tmp/test-transcript"}, "argv": [], "env": {"HOME": "/tmp/ctx_golden_home_bm25", "CLAUDE_PROJECT_DIR": "/Users/d9ng/privateProject/_research/_util/CTX", "CTX_DISABLE_SEMANTIC_RERANK": "1", "CTX_CROSS_ENCODER": "0", "CTX_TELEMETRY": "", "CTX_DASHBOARD_INTERNAL": "1"}, "python_bin": ".venv-golden/bin/python", "expected_stdout": "{\"hookSpecificOutput\": {\"hookEventName\": \"UserPromptSubmit\", \"additionalContext\": \"> **G1** (time memory): \\\"fix: align all v1.5 schema references in upload + README (v0.2.7)\\\" and 6 more\\n> **G2** (space search): `20260427-ctx-plugin-distribution-research.md`, 
`20260423-ctx-vs-claudemem-evaluation-rubric-v2-paper-tier.md`, `20260427-ctx-flywheel-data-coverage.md` \\u2014 found via \\\"fix typo README\\\"\\n\\n[SESSION NOTES (\\ubbf8\\ucee4\\ubc0b \\ud310\\ub2e8)]\\n> 2026-04-08 14:08: G1.5 \\uad6c\\ud604 \\uc644\\ub8cc: /g1 \\uc2a4\\ud0ac(\\uc218\\ub3d9) + [G1] \\ub9c8\\ucee4 \\uc790\\ub3d9\\ucea1\\ucc98 + git-memory.py SESSION NOTES \\uc8fc\\uc785. topic-dedup\\uc740 temporal retention\\uc774 \\uc544\\ub2cc topic diversity\\ub97c \\ucd5c\\uc801\\ud654\\ud568 (age=15 \\uc2e4\\uce21)\\n[RECENT DECISIONS] (BM25: top 7 of 220)\\n > [2026-04-27] fix: align all v1.5 schema references in upload + README (v0.2.7)\\n > [2026-03-30] docs(readme): add [fix] tag and control tags table to hook section\\n > [2026-03-30] live-infinite iter 26/\\u221e: success | goal_v2: fix ctx_loader\\u2192ctx_real_loader in integration doc + README +0.162\\u2192+0.163 con\\n > [2026-03-30] live-infinite iter 2/\\u221e: P3 README \\ubc30\\uc9c0 \\ucd94\\uac00\\n > [2026-03-30] feat: distribution prep \\u2014 pyproject.toml + README + HF Space + hooks/\\n > [2026-04-27] feat: extend auto-tune to G2-DOCS + startup badge + README tune docs\\n > [2026-03-30] live-infinite iter 8/\\u221e: README Key Findings \\u2014 add external codebase outperformance bullet\\n[G2-DOCS] (BM25+dense RRF relevant research docs)\\n > 20260427-ctx-plugin-distribution-research.md\\n **Date**: 2026-04-27 **Skill**: expert-research-v2\\n > 20260423-ctx-vs-claudemem-evaluation-rubric-v2-paper-tier.md\\n **Date**: 2026-04-23\\n > 20260427-ctx-flywheel-data-coverage.md\\n **Date**: 2026-04-27 **live-inf iter 67\\u201374**\\n > README.md\\n \\uc6d0\\ubcf8 [jaytoone/CTX](https://github.com/jaytoone/CTX) \\ub97c production-level \\ub85c \\ub9ac\\ud329\\ud1a0\\ub9c1/\\ubcf4\\uac15\\ud55c fork.\\n > 20260409-bm25-memory-generalization-research.md\\n **Date**: 2026-04-09 **Skill**: expert-research-v2\\n[G2-GREP] Files matching 'fix typo README' (grep):\\n 
src/hooks/_bm25/docs_search.py\\n benchmarks/eval/project_understanding_g1_eval.py\\n src/retrieval/adaptive_trigger.py\\n benchmarks/eval/expand_mab_templates.py\\n benchmarks/eval/retrieve_ctx_v2.py\\n Start with: src/hooks/_bm25/docs_search.py, benchmarks/eval/project_understanding_g1_eval.py, src/retrieval/adaptive_trigger.py\"}}", "expected_exit_code": 0, "stdout_normalize": null, "elapsed_ms_observed": 280, "determinism_verified": true, "determinism_note": "two consecutive runs produced identical stdout+exitcode"} +{"id": "empty_prompt_bm25path", "category": "empty_short", "stdin": {"prompt": "", "session_id": "test", "transcript_path": "/tmp/test-transcript"}, "argv": ["--rich"], "env": {"HOME": "/tmp/ctx_golden_home_bm25", "CLAUDE_PROJECT_DIR": "/Users/d9ng/privateProject/_research/_util/CTX", "CTX_DISABLE_SEMANTIC_RERANK": "1", "CTX_CROSS_ENCODER": "0", "CTX_TELEMETRY": "", "CTX_DASHBOARD_INTERNAL": "1"}, "python_bin": ".venv-golden/bin/python", "expected_stdout": "{\"hookSpecificOutput\": {\"hookEventName\": \"UserPromptSubmit\", \"additionalContext\": \"[SESSION NOTES (\\ubbf8\\ucee4\\ubc0b \\ud310\\ub2e8)]\\n> 2026-04-08 14:08: G1.5 \\uad6c\\ud604 \\uc644\\ub8cc: /g1 \\uc2a4\\ud0ac(\\uc218\\ub3d9) + [G1] \\ub9c8\\ucee4 \\uc790\\ub3d9\\ucea1\\ucc98 + git-memory.py SESSION NOTES \\uc8fc\\uc785. 
topic-dedup\\uc740 temporal retention\\uc774 \\uc544\\ub2cc topic diversity\\ub97c \\ucd5c\\uc801\\ud654\\ud568 (age=15 \\uc2e4\\uce21)\\n[KNOWN FACTS]\\n * MAB N=50 Wilson CI: none 0.00 [0.00,0.07] / ctx 0.40 [0.28,0.54] / ctx_v2 0.58 [\\n * LongMemEval (REAL) N=10: none=0.10 / ctx=0.10 / ctx_v2=0.30 / ctx_v3=0.30 / chro\\n * Final paper tables ready: MAB synthetic + MAB N=50+CI + LongMemEval real + PUAC \\n * PAPER HEADLINE: claudemem_faithful (LLM-summarize \\u2192 Chroma) = 0.80 on MAB N=10 v\\n * HEADLINE #2: MAB N=50 with Wilson CI \\u2014 ctx_v3 = 0.880 [0.762, 0.944] BEATS claud\\n * BUG FIXED 2026-04-25: retrieve_ctx_v3 + retrieve_claudemem_faithful used id(hays\\n * CORRECTED MAB N=50 Wilson CI: ctx 0.40 / ctx_v2 0.58 / chroma 0.78 / faithful 0.\\n * McNemar p-values N=50: ctx_v2 vs ctx p=0.049 SIGNIFICANT. ctx_v3 vs faithful p=0\"}}", "expected_exit_code": 0, "stdout_normalize": null, "elapsed_ms_observed": 93, "determinism_verified": true, "determinism_note": "two consecutive runs produced identical stdout+exitcode"} +{"id": "short_question_mark_bm25path", "category": "empty_short", "stdin": {"prompt": "?", "session_id": "test", "transcript_path": "/tmp/test-transcript"}, "argv": ["--rich"], "env": {"HOME": "/tmp/ctx_golden_home_bm25", "CLAUDE_PROJECT_DIR": "/Users/d9ng/privateProject/_research/_util/CTX", "CTX_DISABLE_SEMANTIC_RERANK": "1", "CTX_CROSS_ENCODER": "0", "CTX_TELEMETRY": "", "CTX_DASHBOARD_INTERNAL": "1"}, "python_bin": ".venv-golden/bin/python", "expected_stdout": "{\"hookSpecificOutput\": {\"hookEventName\": \"UserPromptSubmit\", \"additionalContext\": \"[SESSION NOTES (\\ubbf8\\ucee4\\ubc0b \\ud310\\ub2e8)]\\n> 2026-04-08 14:08: G1.5 \\uad6c\\ud604 \\uc644\\ub8cc: /g1 \\uc2a4\\ud0ac(\\uc218\\ub3d9) + [G1] \\ub9c8\\ucee4 \\uc790\\ub3d9\\ucea1\\ucc98 + git-memory.py SESSION NOTES \\uc8fc\\uc785. 
topic-dedup\\uc740 temporal retention\\uc774 \\uc544\\ub2cc topic diversity\\ub97c \\ucd5c\\uc801\\ud654\\ud568 (age=15 \\uc2e4\\uce21)\\n[KNOWN FACTS]\\n * MAB N=50 Wilson CI: none 0.00 [0.00,0.07] / ctx 0.40 [0.28,0.54] / ctx_v2 0.58 [\\n * LongMemEval (REAL) N=10: none=0.10 / ctx=0.10 / ctx_v2=0.30 / ctx_v3=0.30 / chro\\n * Final paper tables ready: MAB synthetic + MAB N=50+CI + LongMemEval real + PUAC \\n * PAPER HEADLINE: claudemem_faithful (LLM-summarize \\u2192 Chroma) = 0.80 on MAB N=10 v\\n * HEADLINE #2: MAB N=50 with Wilson CI \\u2014 ctx_v3 = 0.880 [0.762, 0.944] BEATS claud\\n * BUG FIXED 2026-04-25: retrieve_ctx_v3 + retrieve_claudemem_faithful used id(hays\\n * CORRECTED MAB N=50 Wilson CI: ctx 0.40 / ctx_v2 0.58 / chroma 0.78 / faithful 0.\\n * McNemar p-values N=50: ctx_v2 vs ctx p=0.049 SIGNIFICANT. ctx_v3 vs faithful p=0\"}}", "expected_exit_code": 0, "stdout_normalize": null, "elapsed_ms_observed": 161, "determinism_verified": true, "determinism_note": "two consecutive runs produced identical stdout+exitcode"} +{"id": "hooks_keyword_tokenizer_bm25path", "category": "hooks_keyword", "stdin": {"prompt": "bm25-memory.py 의 토큰화 부분 알려줘", "session_id": "test", "transcript_path": "/tmp/test-transcript"}, "argv": ["--rich"], "env": {"HOME": "/tmp/ctx_golden_home_bm25", "CLAUDE_PROJECT_DIR": "/Users/d9ng/privateProject/_research/_util/CTX", "CTX_DISABLE_SEMANTIC_RERANK": "1", "CTX_CROSS_ENCODER": "0", "CTX_TELEMETRY": "", "CTX_DASHBOARD_INTERNAL": "1"}, "python_bin": ".venv-golden/bin/python", "expected_stdout": "{\"hookSpecificOutput\": {\"hookEventName\": \"UserPromptSubmit\", \"additionalContext\": \"> **G1** (time memory): \\\"20260409 bm25-memory: G1+G2 BM25 hook (recall 0.169\\u21920.881)\\\" and 6 more\\n> **G2** (space search): `20260328-ctx-real-codebase-g2-eval.md`, `20260411-chat-memory-threshold-principled.md`, `20260329-ctx-hook-improvement-report.md` \\u2014 found via \\\"bm25 memory \\ud1a0\\ud070\\ud654\\\"\\n\\n[SESSION NOTES 
(\\ubbf8\\ucee4\\ubc0b \\ud310\\ub2e8)]\\n> 2026-04-08 14:08: G1.5 \\uad6c\\ud604 \\uc644\\ub8cc: /g1 \\uc2a4\\ud0ac(\\uc218\\ub3d9) + [G1] \\ub9c8\\ucee4 \\uc790\\ub3d9\\ucea1\\ucc98 + git-memory.py SESSION NOTES \\uc8fc\\uc785. topic-dedup\\uc740 temporal retention\\uc774 \\uc544\\ub2cc topic diversity\\ub97c \\ucd5c\\uc801\\ud654\\ud568 (age=15 \\uc2e4\\uce21)\\n[RECENT DECISIONS] (BM25: top 7 of 220)\\n > [2026-04-09] 20260409 bm25-memory: G1+G2 BM25 hook (recall 0.169\\u21920.881)\\n > [2026-04-09] 20260409 PageIndex + BM25 docs eval: G1 long-term memory via research doc retrieval\\n > [2026-04-10] 20260410 1026 G1 \\uacf5\\uc815 \\ud3c9\\uac00: BM25 \\uad6c\\uc870\\uc801 \\ud3b8\\ud5a5 \\uc815\\ub7c9\\ud654 (\\ud3b8\\ud5a5=0.373, \\uacf5\\uc815 Recall@7=0.634) - g1_fair_eval.py: 59 Type1 paraphrase + 12 Ty\\n > [2026-04-09] 20260409 1751 bm25-memory: G2-DOCS threshold fix + G1 recall \\uac80\\uc99d \\uc644\\ub8cc - BM25 threshold 0\\u21923.0: \\ud55c\\uad6d\\uc5b4 \\ud1a0\\ud070 \\uac70\\uc9d3\\uc591\\uc131 \\uc81c\\uac70 + G2-DOCS \\n > [2026-05-05] chore(refactor): add BM25-path golden fixtures for bm25-memory\\n > [2026-05-05] chore(refactor): capture golden fixtures for bm25-memory pre-decomposition\\n > [2026-04-08] 20260408 \\uc6d0\\ub798 \\uc758\\ub3c4 gap analysis: Downstream \\u03b4 \\ubbf8\\uc218\\ud589, Format ablation \\ubbf8\\uc218\\ud589\\n[G2-DOCS] (BM25+dense RRF relevant research docs)\\n > 20260328-ctx-real-codebase-g2-eval.md\\n **Date**: 2026-03-28 **Type**: Real Codebase Ablation (G2 only) **Backend**: MiniMax M2.5\\n > 20260411-chat-memory-threshold-principled.md\\n **Date**: 2026-04-11 **Type**: Empirical analysis **Scope**: project (CTX)\\n > 20260329-ctx-hook-improvement-report.md\\n **\\ub0a0\\uc9dc**: 2026-03-29\\n > 20260411-auto-index-necessity-analysis.md\\n **Date**: 2026-04-11 **Skill**: expert-research-v2\\n > 20260407-g1g2-established-benchmarks.md\\n **Date**: 2026-04-07 **Skill**: expert-research-v2\\n[G2-GREP] Files 
matching 'bm25 memory \\ud1a0\\ud070\\ud654' (grep):\\n src/retrieval/adaptive_trigger.py\\n benchmarks/eval/aggregated_stat_report.py\\n benchmarks/eval/g2_docs_ab_test.py\\n benchmarks/eval/g1_docs_bm25_eval.py\\n src/hooks/chat-memory.py\\n Start with: src/retrieval/adaptive_trigger.py, benchmarks/eval/aggregated_stat_report.py, benchmarks/eval/g2_docs_ab_test.py\\n[KNOWN FACTS]\\n * MAB N=50 Wilson CI: none 0.00 [0.00,0.07] / ctx 0.40 [0.28,0.54] / ctx_v2 0.58 [\\n * LongMemEval (REAL) N=10: none=0.10 / ctx=0.10 / ctx_v2=0.30 / ctx_v3=0.30 / chro\\n * Final paper tables ready: MAB synthetic + MAB N=50+CI + LongMemEval real + PUAC \\n * PAPER HEADLINE: claudemem_faithful (LLM-summarize \\u2192 Chroma) = 0.80 on MAB N=10 v\\n * HEADLINE #2: MAB N=50 with Wilson CI \\u2014 ctx_v3 = 0.880 [0.762, 0.944] BEATS claud\\n * BUG FIXED 2026-04-25: retrieve_ctx_v3 + retrieve_claudemem_faithful used id(hays\\n * CORRECTED MAB N=50 Wilson CI: ctx 0.40 / ctx_v2 0.58 / chroma 0.78 / faithful 0.\\n * McNemar p-values N=50: ctx_v2 vs ctx p=0.049 SIGNIFICANT. 
ctx_v3 vs faithful p=0\"}}", "expected_exit_code": 0, "stdout_normalize": null, "elapsed_ms_observed": 227, "determinism_verified": true, "determinism_note": "two consecutive runs produced identical stdout+exitcode"} diff --git a/tests/golden/bm25_path_corpus_frozen.json b/tests/golden/bm25_path_corpus_frozen.json new file mode 100644 index 0000000..12ea04b --- /dev/null +++ b/tests/golden/bm25_path_corpus_frozen.json @@ -0,0 +1 @@ +{"corpus": [{"hash": "b398ee8b2871a88bf92af713df6b37ba2ce08ea1", "subject": "chore(refactor): refresh BM25-path fixtures after golden commit added to G1 corpus", "date": "2026-05-05", "text": "2026-05-05 chore(refactor): refresh BM25-path fixtures after golden commit added to G1 corpus"}, {"hash": "b36fc8da52e7b411e9aadac585e71355b15736ae", "subject": "chore(refactor): add BM25-path golden fixtures for bm25-memory", "date": "2026-05-05", "text": "2026-05-05 chore(refactor): add BM25-path golden fixtures for bm25-memory"}, {"hash": "b24fb26b626f5e619d8e292aa36af65e87be1b2c", "subject": "chore(refactor): capture golden fixtures for bm25-memory pre-decomposition", "date": "2026-05-05", "text": "2026-05-05 chore(refactor): capture golden fixtures for bm25-memory pre-decomposition"}, {"hash": "201c8101adec32d6800c5f1e56adb08e721274b3", "subject": "fix: replace broken iter5-full.png with actual dashboard screenshot", "date": "2026-05-04", "text": "2026-05-04 fix: replace broken iter5-full.png with actual dashboard screenshot"}, {"hash": "30ede7739b390cf9dc82e710007486eff376be1b", "subject": "fix: HuggingFace Space URL jaytoone→Be2Jay", "date": "2026-05-04", "text": "2026-05-04 fix: HuggingFace Space URL jaytoone→Be2Jay"}, {"hash": "f7d5cb7a37a18ab5b5334e8452865fd49ea414d3", "subject": "fix: bge-daemon opt-in via CTX_BGE_ENABLE=1 + bump 0.3.13", "date": "2026-05-04", "text": "2026-05-04 fix: bge-daemon opt-in via CTX_BGE_ENABLE=1 + bump 0.3.13"}, {"hash": "39c19033e5d0920e42613bca019cdf16a3100706", "subject": "fix: vec-daemon venv isolation in 
plugin Setup + SessionStart + bump 0.3.12", "date": "2026-05-04", "text": "2026-05-04 fix: vec-daemon venv isolation in plugin Setup + SessionStart + bump 0.3.12"}, {"hash": "de7b0360e8cf106adebb269b4afecfc41b256699", "subject": "feat: chat-memory vault exclusion + bump 0.3.10 → 0.3.11", "date": "2026-05-02", "text": "2026-05-02 feat: chat-memory vault exclusion + bump 0.3.10 → 0.3.11"}, {"hash": "a128976f83d36a4571ef02e6ff33fc3ce24c6ed6", "subject": "fix: smoke test false negative on fresh install + bump 0.3.9 → 0.3.10", "date": "2026-05-02", "text": "2026-05-02 fix: smoke test false negative on fresh install + bump 0.3.9 → 0.3.10"}, {"hash": "5baf71b1ce26f59fda2f58a132c3279e09e5c39d", "subject": "fix: settings_patcher + plugin structure + bump 0.3.8 → 0.3.9", "date": "2026-05-01", "text": "2026-05-01 fix: settings_patcher + plugin structure + bump 0.3.8 → 0.3.9"}, {"hash": "27b5fae94bae2ba57eb5671ed054a67ef648ee4d", "subject": "feat: add live telemetry snapshots for HF Space dashboard", "date": "2026-05-01", "text": "2026-05-01 feat: add live telemetry snapshots for HF Space dashboard"}, {"hash": "12f7689ea353d7626de6c901fc536997fdfdac69", "subject": "bump: version 0.3.7 → 0.3.8 (install fixes)", "date": "2026-04-28", "text": "2026-04-28 bump: version 0.3.7 → 0.3.8 (install fixes)"}, {"hash": "1439017d1622423b3c4c8e0b842a8cd6667d7bc3", "subject": "bump: version 0.3.6 → 0.3.7 (telemetry integrity fixes)", "date": "2026-04-27", "text": "2026-04-27 bump: version 0.3.6 → 0.3.7 (telemetry integrity fixes)"}, {"hash": "0ec560dc601ad2270b48c38e4b9a8c6104e1d849", "subject": "fix: telemetry schema integrity — 3 data quality bugs", "date": "2026-04-27", "text": "2026-04-27 fix: telemetry schema integrity — 3 data quality bugs"}, {"hash": "e7b07dff8bcc0a31003c0ba3166c81cb5e99a2f7", "subject": "fix: G2-DOCS candidates=None → actual corpus size (retrieval coverage gap)", "date": "2026-04-27", "text": "2026-04-27 fix: G2-DOCS candidates=None → actual corpus size (retrieval 
coverage gap)"}, {"hash": "c55dd7ec815c8950a6842334ffa603b954210583", "subject": "chore: untrack .omc/decision_corpus.json (runtime cache in .gitignore)", "date": "2026-04-27", "text": "2026-04-27 chore: untrack .omc/decision_corpus.json (runtime cache in .gitignore)"}, {"hash": "92e8ee7e6fef478034c413aa20a016f86207d286", "subject": "feat: add data-driven monetization trigger table to capstone (iter 82)", "date": "2026-04-27", "text": "2026-04-27 feat: add data-driven monetization trigger table to capstone (iter 82)"}, {"hash": "f4e5820940525dcb50816371e142f49a8f543281", "subject": "fix: classify query_type from transcript when block_meta missing (iter 81)", "date": "2026-04-27", "text": "2026-04-27 fix: classify query_type from transcript when block_meta missing (iter 81)"}, {"hash": "bc5589ff1ed7410edc4509dc21d06a1b93d15847", "subject": "feat: add live telemetry snapshot to data asset capstone (iter 80)", "date": "2026-04-27", "text": "2026-04-27 feat: add live telemetry snapshot to data asset capstone (iter 80)"}, {"hash": "3cdd863005f1657ccf33f65cc238ac66011b8262", "subject": "feat: add pricing model implications to data asset capstone (iter 79)", "date": "2026-04-27", "text": "2026-04-27 feat: add pricing model implications to data asset capstone (iter 79)"}, {"hash": "6460cc38d99cf4647484dd16c05700cff3a64105", "subject": "feat: pre-monetization data asset capstone doc + bump v0.3.6", "date": "2026-04-27", "text": "2026-04-27 feat: pre-monetization data asset capstone doc + bump v0.3.6"}, {"hash": "1e7809b0dcfd20b2187bb407bcc95f2415cc3e75", "subject": "feat: cited_node_types resolved via arch equivalence + node_type utility cross-tab (v0.3.5)", "date": "2026-04-27", "text": "2026-04-27 feat: cited_node_types resolved via arch equivalence + node_type utility cross-tab (v0.3.5)"}, {"hash": "a1934734e732630eb24d7811524ca4e624175dcb", "subject": "feat: schema v1.6 — node_type_dist (retrieval_event) + node_type_hist (session_aggregate)", "date": "2026-04-27", 
"text": "2026-04-27 feat: schema v1.6 — node_type_dist (retrieval_event) + node_type_hist (session_aggregate)"}, {"hash": "bff5ce5219763f779ba35b4a6978bbc584af3814", "subject": "feat: Stage 3 local loop — profile-aware retrieval + auto-tune badge (v0.3.3)", "date": "2026-04-27", "text": "2026-04-27 feat: Stage 3 local loop — profile-aware retrieval + auto-tune badge (v0.3.3)"}, {"hash": "f9895114bae3059d15dad81546fe5086a290fea3", "subject": "feat: ctx-telemetry cluster — local project type fingerprint (Stage 3 prerequisite)", "date": "2026-04-27", "text": "2026-04-27 feat: ctx-telemetry cluster — local project type fingerprint (Stage 3 prerequisite)"}, {"hash": "ac637dc01d885fcaf23d62e66afc9d11f6993c3a", "subject": "docs: telemetry-impl doc completeness — iter-68 session fields + header v0.3.1", "date": "2026-04-27", "text": "2026-04-27 docs: telemetry-impl doc completeness — iter-68 session fields + header v0.3.1"}, {"hash": "866a7242601ea2bc086b7a25d7b7a4a35de817c6", "subject": "feat: cmd_summary flywheel health line — causal r + upgrade hint (v0.3.0)", "date": "2026-04-27", "text": "2026-04-27 feat: cmd_summary flywheel health line — causal r + upgrade hint (v0.3.0)"}, {"hash": "b144d078da15e1663073f90fa88abff321bdb316", "subject": "feat: cross-session causal analysis in cmd_tune + calibrate (v0.2.9)", "date": "2026-04-27", "text": "2026-04-27 feat: cross-session causal analysis in cmd_tune + calibrate (v0.2.9)"}, {"hash": "f722b3b3a79f488ff6f8b4e5a8f5cde119af05b7", "subject": "feat: session_aggregate v1.5 — mean_top_score_bm25 + query_type_hist (v0.2.8)", "date": "2026-04-27", "text": "2026-04-27 feat: session_aggregate v1.5 — mean_top_score_bm25 + query_type_hist (v0.2.8)"}, {"hash": "c4b1326fd53e68326277593dce91b920a04d6888", "subject": "fix: align all v1.5 schema references in upload + README (v0.2.7)", "date": "2026-04-27", "text": "2026-04-27 fix: align all v1.5 schema references in upload + README (v0.2.7)"}, {"hash": 
"aa7386f369dc6620926eb166171ec7ad8256e195", "subject": "feat: extend top_score capture to G2-DOCS + causal r in cmd_tune (v0.2.6)", "date": "2026-04-27", "text": "2026-04-27 feat: extend top_score capture to G2-DOCS + causal r in cmd_tune (v0.2.6)"}, {"hash": "9d550c9ecd16a397aafcc0f5bae49f23bc1115ee", "subject": "feat: schema v1.5 — top_score_bm25 + top_score_dense causal calibration", "date": "2026-04-27", "text": "2026-04-27 feat: schema v1.5 — top_score_bm25 + top_score_dense causal calibration"}, {"hash": "2c8c7304c184277fcdefc7b2e176ddc7242550e2", "subject": "feat: Stage 2 upload pipeline client (k-anonymized session_aggregate POST)", "date": "2026-04-27", "text": "2026-04-27 feat: Stage 2 upload pipeline client (k-anonymized session_aggregate POST)"}, {"hash": "90aee4a28b69e612aaecb7ea3833f16b50fcbb5c", "subject": "feat: extend auto-tune to G2-DOCS + startup badge + README tune docs", "date": "2026-04-27", "text": "2026-04-27 feat: extend auto-tune to G2-DOCS + startup badge + README tune docs"}, {"hash": "9cc101e83f679bfe299a16d08dba4722850482db", "subject": "feat: flywheel turns — auto-tune BM25 params from local telemetry", "date": "2026-04-27", "text": "2026-04-27 feat: flywheel turns — auto-tune BM25 params from local telemetry"}, {"hash": "02ee15846365f8c18a383a954b87f24e8b043f4a", "subject": "feat: schema v1.4 — consent command + vault_entry_count + index_staleness_hours", "date": "2026-04-27", "text": "2026-04-27 feat: schema v1.4 — consent command + vault_entry_count + index_staleness_hours"}, {"hash": "6ca6ed49233f8faa6651d320ec75d6bc6da69f5d", "subject": "feat: schema v1.3 — user_id hash in retrieval_event + session_aggregate", "date": "2026-04-27", "text": "2026-04-27 feat: schema v1.3 — user_id hash in retrieval_event + session_aggregate"}, {"hash": "468be30f7aea995c7e9b724fdaed6a0a5ff03bdb", "subject": "feat: schema v1.2 + calibrate subcommand (citation bias detection)", "date": "2026-04-27", "text": "2026-04-27 feat: schema v1.2 + calibrate 
subcommand (citation bias detection)"}, {"hash": "e57d4ad7a813ab128976e9b52460370e3fb7b993", "subject": "feat: schema v1.1 — add query_type to retrieval_event (flywheel moat metric)", "date": "2026-04-27", "text": "2026-04-27 feat: schema v1.1 — add query_type to retrieval_event (flywheel moat metric)"}, {"hash": "5b9daae8c0ed56877d2710429ac0e550f03c8714", "subject": "feat: daemon degradation warnings + dependency guard", "date": "2026-04-27", "text": "2026-04-27 feat: daemon degradation warnings + dependency guard"}, {"hash": "e35caa7d2576ce9c002713cbdf6bb006543d2f4e", "subject": "feat: auto-start bge-daemon on SessionStart (alongside vec-daemon)", "date": "2026-04-27", "text": "2026-04-27 feat: auto-start bge-daemon on SessionStart (alongside vec-daemon)"}, {"hash": "df91729feaec4337331b2f491354361427f1752d", "subject": "feat: ship vec-daemon + bge-daemon in wheel + feature comparison DOCX", "date": "2026-04-27", "text": "2026-04-27 feat: ship vec-daemon + bge-daemon in wheel + feature comparison DOCX"}, {"hash": "f6997b2b32b67c2b40d9753657d44affd6b29495", "subject": "fix: hooks.json use $HOME/.claude/hooks/ not ${CLAUDE_PLUGIN_ROOT}/hooks/", "date": "2026-04-27", "text": "2026-04-27 fix: hooks.json use $HOME/.claude/hooks/ not ${CLAUDE_PLUGIN_ROOT}/hooks/"}, {"hash": "88ceff8cb09c681049dcf756ce68194cd31ade86", "subject": "fix: plugin marketplace schema + directory structure", "date": "2026-04-27", "text": "2026-04-27 fix: plugin marketplace schema + directory structure"}, {"hash": "11920f303f4ad33c04030ba10e54007d0ba277fb", "subject": "feat: ship hook files in wheel + auto-copy on ctx-install", "date": "2026-04-27", "text": "2026-04-27 feat: ship hook files in wheel + auto-copy on ctx-install"}, {"hash": "a70a0d6b575ebbabc27c16f01287b784db78f17b", "subject": "chore: rollback checkpoint — pre-dashboard-update-spec impl (iter 49 start state)", "date": "2026-04-27", "text": "2026-04-27 chore: rollback checkpoint — pre-dashboard-update-spec impl (iter 49 start 
state)"}, {"hash": "e704a1c4de4cd1ab14c3c7b0ad9c93fcb6e4b066", "subject": "live iter 34/∞: DOC_INDEX + G1 hybrid RRF research doc (post-human-loop update)", "date": "2026-04-26", "text": "2026-04-26 live iter 34/∞: DOC_INDEX + G1 hybrid RRF research doc (post-human-loop update)"}, {"hash": "13ef41287c4551c28dc60553005a23a2f93f4542", "subject": "live iter 34/∞: G1 hybrid BM25+dense-RRF — 0.966→0.983 Recall@7 (+1.7pp)", "date": "2026-04-26", "text": "2026-04-26 live iter 34/∞: G1 hybrid BM25+dense-RRF — 0.966→0.983 Recall@7 (+1.7pp)"}, {"hash": "a13e563453375c1235063fcba89f4584f27e5390", "subject": "live-inf USER_STOPPED state written (14 iters)", "date": "2026-04-17", "text": "2026-04-17 live-inf USER_STOPPED state written (14 iters)"}, {"hash": "d29692255aec188679c3bbd3430262085b57c6f2", "subject": "live-inf USER_STOPPED after 14 iters | semantic access restored + upgraded across CM/G1/G2", "date": "2026-04-17", "text": "2026-04-17 live-inf USER_STOPPED after 14 iters | semantic access restored + upgraded across CM/G1/G2"}, {"hash": "fdd26fca253ed0ab8d4e69220cef37ffe21f3744", "subject": "eval: g1_fair_eval.py 캐시 레이어 추가 — 반복 실행 분산 제거", "date": "2026-04-12", "text": "2026-04-12 eval: g1_fair_eval.py 캐시 레이어 추가 — 반복 실행 분산 제거"}, {"hash": "83e54f3c0bb6314a11f1e274cc535c2c7c8e9f4d", "subject": "live-inf state update: iter4 complete — G1 0.746, synonym+temporal+G2a shipped", "date": "2026-04-11", "text": "2026-04-11 live-inf state update: iter4 complete — G1 0.746, synonym+temporal+G2a shipped"}, {"hash": "7a58e399ec30a0a88c8125813ad2ca2c3bfe9269", "subject": "docs: 20260411 g1 arch improvements Obsidian wikilink sync", "date": "2026-04-11", "text": "2026-04-11 docs: 20260411 g1 arch improvements Obsidian wikilink sync"}, {"hash": "517a21d41c2ddfa98de61111ca1bd09373108827", "subject": "live iter 1/5: hook 아키텍처 시간/공간 기억 상한선 실증 측정 완료", "date": "2026-04-11", "text": "2026-04-11 live iter 1/5: hook 아키텍처 시간/공간 기억 상한선 실증 측정 완료\n2026-04-11 live iter 1/5: hook 아키텍처 시간/공간 기억 상한선 실증 
측정 완료"}, {"hash": "223a2cc2dcf16341b3f41361ba1ea811a92790a7", "subject": "20260411 G2a fulldoc: A/B test → bm25_chunked→fulldoc 교체 (Recall@5 +9.1%)", "date": "2026-04-11", "text": "2026-04-11 20260411 G2a fulldoc: A/B test → bm25_chunked→fulldoc 교체 (Recall@5 +9.1%)"}, {"hash": "5033ff17f0db294486b40f492405e704f915bf46", "subject": "docs: Obsidian sync links 자동 추가 (20260410 fair eval 관련)", "date": "2026-04-10", "text": "2026-04-10 docs: Obsidian sync links 자동 추가 (20260410 fair eval 관련)"}, {"hash": "2935307ca5602ca9fa11313af23f54273f28f43a", "subject": "20260410 1026 G1 공정 평가: BM25 구조적 편향 정량화 (편향=0.373, 공정 Recall@7=0.634) - g1_fair_eval.py: 59 Type1 paraphrase + 12 Ty", "date": "2026-04-10", "text": "2026-04-10 20260410 1026 G1 공정 평가: BM25 구조적 편향 정량화 (편향=0.373, 공정 Recall@7=0.634) - g1_fair_eval.py: 59 Type1 paraphrase + 12 Ty"}, {"hash": "d90b36ed72a03438382c369acbbb50db7063e69f", "subject": "20260409 1751 bm25-memory: G2-DOCS threshold fix + G1 recall 검증 완료 - BM25 threshold 0→3.0: 한국어 토큰 거짓양성 제거 + G2-DOCS ", "date": "2026-04-09", "text": "2026-04-09 20260409 1751 bm25-memory: G2-DOCS threshold fix + G1 recall 검증 완료 - BM25 threshold 0→3.0: 한국어 토큰 거짓양성 제거 + G2-DOCS \n2026-04-09 20260409 1751 bm25-memory: G2-DOCS threshold fix + G1 recall 검증 완료 - BM25 threshold 0→3.0: 한국어 토큰 거짓양성 제거 + G2-DOCS "}, {"hash": "db72e3851bc185b0db615024134c88353e59bbf1", "subject": "20260409 bm25-memory: G1+G2 BM25 hook (recall 0.169→0.881)", "date": "2026-04-09", "text": "2026-04-09 20260409 bm25-memory: G1+G2 BM25 hook (recall 0.169→0.881)"}, {"hash": "4dcbdd9f3fea9db0ba79378000573a691e2c5e45", "subject": "20260409 FinanceBench: BM25 vs PageIndex fair comparison (n=20, real HF data)", "date": "2026-04-09", "text": "2026-04-09 20260409 FinanceBench: BM25 vs PageIndex fair comparison (n=20, real HF data)"}, {"hash": "9e5baa598a2e406eb56a87ef5efff81d7bed2080", "subject": "20260409 PageIndex + BM25 docs eval: G1 long-term memory via research doc retrieval", "date": "2026-04-09", "text": 
"2026-04-09 20260409 PageIndex + BM25 docs eval: G1 long-term memory via research doc retrieval"}, {"hash": "33a992a5e1e7e9569ca8b51eb4ace424ece123b1", "subject": "20260409 G1 long-term memory: full eval + SOTA comparison (7 baselines, 59 QA pairs)", "date": "2026-04-09", "text": "2026-04-09 20260409 G1 long-term memory: full eval + SOTA comparison (7 baselines, 59 QA pairs)"}, {"hash": "b2a9bf34c4773832496e95d4057c966fed882f28", "subject": "20260408 G1 temporal retention: age-based recall decay curve implemented + measured", "date": "2026-04-08", "text": "2026-04-08 20260408 G1 temporal retention: age-based recall decay curve implemented + measured"}, {"hash": "4a8507e72f0f0f81a860d2fe96dd0f698d7192dc", "subject": "20260408 G1 format ablation: 5포맷 downstream δ 실측 완료", "date": "2026-04-08", "text": "2026-04-08 20260408 G1 format ablation: 5포맷 downstream δ 실측 완료\n2026-04-08 20260408 G1 format ablation: 5포맷 downstream δ 실측 완료"}, {"hash": "fdb182e1adef4786a666fbc70f61227260f68483", "subject": "20260408 원래 의도 gap analysis: Downstream δ 미수행, Format ablation 미수행", "date": "2026-04-08", "text": "2026-04-08 20260408 원래 의도 gap analysis: Downstream δ 미수행, Format ablation 미수행"}, {"hash": "f3e39ba7a4dbed6007aeda88afa65e4027fc2dcc", "subject": "20260407 G1 noise filter + topic-dedup: NoiseRatio 50%→0%, TopicCov 73%→79%", "date": "2026-04-07", "text": "2026-04-07 20260407 G1 noise filter + topic-dedup: NoiseRatio 50%→0%, TopicCov 73%→79%"}, {"hash": "c707021892eeb123208a944e39971c0058828c50", "subject": "20260407 git-memory: universal decision detection (feat:/fix:/v-version patterns)", "date": "2026-04-07", "text": "2026-04-07 20260407 git-memory: universal decision detection (feat:/fix:/v-version patterns)"}, {"hash": "acd6096e44d1daedfb2b22d3e54619ceb32b3634", "subject": "20260407 G1 temporal eval results: Staleness 35.7%, Conflict 0% (4 projects)", "date": "2026-04-07", "text": "2026-04-07 20260407 G1 temporal eval results: Staleness 35.7%, Conflict 0% (4 projects)"}, 
{"hash": "d8f2de1b2f1d369315eb66d1c1c6e902eed8d4a4", "subject": "20260407 G1 temporal eval: Staleness Flag + Conflict Detection implemented", "date": "2026-04-07", "text": "2026-04-07 20260407 G1 temporal eval: Staleness Flag + Conflict Detection implemented"}, {"hash": "01dd8c0572d917cb5c5ce098ff736c77c1e62468", "subject": "20260405 G2 prefetch benchmark: 30% -> 65% after ko-en mapping + filepath search", "date": "2026-04-06", "text": "2026-04-06 20260405 G2 prefetch benchmark: 30% -> 65% after ko-en mapping + filepath search"}, {"hash": "ebd429f11ae3671fed62497373f94f660ac049bf", "subject": "20260405 Old CTX remnants fully removed", "date": "2026-04-06", "text": "2026-04-06 20260405 Old CTX remnants fully removed"}, {"hash": "a84b91e01120968ae368955e1e9539294d232412", "subject": "20260405 New CTX: git-memory + g2-augment + auto-index (old CTX retired)", "date": "2026-04-05", "text": "2026-04-05 20260405 New CTX: git-memory + g2-augment + auto-index (old CTX retired)"}, {"hash": "e06a15f935f07f8f661b5c0d7a7dcb683fb17fa6", "subject": "20260404 inject_decisions.py: git-only mode (no world-model dependency)", "date": "2026-04-04", "text": "2026-04-04 20260404 inject_decisions.py: git-only mode (no world-model dependency)"}, {"hash": "10519bd3207050281dd30a8b68293e31fd4cbe0f", "subject": "20260404 G1 git-log hook test results: 95% recall across 3 projects", "date": "2026-04-04", "text": "2026-04-04 20260404 G1 git-log hook test results: 95% recall across 3 projects"}, {"hash": "8a3f0ac3f1751804554e99a69645e845e9497c5c", "subject": "20260404 inject_decisions.py: git log primary, world-model secondary", "date": "2026-04-04", "text": "2026-04-04 20260404 inject_decisions.py: git log primary, world-model secondary"}, {"hash": "66ac72537a5d591be6ad9c6d6341a2e7dc25252a", "subject": "20260403 command hook + additionalContext for SOTA G1 activation", "date": "2026-04-03", "text": "2026-04-03 20260403 command hook + additionalContext for SOTA G1 activation"}, {"hash": 
"33dba2e9b476f627fbbcd6d48dc3f71ba05b116a", "subject": "20260403 G1/G2 measurement complete — standard benchmarks applied", "date": "2026-04-03", "text": "2026-04-03 20260403 G1/G2 measurement complete — standard benchmarks applied"}, {"hash": "ef4c7ef1b65586085f83cc8443c84b381bc23592", "subject": "20260403 COIR full corpus: BM25 Hit@5=0.640 on 280K docs", "date": "2026-04-03", "text": "2026-04-03 20260403 COIR full corpus: BM25 Hit@5=0.640 on 280K docs"}, {"hash": "9279b5690468180356331d467528d8caa8bb5144", "subject": "20260403 COIR standard benchmark: BM25 Hit@5=0.780 on CodeSearchNet Python (24.9K corpus)", "date": "2026-04-03", "text": "2026-04-03 20260403 COIR standard benchmark: BM25 Hit@5=0.780 on CodeSearchNet Python (24.9K corpus)"}, {"hash": "9e082f3d3695ff85f386424aaa6b5c8a0c9e6be5", "subject": "20260403 SOTA eval complete: G1 recall 90% + G2 complementary analysis", "date": "2026-04-03", "text": "2026-04-03 20260403 SOTA eval complete: G1 recall 90% + G2 complementary analysis"}, {"hash": "a989d84824237147a50d8b9493917bd31b818a5c", "subject": "20260403 G1/G2 definition clarified in CLAUDE.md", "date": "2026-04-03", "text": "2026-04-03 20260403 G1/G2 definition clarified in CLAUDE.md"}, {"hash": "4db7a3c57d35bd2a913023b9d44744e50af86ed6", "subject": "20260403 live-inf CONVERGED: G1 trigger→surfacing breakthrough achieved", "date": "2026-04-03", "text": "2026-04-03 20260403 live-inf CONVERGED: G1 trigger→surfacing breakthrough achieved\n2026-04-03 20260403 live-inf CONVERGED: G1 trigger→surfacing breakthrough achieved"}, {"hash": "783ac1a29deb236372ce38d947e73fee6b9e6f8c", "subject": "20260403 live-inf iter 4/∞: IMPLICIT_CONTEXT exempted from doc filtering", "date": "2026-04-03", "text": "2026-04-03 20260403 live-inf iter 4/∞: IMPLICIT_CONTEXT exempted from doc filtering"}, {"hash": "90cf49a0a3ff3d46ebdc28dc7e2543fcaa5a305c", "subject": "20260403 live-inf iter 2/∞: miss analysis complete — 7/30 structural BM25 limits", "date": "2026-04-03", "text": 
"2026-04-03 20260403 live-inf iter 2/∞: miss analysis complete — 7/30 structural BM25 limits"}, {"hash": "f6c9bd8ded7967c40091bebddb18c0bf4bcff538", "subject": "20260403 live-inf iter 1/∞: G1 BREAKTHROUGH — trigger→surfacing eval + doc-fallback fix", "date": "2026-04-03", "text": "2026-04-03 20260403 live-inf iter 1/∞: G1 BREAKTHROUGH — trigger→surfacing eval + doc-fallback fix"}, {"hash": "dd7f602cc25452878a94a94979000da0022716b9", "subject": "20260403 live-inf CONVERGED: plateau 7 iterations, escape attempts exhausted", "date": "2026-04-03", "text": "2026-04-03 20260403 live-inf CONVERGED: plateau 7 iterations, escape attempts exhausted\n2026-04-03 20260403 live-inf CONVERGED: plateau 7 iterations, escape attempts exhausted"}, {"hash": "429f257a3420ec07a7b0196e58b75546c652b015", "subject": "20260403 live-inf iter 8/∞: G1 3-run average stabilized — delta=+0.270 (±0.074)", "date": "2026-04-03", "text": "2026-04-03 20260403 live-inf iter 8/∞: G1 3-run average stabilized — delta=+0.270 (±0.074)"}, {"hash": "839f26fcc1b85f320bfec709cdea1f098f4e920f", "subject": "20260403 live-inf iter 7/∞: H03 ctx_query improved + History 0.55→0.661", "date": "2026-04-03", "text": "2026-04-03 20260403 live-inf iter 7/∞: H03 ctx_query improved + History 0.55→0.661"}, {"hash": "c6950b7904c4b9f82929bf3f6b415b211fc54f20", "subject": "20260403 live-inf iter 6/∞: docs fully separated from BM25 corpus + file_paths", "date": "2026-04-03", "text": "2026-04-03 20260403 live-inf iter 6/∞: docs fully separated from BM25 corpus + file_paths"}, {"hash": "85ccac041db25096cd5adc70e483d22234a5e09e", "subject": "20260403 live-inf iter 5/∞: docs excluded from BM25 corpus + compact summary reverted", "date": "2026-04-03", "text": "2026-04-03 20260403 live-inf iter 5/∞: docs excluded from BM25 corpus + compact summary reverted"}, {"hash": "62e6db02038d770e8e4f1ee57f99c382965f6eae", "subject": "20260402 2100 CTX repositioned as context bootstrapper + G1/G2 redefined", "date": "2026-04-02", "text": 
"2026-04-02 20260402 2100 CTX repositioned as context bootstrapper + G1/G2 redefined"}, {"hash": "1e84ef92baa79a313f172548acfed1b4c0d0afca", "subject": "20260402 2040 live iter 2/5: hybrid scoring (50% judge + 50% keyword) — CTX vs None delta=+0.300", "date": "2026-04-02", "text": "2026-04-02 20260402 2040 live iter 2/5: hybrid scoring (50% judge + 50% keyword) — CTX vs None delta=+0.300"}, {"hash": "1c3bdf4a5eae01b8756ec5469625734814a83ad5", "subject": "20260402 2030 live iter 1/5: ctx_query optimization + key doc boost (README/CLAUDE +1.0)", "date": "2026-04-02", "text": "2026-04-02 20260402 2030 live iter 1/5: ctx_query optimization + key doc boost (README/CLAUDE +1.0)"}, {"hash": "471065f8e564717f7a9e59e8127ce976ca1cc632", "subject": "20260402 2015 live-inf: save state for next session", "date": "2026-04-02", "text": "2026-04-02 20260402 2015 live-inf: save state for next session"}, {"hash": "453ce8f5cca7b8e09c0838ef2021723b546f105b", "subject": "20260402 2010 live-inf iter 2/∞: .md doc indexing + doc-priority boost for high-level queries", "date": "2026-04-02", "text": "2026-04-02 20260402 2010 live-inf iter 2/∞: .md doc indexing + doc-priority boost for high-level queries"}, {"hash": "0afe878da3cb6b2df9130d491cb01114cc5bcb56", "subject": "20260402 1935 live-inf: save state for context rotation", "date": "2026-04-02", "text": "2026-04-02 20260402 1935 live-inf: save state for context rotation"}, {"hash": "22f3137796c9da3a696cd7a7a7277f8ed13758d7", "subject": "20260402 1930 live-inf iter 1/∞: G1 eval rebuilt — LLM-as-judge + 3-arm + random baseline", "date": "2026-04-02", "text": "2026-04-02 20260402 1930 live-inf iter 1/∞: G1 eval rebuilt — LLM-as-judge + 3-arm + random baseline"}, {"hash": "e9b909686e70b786f6c6bd82d50b841ff726e884", "subject": "20260402 1900 live-inf iter 3/∞: G1=0.705 achieved | target 0.70 reached", "date": "2026-04-02", "text": "2026-04-02 20260402 1900 live-inf iter 3/∞: G1=0.705 achieved | target 0.70 reached"}, {"hash": 
"bc80c5e22bf74aaac1a092fa24d7c970b9e4ef87", "subject": "20260402 1850 live-inf iter 2/∞: G1 eval redesign — curated questions, language-independent keywords", "date": "2026-04-02", "text": "2026-04-02 20260402 1850 live-inf iter 2/∞: G1 eval redesign — curated questions, language-independent keywords"}, {"hash": "5ff6f222db6ddf9ad63366bb5ee5d1b451c2b266", "subject": "20260402 1820 live-inf iter 1/∞: G1 project-understanding eval framework", "date": "2026-04-02", "text": "2026-04-02 20260402 1820 live-inf iter 1/∞: G1 project-understanding eval framework"}, {"hash": "4de187223cca5c0817a68671afeff0936bf92268", "subject": "20260402 1805 omc-live iter 2: PascalCase SYMBOL_PATTERN tuned (8+ chars only)", "date": "2026-04-02", "text": "2026-04-02 20260402 1805 omc-live iter 2: PascalCase SYMBOL_PATTERN tuned (8+ chars only)"}, {"hash": "665169799ef1499baf79ca225c9651e5464f21d3", "subject": "20260402 1800 omc-live iter 1: SYMBOL_PATTERN fix — PascalCase after function/class keyword", "date": "2026-04-02", "text": "2026-04-02 20260402 1800 omc-live iter 1: SYMBOL_PATTERN fix — PascalCase after function/class keyword"}, {"hash": "f05c21572f7dd8f2e06b107fcf9758f914e92c61", "subject": "20260402 1745 live-infinite iter 13/∞: G1 zero-storage analysis — concept group R@5=0.965", "date": "2026-04-02", "text": "2026-04-02 20260402 1745 live-infinite iter 13/∞: G1 zero-storage analysis — concept group R@5=0.965"}, {"hash": "8f658a279fddede9fa08dff2539423d35a5e00d0", "subject": "20260402 1730 live-infinite iter 12/∞: G1 redefined — zero-storage instant retrieval", "date": "2026-04-02", "text": "2026-04-02 20260402 1730 live-infinite iter 12/∞: G1 redefined — zero-storage instant retrieval"}, {"hash": "5ba6a7c9ec2fbd1b1686202bab3c23845919ffd5", "subject": "20260402 1717 live-infinite iter 11/∞: reverse_import 10→30, temporal noise reduction, BM25 large-repo boost", "date": "2026-04-02", "text": "2026-04-02 20260402 1717 live-infinite iter 11/∞: reverse_import 10→30, temporal noise 
reduction, BM25 large-repo boost"}, {"hash": "3b1ac9ab65ef59530795b1f7f5cd4a4033999911", "subject": "20260402 1530 live-infinite iter 10/∞: checkpoint uncommitted changes from iter 6-10", "date": "2026-04-02", "text": "2026-04-02 20260402 1530 live-infinite iter 10/∞: checkpoint uncommitted changes from iter 6-10"}, {"hash": "d50322019041574d6744b2ed23fbe09fc8cadd0c", "subject": "20260402 1130 live-infinite iter 6/∞: import_alias_map for IMPLICIT_CONTEXT depth + alias traversal", "date": "2026-04-02", "text": "2026-04-02 20260402 1130 live-infinite iter 6/∞: import_alias_map for IMPLICIT_CONTEXT depth + alias traversal"}, {"hash": "8ff9bfe73ad1054ea713aae8abfdb94dfc529d7c", "subject": "live-infinite iter 5/∞: success | goal_v0: external R@5 0.5649→0.6033 (+3.8%)", "date": "2026-04-01", "text": "2026-04-01 live-infinite iter 5/∞: success | goal_v0: external R@5 0.5649→0.6033 (+3.8%)"}, {"hash": "d012467b7d8e0baa6021f5c248ba8a3631cbe540", "subject": "live-infinite iter 4/∞: external R@5 0.5406→0.5623 | bigram BM25 + query expansion", "date": "2026-04-01", "text": "2026-04-01 live-infinite iter 4/∞: external R@5 0.5406→0.5623 | bigram BM25 + query expansion"}, {"hash": "f02c5b7fb680f7ebaa3b09dc44b52c58803b4c87", "subject": "live-infinite iter 3/∞: external R@5 0.5259→0.5406 | goal_v0: reverse_import_graph + path_boost + BM25_blend", "date": "2026-04-01", "text": "2026-04-01 live-infinite iter 3/∞: external R@5 0.5259→0.5406 | goal_v0: reverse_import_graph + path_boost + BM25_blend"}, {"hash": "22668ca882052d3290fda22170103bfac29539a5", "subject": "live-infinite iter 1/∞: COIR R@5 0.740→1.000, RepoBench 0.558→0.975", "date": "2026-04-01", "text": "2026-04-01 live-infinite iter 1/∞: COIR R@5 0.740→1.000, RepoBench 0.558→0.975"}, {"hash": "2051a111098d43a3ef8f2196286185f41c5828de", "subject": "perf: SEMANTIC_CONCEPT R@5 0.500→0.867 on COIR — two fixes", "date": "2026-03-30", "text": "2026-03-30 perf: SEMANTIC_CONCEPT R@5 0.500→0.867 on COIR — two fixes"}, {"hash": 
"dcf2b224b427f829783268d93f26e79e56d9c04b", "subject": "docs(readme): add [fix] tag and control tags table to hook section", "date": "2026-03-30", "text": "2026-03-30 docs(readme): add [fix] tag and control tags table to hook section"}, {"hash": "c4b8536a39be3cf71a54ccd965e5eebc4965684b", "subject": "feat(hook): anti-anchoring guidance for Fix/Replace tasks", "date": "2026-03-30", "text": "2026-03-30 feat(hook): anti-anchoring guidance for Fix/Replace tasks"}, {"hash": "d285a223cdf9419c96b60868ad061fad639dddb6", "subject": "feat(hook): dynamic file suggestion in low-confidence SEMANTIC_CONCEPT warning", "date": "2026-03-30", "text": "2026-03-30 feat(hook): dynamic file suggestion in low-confidence SEMANTIC_CONCEPT warning"}, {"hash": "eddf0682d604834efdc8bff87b585a764073f473", "subject": "live-infinite iter 76/∞: CONVERGED | goal_v2: CTX 논문 References 보강 + 학술적 임팩트 강화 — score=0.9922", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 76/∞: CONVERGED | goal_v2: CTX 논문 References 보강 + 학술적 임팩트 강화 — score=0.9922\n2026-03-30 live-infinite iter 76/∞: CONVERGED | goal_v2: CTX 논문 References 보강 + 학술적 임팩트 강화 — score=0.9922"}, {"hash": "30cdb82b67d8c1c1391ccbf0cc99486226063e32", "subject": "live-infinite iter 71/∞: success | goal_v2: CTX 논문 References 보강 + 학술적 임팩트 강화 — BEIR/CoIR 직접 인용, 관련 논문 비교 명확화", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 71/∞: success | goal_v2: CTX 논문 References 보강 + 학술적 임팩트 강화 — BEIR/CoIR 직접 인용, 관련 논문 비교 명확화"}, {"hash": "ffda6825af0f63444a4f6622558aa6d574eb5a52", "subject": "live-infinite iter 70/∞: success | goal_v3: Conclusion IMPLICIT_CONTEXT p-value clarified: overall p=0.013→specific 30q ", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 70/∞: success | goal_v3: Conclusion IMPLICIT_CONTEXT p-value clarified: overall p=0.013→specific 30q "}, {"hash": "8612db84b43727dd037c63712e7aec898dcf1c19", "subject": "live-infinite iter 68/∞: success | goal_v3: Section 3.5 — COIR 0.38→0.380 (3-decimal consistency with 
COIR table)", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 68/∞: success | goal_v3: Section 3.5 — COIR 0.38→0.380 (3-decimal consistency with COIR table)"}, {"hash": "b3def6252d0a5d899cdba832f5144db6ddde2ce1", "subject": "live-infinite iter 67/∞: success | goal_v3: Section 5.3 — wrong Table 1 cross-ref → Section 4.4", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 67/∞: success | goal_v3: Section 5.3 — wrong Table 1 cross-ref → Section 4.4"}, {"hash": "00581df0be8161fc957555caa2b02b184854294a", "subject": "live-infinite iter 66/∞: success | goal_v3: Section 4.5 — +24pp→+20pp TEMPORAL delta (consistent with 0.60 vs 0.40 in Se", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 66/∞: success | goal_v3: Section 4.5 — +24pp→+20pp TEMPORAL delta (consistent with 0.60 vs 0.40 in Se"}, {"hash": "fc4c32b9b6da2dd52da66463e4c8b5a6ece8e66d", "subject": "live-infinite iter 65/∞: success | goal_v3: Section 5.2 — TEMPORAL_HISTORY GraphPrompt value 0.50→0.60 (matches trigger-", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 65/∞: success | goal_v3: Section 5.2 — TEMPORAL_HISTORY GraphPrompt value 0.50→0.60 (matches trigger-"}, {"hash": "488155bc7b420b67c76264d6fae53257fb1a764b", "subject": "live-infinite iter 64/∞: success | goal_v3: Section 5.1 — GraphRAG-lite mislabeled as text-based, fixed to graph baselin", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 64/∞: success | goal_v3: Section 5.1 — GraphRAG-lite mislabeled as text-based, fixed to graph baselin"}, {"hash": "0ba58dfa720b75af9e1163f0ec639b2d360a173f", "subject": "live-infinite iter 63/∞: success | goal_v3: Section 5.3 — ablation vs main results discrepancy note added", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 63/∞: success | goal_v3: Section 5.3 — ablation vs main results discrepancy note added"}, {"hash": "5f37608cfe69fb012cd2ba466cf5e5e5866daa36", "subject": "live-infinite iter 62/∞: success | goal_v3: Section 4.5 — GraphPrompt 
82→73, AgentNode 217→596 (consistent with Section ", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 62/∞: success | goal_v3: Section 4.5 — GraphPrompt 82→73, AgentNode 217→596 (consistent with Section "}, {"hash": "acbe48550e7dae7d9a7eb79bd85fc3038b97020a", "subject": "live-infinite iter 61/∞: success | goal_v3: Section 2.1 — Memori token% comparison '2--5%'→'5.2% synthetic, 1.1% real av", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 61/∞: success | goal_v3: Section 2.1 — Memori token% comparison '2--5%'→'5.2% synthetic, 1.1% real av"}, {"hash": "4da5436971718c4e8779523d16a7878cb1a57eba", "subject": "live-infinite iter 60/∞: success | goal_v3: References — Li et al. 2024a/b disambiguation (CAR vs CoIR)", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 60/∞: success | goal_v3: References — Li et al. 2024a/b disambiguation (CAR vs CoIR)"}, {"hash": "b2329f8ee12bcf0e169202f5da7be6eb765bb3af", "subject": "live-infinite iter 59/∞: success | goal_v3: Section 4.1 — '415 total queries' clarified (968 real files, 249 real querie", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 59/∞: success | goal_v3: Section 4.1 — '415 total queries' clarified (968 real files, 249 real querie"}, {"hash": "cec60c016fc909a8db9c1812feda299836a73c38", "subject": "live-infinite iter 58/∞: success | goal_v3: Section 5.7 — CTX real token% 2.2%→1.1% (table row 211 avg)", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 58/∞: success | goal_v3: Section 5.7 — CTX real token% 2.2%→1.1% (table row 211 avg)"}, {"hash": "a781e39c4288387030161aee7998f1e463afef3d", "subject": "live-infinite iter 57/∞: success | goal_v3: Section 4.4 — OneViral TES ratio 2.5x→3.6x (0.232/0.065=3.57)", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 57/∞: success | goal_v3: Section 4.4 — OneViral TES ratio 2.5x→3.6x (0.232/0.065=3.57)"}, {"hash": "aba20d9e83726610725b28f257c8fd34fddd1554", "subject": "live-infinite iter 56/∞: success | goal_v3: 
Section 2.2 — jCodeMunch add missing citation (Gravelle, 2025)", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 56/∞: success | goal_v3: Section 2.2 — jCodeMunch add missing citation (Gravelle, 2025)"}, {"hash": "b128170e4f719901bcf787a873d7f97f1d1587c5", "subject": "live-infinite iter 55/∞: success | goal_v3: Section 2.2 — CAR citation style (description,year)→(Li et al., 2024)", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 55/∞: success | goal_v3: Section 2.2 — CAR citation style (description,year)→(Li et al., 2024)"}, {"hash": "4b56afd4406427c7912bed61c28e2f1c684855f3", "subject": "live-infinite iter 54/∞: success | goal_v3: Section 2.2 — MeCo citation (ACL 2025)→(Wang et al., 2025)", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 54/∞: success | goal_v3: Section 2.2 — MeCo citation (ACL 2025)→(Wang et al., 2025)"}, {"hash": "21a00a96ac463ef1444b16e9d4c325034c1a35b4", "subject": "live-infinite iter 53/∞: success | goal_v3: Section 2.3 — citation style (NAACL 2024)→(Sun et al., 2024)", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 53/∞: success | goal_v3: Section 2.3 — citation style (NAACL 2024)→(Sun et al., 2024)"}, {"hash": "920fef7817484e6e5ca5cd7a35480fa4f84b0e69", "subject": "live-infinite iter 52/∞: success | goal_v3: Section 4.4 — LlamaIndex TES parenthetical clarified (0.405 LlamaIndex vs 0.", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 52/∞: success | goal_v3: Section 4.4 — LlamaIndex TES parenthetical clarified (0.405 LlamaIndex vs 0."}, {"hash": "e158d1363743c771f01aef094dcab19d7bca33a5", "subject": "live-infinite iter 51/∞: success | goal_v3: Limitations R@3 0.890→0.869 — match main results table", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 51/∞: success | goal_v3: Limitations R@3 0.890→0.869 — match main results table"}, {"hash": "d7998445f2e7a5df1acee969108ab3ebca4168f8", "subject": "live-infinite iter 50/∞: success | goal_v3: Section 4.5 — six→seven 
baselines, add Hybrid to IMPLICIT_CONTEXT analysis", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 50/∞: success | goal_v3: Section 4.5 — six→seven baselines, add Hybrid to IMPLICIT_CONTEXT analysis"}, {"hash": "2bea57a498182f4af453112299f546577b8a6a4a", "subject": "live-infinite iter 49/∞: success | goal_v3: Section 5.1 — add Hybrid to analysis table, fix baseline count and margin cl", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 49/∞: success | goal_v3: Section 5.1 — add Hybrid to analysis table, fix baseline count and margin cl"}, {"hash": "8964c73da77bab6418ee6ab6313ae974af6ac266", "subject": "live-infinite iter 48/∞: success | goal_v3: add missing citation to uncited empirical claim", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 48/∞: success | goal_v3: add missing citation to uncited empirical claim"}, {"hash": "8cb57eedc81384c2d2f3e1dcbd993f9e40a5c287", "subject": "live-infinite iter 47/∞: success | goal_v3: remove inconsistent token% claim", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 47/∞: success | goal_v3: remove inconsistent token% claim"}, {"hash": "0a6a8cc90ee40de62321b6c83344d080420099f9", "subject": "live-infinite iter 46/∞: success | goal_v3: accuracy audit — two more factual fixes", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 46/∞: success | goal_v3: accuracy audit — two more factual fixes"}, {"hash": "ab0e31db1a7b6893719ff0282c7f21016c64d5e6", "subject": "live-infinite iter 45/∞: success | goal_v3: efficiency scan — factual correction", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 45/∞: success | goal_v3: efficiency scan — factual correction"}, {"hash": "62146b1f3bbf407f4aa5c66a55b6ff00488b0067", "subject": "live-infinite iter 44/∞: success | goal_v3: CTX 논문 References 보강 + 학술적 임팩트 강화", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 44/∞: success | goal_v3: CTX 논문 References 보강 + 학술적 임팩트 강화"}, {"hash": 
"df85f41dd041ce7f7c61b3da218528f6cce26641", "subject": "live-infinite iter 41/∞: success | goal_v2: CTX 논문 References 보강 + 학술적 임팩트 강화", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 41/∞: success | goal_v2: CTX 논문 References 보강 + 학술적 임팩트 강화"}, {"hash": "521264ae4782e6580163480451adb637faba11bf", "subject": "omc: update world model — iters 39-40 improved impact (0.985→0.992)", "date": "2026-03-30", "text": "2026-03-30 omc: update world model — iters 39-40 improved impact (0.985→0.992)"}, {"hash": "87ce1aa19b6a26ce88f87f91d0b03eeb39a0461f", "subject": "live-infinite iter 40/∞: success | goal_v2: CTX 논문 References 보강 + 학술적 임팩트 강화", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 40/∞: success | goal_v2: CTX 논문 References 보강 + 학술적 임팩트 강화"}, {"hash": "c5a01bb339c4d2f0652bdff3ab0e473ca67c4ba8", "subject": "live-infinite iter 39/∞: success | goal_v2: CTX 논문 References 보강 + 학술적 임팩트 강화", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 39/∞: success | goal_v2: CTX 논문 References 보강 + 학술적 임팩트 강화"}, {"hash": "24b6d38f38c0e2af5ad15b7a1e28da8ebd051a40", "subject": "live-infinite iter 37/∞: success | goal_v2: CTX 논문 References 보강 + 학술적 임팩트 강화", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 37/∞: success | goal_v2: CTX 논문 References 보강 + 학술적 임팩트 강화"}, {"hash": "512e02058e5e31c3bd306fae0907a07dda51be7a", "subject": "live-infinite iter 36/∞: success | goal_v2: CTX 논문 References 보강 + 학술적 임팩트 강화", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 36/∞: success | goal_v2: CTX 논문 References 보강 + 학술적 임팩트 강화"}, {"hash": "bbbdcd860796b2577271b0e4727044f61900cd3d", "subject": "omc: update world model — iter 35 plateau_count=0 (completeness improved)", "date": "2026-03-30", "text": "2026-03-30 omc: update world model — iter 35 plateau_count=0 (completeness improved)"}, {"hash": "80c1edd57f31794d839eb13ee937f029b2652243", "subject": "live-infinite iter 35/∞: success | goal_v2: CTX 논문 References 보강 + 학술적 임팩트 강화", "date": 
"2026-03-30", "text": "2026-03-30 live-infinite iter 35/∞: success | goal_v2: CTX 논문 References 보강 + 학술적 임팩트 강화"}, {"hash": "14db49529a08835bee9367aed05a5becff549b8d", "subject": "live-infinite iter 34/∞: success | goal_v2: CTX 논문 References 보강 + 학술적 임팩트 강화", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 34/∞: success | goal_v2: CTX 논문 References 보강 + 학술적 임팩트 강화"}, {"hash": "fea45da02792e53de2da54351ad69854363fb840", "subject": "live-infinite iter 33/∞: success | goal_v2: fix Introduction TES 0.78→0.776 (consistent with Section 4.4)", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 33/∞: success | goal_v2: fix Introduction TES 0.78→0.776 (consistent with Section 4.4)"}, {"hash": "d93b7c651befac7a11e2a97f82a3f928b7ebb98d", "subject": "live-infinite iter 32/∞: success | goal_v2: fix integration doc — add session tracker to install, update stale 0.733 ref", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 32/∞: success | goal_v2: fix integration doc — add session tracker to install, update stale 0.733 ref"}, {"hash": "deb818e16c733b4940a28f98c933f18f6b167ce0", "subject": "live-infinite iter 31/∞: success | goal_v2: clarify 0.495 vs 0.500 discrepancy (bootstrap vs simple per-codebase mean)", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 31/∞: success | goal_v2: clarify 0.495 vs 0.500 discrepancy (bootstrap vs simple per-codebase mean)"}, {"hash": "5b4f22bb6cfab14d35617f3e4b25636292645ccb", "subject": "live-infinite iter 30/∞: success | goal_v2: fix stale delta values in When to Use CTX (+0.194/+0.156/+0.136→+0.198/+0.15", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 30/∞: success | goal_v2: fix stale delta values in When to Use CTX (+0.194/+0.156/+0.136→+0.198/+0.15"}, {"hash": "d171e5826eaa4b1faa784779ce3060a891d7b4ba", "subject": "live-infinite iter 29/∞: success | goal_v2: fix HuggingFace URL Be2Jay→jaytoone (consistent with pyproject.toml)", "date": "2026-03-30", "text": "2026-03-30 
live-infinite iter 29/∞: success | goal_v2: fix HuggingFace URL Be2Jay→jaytoone (consistent with pyproject.toml)"}, {"hash": "8ff5f5714a07188c5302af6e79544a63c1b4e384", "subject": "live-infinite iter 28/∞: success | goal_v2: fix Quick Start import path src.→ctx_retriever. (pip package namespace)", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 28/∞: success | goal_v2: fix Quick Start import path src.→ctx_retriever. (pip package namespace)"}, {"hash": "f167253a0c58e266e232773b4b7d99ae457039d1", "subject": "live-infinite iter 27/∞: success | goal_v2: add PostToolUse session tracker to setup + fix TEMPORAL_HISTORY mechanism in", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 27/∞: success | goal_v2: add PostToolUse session tracker to setup + fix TEMPORAL_HISTORY mechanism in"}, {"hash": "6cf1532285735f8741451eb701527d7b0bfd211f", "subject": "live-infinite iter 26/∞: success | goal_v2: fix ctx_loader→ctx_real_loader in integration doc + README +0.162→+0.163 con", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 26/∞: success | goal_v2: fix ctx_loader→ctx_real_loader in integration doc + README +0.162→+0.163 con"}, {"hash": "eb67c13b5e45971decaf8bb225178a781869d2e9", "subject": "live-infinite iter 25/∞: success | goal_v2: fix section numbering (4.9↔4.8 swap) + remove Obsidian wikilinks from paper", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 25/∞: success | goal_v2: fix section numbering (4.9↔4.8 swap) + remove Obsidian wikilinks from paper"}, {"hash": "57c70d3fef85f5c0e805365af514d1fbb42879ce", "subject": "live-infinite iter 24/∞: success | goal_v2: fix hook setup docs — CTX_PROJECT config step + accurate dependency note", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 24/∞: success | goal_v2: fix hook setup docs — CTX_PROJECT config step + accurate dependency note"}, {"hash": "208a09d2da87cd372ea58dffcc2438627bb73c26", "subject": "live-infinite iter 23/∞: success | goal_v2: add hook performance 
table to README (efficiency dimension)", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 23/∞: success | goal_v2: add hook performance table to README (efficiency dimension)"}, {"hash": "627f49a6dd736d7d3a4c960e2a025a978ef55ec8", "subject": "live-infinite iter 22/∞: success | goal_v2: remove internal artifacts from paper (v9, final_report_v10, v4 calibrated)", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 22/∞: success | goal_v2: remove internal artifacts from paper (v9, final_report_v10, v4 calibrated)"}, {"hash": "fc3d20c899884ce7e5fa52d164b2bb4a31e8ad90", "subject": "live-infinite iter 21/∞: success | goal_v2: fix external benchmark consistency (Flask 0.542→0.545, add McNemar p-values)", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 21/∞: success | goal_v2: fix external benchmark consistency (Flask 0.542→0.545, add McNemar p-values)"}, {"hash": "4eabe4deb0c0f07d9f0736b9012ad1b879667210", "subject": "live-infinite iter 20/∞: success | goal_v2: fix Conclusion 0.25 deployment threshold (match Abstract fix)", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 20/∞: success | goal_v2: fix Conclusion 0.25 deployment threshold (match Abstract fix)"}, {"hash": "c07d3c8a9d4b45ba55f9e21888265e5688d7b539", "subject": "live-infinite iter 19/∞: paper — remove unsupported '0.25 deployment threshold' claim", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 19/∞: paper — remove unsupported '0.25 deployment threshold' claim"}, {"hash": "3ebf86bb58d9ce25a16e3ec34071b1ffd82eb3d7", "subject": "live-infinite iter 18/∞: paper — LongCodeBench → proper author-year citation (Guo et al., 2025)", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 18/∞: paper — LongCodeBench → proper author-year citation (Guo et al., 2025)"}, {"hash": "f1c33b4ec6b01780635c959e9842bf579ff3e449", "subject": "live-infinite iter 17/∞: fix abstract — 49%→61% below BM25 on COIR text-to-code", "date": "2026-03-30", "text": "2026-03-30 
live-infinite iter 17/∞: fix abstract — 49%→61% below BM25 on COIR text-to-code"}, {"hash": "83cea358e824029ac75da9410a872871f3a29e8d", "subject": "live-infinite iter 16/∞: paper — add missing [9] Santos et al. citation for associative recall", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 16/∞: paper — add missing [9] Santos et al. citation for associative recall"}, {"hash": "71fd11567e9ea7f0ed7130c17bfe9e4ce2b80471", "subject": "live-infinite iter 15/∞: docs — final ASCII→BM25 in trigger table", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 15/∞: docs — final ASCII→BM25 in trigger table"}, {"hash": "5150b2b859dd9372f09ae31a7b8dcc2fa4b7fc77", "subject": "live-infinite iter 14/∞: fix Conclusion — CTX R@5 0.982→0.874 (same error as abstract, different phrasing missed replace", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 14/∞: fix Conclusion — CTX R@5 0.982→0.874 (same error as abstract, different phrasing missed replace"}, {"hash": "050cf810ce94b0a6764ecd969d4e9b83ee07cde7", "subject": "live-infinite iter 13/∞: fix paper abstract — synthetic 600 queries → 166 queries (50 files)", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 13/∞: fix paper abstract — synthetic 600 queries → 166 queries (50 files)"}, {"hash": "79f5bf955c9f2b65bbf9614f098a77b382a56d7b", "subject": "live-infinite iter 12/∞: paper — author Jeawon Jang + clean manuscript footer", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 12/∞: paper — author Jeawon Jang + clean manuscript footer"}, {"hash": "f55dda75ed11bc33833f07278dfeee321d941f05", "subject": "live-infinite iter 11/∞: docs — update integration guide ASCII→BM25 keyword scoring", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 11/∞: docs — update integration guide ASCII→BM25 keyword scoring"}, {"hash": "cdf0820469429400649166302bafc6c2d920d4a3", "subject": "live-infinite iter 10/∞: README intro — add external codebase outperformance to opening sentence", 
"date": "2026-03-30", "text": "2026-03-30 live-infinite iter 10/∞: README intro — add external codebase outperformance to opening sentence"}, {"hash": "b2318907841dd2c026d95104a5c8be93aef0c23f", "subject": "live-infinite iter 9/∞: README — downstream LLM eval table + G1/G2 Key Findings bullet", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 9/∞: README — downstream LLM eval table + G1/G2 Key Findings bullet"}, {"hash": "b4605f399dda9429497f14e652c82c9f7ed5ee5b", "subject": "live-infinite iter 8/∞: README Key Findings — add external codebase outperformance bullet", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 8/∞: README Key Findings — add external codebase outperformance bullet"}, {"hash": "c0bc6b3a52f45b9f3401daa70e7177f808d4c595", "subject": "live-infinite iter 7/∞: success | README — external codebase results table (CTX > BM25 on Flask/FastAPI/Requests)", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 7/∞: success | README — external codebase results table (CTX > BM25 on Flask/FastAPI/Requests)"}, {"hash": "71156c19342395d50d76af02290e0f6f806f63a5", "subject": "live-infinite iter 6/∞: fix paper Limitations — clarify BM25:0.982 vs CTX:0.874 ambiguity", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 6/∞: fix paper Limitations — clarify BM25:0.982 vs CTX:0.874 ambiguity"}, {"hash": "0ef00d3720df9185c840ef37756eca09cf4bbe2a", "subject": "live-infinite iter 5b/∞: fix paper abstract+conclusion — CTX R@5=0.874 (not 0.982=BM25)", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 5b/∞: fix paper abstract+conclusion — CTX R@5=0.874 (not 0.982=BM25)"}, {"hash": "41be9725c85d4d95f2e3ca2eae83722b7aea5a2f", "subject": "live-infinite iter 5/∞: success | goal_v3: paper References 보강 — BEIR [13] + CoIR [14] 인용 추가", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 5/∞: success | goal_v3: paper References 보강 — BEIR [13] + CoIR [14] 인용 추가"}, {"hash": "9754dbecbcddc8f349c832561375984132df5c49", 
"subject": "live-infinite iter 4/∞: success | goal_v2: README — When to Use CTX section (honest capability/limitation disclosure)", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 4/∞: success | goal_v2: README — When to Use CTX section (honest capability/limitation disclosure)"}, {"hash": "9d479c4730c7cf94582768a1f4a0efe37ecc82d9", "subject": "live-infinite iter 3/∞: success | goal_v1: snake_case 트리거 분류 회귀 테스트 추가", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 3/∞: success | goal_v1: snake_case 트리거 분류 회귀 테스트 추가"}, {"hash": "ceefd941541d2701c42b7264b25a035e21731ac1", "subject": "live-infinite iter 2/∞: success | goal_v1: 논문+README trigger accuracy 업데이트", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 2/∞: success | goal_v1: 논문+README trigger accuracy 업데이트"}, {"hash": "a54c6e081cb6881f08fdaa0cd97be55280fc9096", "subject": "live-infinite iter 1/∞: success | goal_v0: trigger classifier EXPLICIT recall 0.367→1.000", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 1/∞: success | goal_v0: trigger classifier EXPLICIT recall 0.367→1.000"}, {"hash": "ea77b9d8ba4ef3209b870b290799e48360c8dc9d", "subject": "live-infinite iter 5/∞: AST 개선 테스트 추가 (19 tests)", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 5/∞: AST 개선 테스트 추가 (19 tests)"}, {"hash": "cff8db567032b14c14c45f72cdeb066f225c3ba4", "subject": "live-infinite iter 4/∞: sig_only eval + 연구 문서 업데이트", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 4/∞: sig_only eval + 연구 문서 업데이트"}, {"hash": "0e8d2846777f5cfd61ec9e208130201c58639d7e", "subject": "live-infinite iter 4/∞: 미커밋 파일 정리 커밋", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 4/∞: 미커밋 파일 정리 커밋"}, {"hash": "3b8eb7229d132a6b7ac07261bcc4a43b72321a11", "subject": "live-infinite iter 4/∞: CI 배지 + arXiv 준비 확인", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 4/∞: CI 배지 + arXiv 준비 확인"}, {"hash": "e4385787b03a626bfccb6a453ef427f9e1da3e8e", "subject": "live-infinite iter 
3/∞: BM25 stem 강화 실험 (neutral)", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 3/∞: BM25 stem 강화 실험 (neutral)"}, {"hash": "080047e352fdff8e0d34497328a05ee2c95d8ac0", "subject": "live-infinite iter 3/∞: P4 교차 파일 추론 강화", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 3/∞: P4 교차 파일 추론 강화"}, {"hash": "b4067e0ffe623c7228967fcecc8f56c5944d1bda", "subject": "live-infinite iter 2/∞: P3 README 배지 추가", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 2/∞: P3 README 배지 추가"}, {"hash": "302c9385ce8c4cbe7b7b0e9ef1b2d07eaffe07c3", "subject": "live-infinite iter 2/∞: P2 성능 개선 — AST 심볼 + 개념 인덱싱", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 2/∞: P2 성능 개선 — AST 심볼 + 개념 인덱싱"}, {"hash": "646936c74cae8fdc14a0e6c6806175a28baf7629", "subject": "live-infinite iter 1/∞: P1 실용 임팩트 완료", "date": "2026-03-30", "text": "2026-03-30 live-infinite iter 1/∞: P1 실용 임팩트 완료\n2026-03-30 live-infinite iter 1/∞: P1 실용 임팩트 완료"}, {"hash": "1335244ae72dc9b2cebb9a59dc42af98d05c357a", "subject": "feat: HF Space deployed + PyPI package validated", "date": "2026-03-30", "text": "2026-03-30 feat: HF Space deployed + PyPI package validated"}, {"hash": "462d7a9068da746efa564e550fa393865d7ed040", "subject": "feat: CONTRIBUTING.md + PyPI publish script + build fix", "date": "2026-03-30", "text": "2026-03-30 feat: CONTRIBUTING.md + PyPI publish script + build fix"}, {"hash": "f8dd0287545cfc8cbce92c3cd1474fc61ea82546", "subject": "feat: distribution prep — pyproject.toml + README + HF Space + hooks/", "date": "2026-03-30", "text": "2026-03-30 feat: distribution prep — pyproject.toml + README + HF Space + hooks/"}, {"hash": "b043d0e019f99e5187f572535c40947afd1a3be5", "subject": "feat: before/after benchmark + hook improvement report", "date": "2026-03-29", "text": "2026-03-29 feat: before/after benchmark + hook improvement report"}, {"hash": "523be3e80b5693f0f228885c3ab138ede2bdcc02", "subject": "fix: classify_intent 한국어 명사 오탐(FP) 수정 — regex 동사어미 앵커링", "date": 
"2026-03-29", "text": "2026-03-29 fix: classify_intent 한국어 명사 오탐(FP) 수정 — regex 동사어미 앵커링"}, {"hash": "74b2dbc9a445b381e800e45bd51a896c27f4ba0a", "subject": "test: TriggerClassifier 한국어 intent 분류 단위 테스트 33개 추가", "date": "2026-03-29", "text": "2026-03-29 test: TriggerClassifier 한국어 intent 분류 단위 테스트 33개 추가"}, {"hash": "770cb4eb59d809594b42db627a17d65f03499b8d", "subject": "feat: TriggerClassifier.classify_intent 한국어 modify/create 키워드 추가", "date": "2026-03-29", "text": "2026-03-29 feat: TriggerClassifier.classify_intent 한국어 modify/create 키워드 추가"}, {"hash": "638d5542d45a024ccce71c41351fa7cfd5afa970", "subject": "CTX: classify_intent() — Fix/Replace over-anchoring protection", "date": "2026-03-28", "text": "2026-03-28 CTX: classify_intent() — Fix/Replace over-anchoring protection"}, {"hash": "72481da0b8989087bdb81cee005121f19809476c", "subject": "CTX iter6: SOYA 배포 패키지 — 지연시간 프로파일링 + 배포 가이드", "date": "2026-03-28", "text": "2026-03-28 CTX iter6: SOYA 배포 패키지 — 지연시간 프로파일링 + 배포 가이드"}, {"hash": "914817d474538be3e9a4c5fb697e863a59e24b49", "subject": "CTX iter5: G2 v4 benchmark calibration + SOYA deployment verdict", "date": "2026-03-28", "text": "2026-03-28 CTX iter5: G2 v4 benchmark calibration + SOYA deployment verdict"}, {"hash": "bf857f810aa3f48c45be433f1d4a7c034dc1ab12", "subject": "CTX iter4: BM25 baseline comparison + paper draft v4.0 P10 update", "date": "2026-03-28", "text": "2026-03-28 CTX iter4: BM25 baseline comparison + paper draft v4.0 P10 update"}, {"hash": "3d8f2f13a50d6b722dbe7b96539f37c0b8959917", "subject": "feat: update paper draft (v4.0 P10) with external R@5=0.495, G1/G2 eval, bootstrap CI", "date": "2026-03-28", "text": "2026-03-28 feat: update paper draft (v4.0 P10) with external R@5=0.495, G1/G2 eval, bootstrap CI"}, {"hash": "727b5c3b6a75c950b0570e866db6863a0bcfee5c", "subject": "feat: fix SEMANTIC trigger misclassification — external R@5 0.217→0.495 (+128%)", "date": "2026-03-28", "text": "2026-03-28 feat: fix SEMANTIC trigger misclassification — 
external R@5 0.217→0.495 (+128%)"}, {"hash": "720380f47da62e63357bd57eef7420ef609d4133", "subject": "feat: fix external codebase generalization — IMPLICIT R@5 +350% avg", "date": "2026-03-28", "text": "2026-03-28 feat: fix external codebase generalization — IMPLICIT R@5 +350% avg"}, {"hash": "dbcd69215454a7a40c8f5611af7c3b6da54aa91a", "subject": "docs: CTX downstream LLM eval report + DOC_INDEX 업데이트", "date": "2026-03-27", "text": "2026-03-27 docs: CTX downstream LLM eval report + DOC_INDEX 업데이트"}, {"hash": "fcdd5443f25012f4d214cfb6acf1967c03367d84", "subject": "feat: CTX downstream LLM evaluation framework (G1+G2)", "date": "2026-03-27", "text": "2026-03-27 feat: CTX downstream LLM evaluation framework (G1+G2)"}, {"hash": "f42a22b3b6a5223e4fe544cd9b90cd8762cfb147", "subject": "feat: CTX-doc keyword R@3 ≥ 0.724 달성 — query_type-aware routing", "date": "2026-03-27", "text": "2026-03-27 feat: CTX-doc keyword R@3 ≥ 0.724 달성 — query_type-aware routing"}, {"hash": "7d1a6a8f144ddfc856516738515956c9f55b8df0", "subject": "refactor: restore optimal BM25 blend ratio in rank_ctx_doc (norm*0.9)", "date": "2026-03-27", "text": "2026-03-27 refactor: restore optimal BM25 blend ratio in rank_ctx_doc (norm*0.9)"}, {"hash": "5099f32b707f521826117d297d26d8b32f3cb415", "subject": "feat: replace TF-IDF with BM25 in AdaptiveTriggerRetriever and doc benchmark", "date": "2026-03-27", "text": "2026-03-27 feat: replace TF-IDF with BM25 in AdaptiveTriggerRetriever and doc benchmark"}, {"hash": "4f82db8b83b821b93b6719a368f179e18515c94e", "subject": "Fix ctx_loader: CHR 70% → 86.7% (+16.7pp)", "date": "2026-03-25", "text": "2026-03-25 Fix ctx_loader: CHR 70% → 86.7% (+16.7pp)"}, {"hash": "ad93926f56157ab25c2af78e7db14d3ad3e3fdc7", "subject": "Add CTX Hook effectiveness evaluation (CHR=70%, RT=117ms)", "date": "2026-03-25", "text": "2026-03-25 Add CTX Hook effectiveness evaluation (CHR=70%, RT=117ms)"}, {"hash": "37ba42e2d0124eb84c55175a7f7e7e8e3d39fdea", "subject": "Add document retrieval 
support to CTX hook + eval", "date": "2026-03-25", "text": "2026-03-25 Add document retrieval support to CTX hook + eval"}, {"hash": "402c93148811c862c8dcfe0e59448330c40d7fa7", "subject": "Add RepoBench eval + open-source LLM pass@1 (reproducible)", "date": "2026-03-25", "text": "2026-03-25 Add RepoBench eval + open-source LLM pass@1 (reproducible)"}, {"hash": "194d21afd215dd5f0861329cde39e4a995ef446c", "subject": "Tier upgrade: trigger accuracy, external eval, paper updates", "date": "2026-03-25", "text": "2026-03-25 Tier upgrade: trigger accuracy, external eval, paper updates"}, {"hash": "7ef1bc8413e434198e5c56e514cb15084572ec4a", "subject": "Add HF Space and tier evaluation research", "date": "2026-03-25", "text": "2026-03-25 Add HF Space and tier evaluation research"}]} \ No newline at end of file diff --git a/tests/golden/run_golden.py b/tests/golden/run_golden.py new file mode 100644 index 0000000..3eff4b1 --- /dev/null +++ b/tests/golden/run_golden.py @@ -0,0 +1,342 @@ +#!/usr/bin/env python3 +""" +Golden fixture runner for bm25-memory.py pre-decomposition regression guard. + +Usage: + python3 tests/golden/run_golden.py # read-only verify + python3 tests/golden/run_golden.py --update # overwrite expected outputs + +Exit codes: + 0 — all fixtures matched (or --update completed) + 1 — one or more fixtures differ from expected + +Environment: + Each fixture carries its own env dict. The runner merges it on top of + the current process environment (so PATH / PYTHONPATH are inherited). + HOME is overridden to /tmp/ctx_golden_home to isolate side effects. + +python_bin field (optional): + Each fixture may carry a "python_bin" field specifying the interpreter to + use. Relative paths are resolved from the project root. Absolute paths + are used as-is. If the field is absent, sys.executable (the interpreter + running this script) is used. If the specified interpreter does not exist + the fixture is an immediate FAIL — it is never silently skipped. 
+ + Fallback fixtures (no python_bin): system Python, the BM25 library absent → + HAS_BM25=False path, G1/G2-DOCS return [] (only G2-GREP emitted). + BM25-path fixtures (python_bin=".venv-golden/bin/python"): BM25 library + present (v0.2.2) → HAS_BM25=True path, G1 [RECENT DECISIONS] + G2-DOCS + blocks emitted alongside G2-GREP. + +Note on HAS_BM25: + The BM25 library (package name: rank-bm25) is not installed in the default + Python 3.14 (Homebrew) environment used by this machine. G1 / G2-DOCS BM25 + ranking therefore returns [] and only G2-GREP (git grep) + Session Notes + + World Model are emitted. The fixtures capture that fallback behaviour + faithfully — they will continue to pass after Task A decomposition as long as + the same code paths execute. + BM25-path fixtures (suffix _bm25path) use .venv-golden/bin/python where the + BM25 library is installed; these capture the full G1+G2-DOCS output. +""" + +import argparse +import json +import os +import subprocess +import sys +from pathlib import Path + + +FIXTURE_PATH = Path(__file__).parent / "bm25_memory_outputs.jsonl" +HOOK_PATH = Path(__file__).parent.parent.parent / "src" / "hooks" / "bm25-memory.py" +PROJECT_DIR = str(Path(__file__).parent.parent.parent.resolve()) + +# Frozen decision corpus for BM25-path fixtures. +# Captured at commit b398ee8 (220 entries, embeddings stripped). +# Injected into .omc/decision_corpus.json before each _bm25path fixture run, +# with the current git HEAD written into the "head" field so bm25-memory.py +# treats it as a valid cache hit and skips rebuilding from git log. +# This isolates BM25-path fixtures from future git commits. 
+FROZEN_CORPUS_PATH = Path(__file__).parent / "bm25_path_corpus_frozen.json" + + +def _get_git_head() -> str: + """Return the current git HEAD SHA, or empty string on failure.""" + try: + return subprocess.check_output( + ["git", "rev-parse", "HEAD"], + cwd=PROJECT_DIR, + stderr=subprocess.DEVNULL, + ).decode().strip() + except Exception: + return "" + + +def _inject_frozen_corpus() -> None: + """Write frozen corpus into .omc/decision_corpus.json with current HEAD. + + bm25-memory.py checks cache_path head == current HEAD to decide whether + to use the cache. By injecting the frozen corpus with the current HEAD, + we make the hook use our fixed corpus regardless of new git commits. + """ + if not FROZEN_CORPUS_PATH.exists(): + return + head = _get_git_head() + frozen = json.loads(FROZEN_CORPUS_PATH.read_text()) + cache_path = Path(PROJECT_DIR) / ".omc" / "decision_corpus.json" + cache_path.parent.mkdir(exist_ok=True) + cache_path.write_text(json.dumps({ + "head": head, + "corpus": frozen["corpus"], + "emb_head": "", # no embeddings → dense path disabled, BM25-only + }, ensure_ascii=False)) + + +def _ensure_golden_home() -> None: + """Create HOME skeleton directories for both fallback and BM25-path fixtures.""" + for home_root in ("/tmp/ctx_golden_home", "/tmp/ctx_golden_home_bm25"): + home = Path(home_root) / ".claude" + home.mkdir(parents=True, exist_ok=True) + # No ctx-auto-tune.json → auto-tune disabled (consistent with capture) + vault = Path(home_root) / ".local/share/claude-vault" + vault.mkdir(parents=True, exist_ok=True) + + +def _resolve_python_bin(python_bin: str | None) -> str: + """Resolve python_bin field to an absolute path. + + Relative paths are resolved from the project root (PROJECT_DIR). + If python_bin is None, fall back to sys.executable. + Raises FileNotFoundError if the resolved path does not exist. 
+ """ + if python_bin is None: + return sys.executable + p = Path(python_bin) + if not p.is_absolute(): + p = Path(PROJECT_DIR) / p + if not p.exists(): + raise FileNotFoundError( + f"python_bin interpreter not found: {p}\n" + f" (original value: {python_bin!r})\n" + f" Ensure .venv-golden is set up: pip install rank-bm25 numpy" + ) + return str(p) + + +def _build_env(fixture_env: dict) -> dict: + """Merge fixture env on top of current process env.""" + env = {**os.environ} + env.update(fixture_env) + # Always ensure CLAUDE_PROJECT_DIR points to the actual project + env["CLAUDE_PROJECT_DIR"] = PROJECT_DIR + return env + + +def run_fixture(record: dict) -> tuple[str, str, int]: + """Run a single fixture, return (stdout, stderr, exit_code). + + Uses the interpreter specified by the fixture's optional "python_bin" field. + Relative paths are resolved from PROJECT_DIR. Missing interpreter → FAIL. + + For BM25-path fixtures (python_bin set), injects frozen decision corpus + before running so G1 BM25 ranking is stable across git commits. + """ + stdin_bytes = json.dumps(record["stdin"], ensure_ascii=False).encode("utf-8") + python_bin = _resolve_python_bin(record.get("python_bin")) + cmd = [python_bin, str(HOOK_PATH)] + record.get("argv", []) + env = _build_env(record["env"]) + + # Inject frozen corpus for BM25-path fixtures to prevent G1 rank drift + # caused by new commits being added to the decision corpus. + if record.get("python_bin"): + _inject_frozen_corpus() + + result = subprocess.run( + cmd, + input=stdin_bytes, + capture_output=True, + env=env, + cwd=PROJECT_DIR, + ) + return ( + result.stdout.decode("utf-8", errors="replace"), + result.stderr.decode("utf-8", errors="replace"), + result.returncode, + ) + + +import re as _re + + +def _normalize_g2grep_str(text: str) -> str: + """Apply G2-GREP normalization to a plain string (may be embedded in JSON context).""" + def _replace_block(m: _re.Match) -> str: + header = m.group(1) # e.g. 
"[G2-GREP] Files matching '...' (grep):" + file_lines = m.group(2) # the file list lines + start_line = m.group(3) # " Start with: ..." line + count = len([l for l in file_lines.strip().splitlines() if l.strip()]) + return f"{header}\n <{count} file(s) — paths normalized>\n{start_line}" + + # Pattern: header + 1+ indented file lines + "Start with:" line + pattern = ( + r'(\[G2-GREP\] Files matching \'[^\']*\' \(grep\):)' # group 1: header + r'(\n(?: [^\n]+\n)+)' # group 2: file lines + r'( Start with:[^\n]*)' # group 3: start-with line + ) + return _re.sub(pattern, _replace_block, text) + + +def _normalize_g2grep(text: str) -> str: + """Normalize G2-GREP blocks in hook output so file list changes don't cause drift. + + The hook emits a single JSON line whose 'hookSpecificOutput.additionalContext' + field contains the rendered context block (with G2-GREP sections). This + function parses the JSON, normalizes G2-GREP file lists inside the context + string, and re-serialises so that exact file path changes (from new files + being added to the repo) don't fail fixtures. + + What is still validated: + - G2-GREP header line (keyword and format) is unchanged + - Number of matched files is unchanged + - "Start with: ..." 
line presence + What is NOT validated: + - Which specific files appear in the list (only count checked) + """ + try: + data = json.loads(text) + ctx = data.get("hookSpecificOutput", {}).get("additionalContext", "") + if ctx and "[G2-GREP]" in ctx: + data["hookSpecificOutput"]["additionalContext"] = _normalize_g2grep_str(ctx) + return json.dumps(data, ensure_ascii=False) + except (json.JSONDecodeError, AttributeError): + # Fallback: treat as plain text (e.g., error output) + return _normalize_g2grep_str(text) + + +def _diff_summary(expected: str, actual: str) -> str: + """Return a compact unified-diff-style summary of mismatches.""" + exp_lines = expected.splitlines() + act_lines = actual.splitlines() + diff_lines = [] + max_lines = max(len(exp_lines), len(act_lines)) + for i in range(max_lines): + e = exp_lines[i] if i < len(exp_lines) else "" + a = act_lines[i] if i < len(act_lines) else "" + if e != a: + diff_lines.append(f" line {i+1}") + diff_lines.append(f" - {e[:120]}") + diff_lines.append(f" + {a[:120]}") + if len(diff_lines) > 30: + diff_lines.append(" ... 
(truncated)") + break + return "\n".join(diff_lines) + + +def main() -> int: + parser = argparse.ArgumentParser(description="Run golden fixtures for bm25-memory.py") + parser.add_argument( + "--update", + action="store_true", + help="Overwrite expected outputs with current hook output (explicit consent required).", + ) + args = parser.parse_args() + + if not FIXTURE_PATH.exists(): + print(f"ERROR: fixture file not found: {FIXTURE_PATH}", file=sys.stderr) + return 1 + + if not HOOK_PATH.exists(): + print(f"ERROR: hook not found: {HOOK_PATH}", file=sys.stderr) + return 1 + + _ensure_golden_home() + + records = [] + with FIXTURE_PATH.open(encoding="utf-8") as f: + for line in f: + line = line.strip() + if line: + records.append(json.loads(line)) + + if not records: + print("ERROR: no fixtures found in JSONL file", file=sys.stderr) + return 1 + + failures = [] + updated = [] + + for rec in records: + fid = rec["id"] + cat = rec["category"] + try: + actual_stdout, actual_stderr, actual_exit = run_fixture(rec) + except FileNotFoundError as exc: + print(f" FAIL [{cat}] {fid}: interpreter missing — {exc}", file=sys.stderr) + failures.append(fid) + continue + + expected_stdout = rec["expected_stdout"] + expected_exit = rec["expected_exit_code"] + # expected_stderr is optional — absent means "skip stderr check" (backward-compat). + expected_stderr: str | None = rec.get("expected_stderr") + + # Normalize G2-GREP file lists before comparison to prevent drift + # from new files being added to the repo. + norm_expected = _normalize_g2grep(expected_stdout) + norm_actual = _normalize_g2grep(actual_stdout) + + stdout_ok = norm_actual == norm_expected + exit_ok = actual_exit == expected_exit + # stderr comparison: only when fixture has explicit expected_stderr field. 
+ stderr_ok = (expected_stderr is None) or (actual_stderr == expected_stderr) + + if stdout_ok and exit_ok and stderr_ok: + print(f" PASS [{cat}] {fid}") + elif args.update: + rec["expected_stdout"] = actual_stdout + rec["expected_exit_code"] = actual_exit + # Update expected_stderr only if the fixture already had the field. + if expected_stderr is not None: + rec["expected_stderr"] = actual_stderr + rec["elapsed_ms_observed"] = rec.get("elapsed_ms_observed", 0) + stderr_changed = (expected_stderr is not None) and (actual_stderr != expected_stderr) + print( + f" UPDATE [{cat}] {fid} (exit: {expected_exit}→{actual_exit}," + f" stdout_changed={not stdout_ok}, stderr_changed={stderr_changed})" + ) + updated.append(fid) + else: + msg_parts = [] + if not exit_ok: + msg_parts.append(f"exit expected={expected_exit} actual={actual_exit}") + if not stdout_ok: + diff = _diff_summary(norm_expected, norm_actual) + msg_parts.append(f"stdout mismatch (G2-GREP normalized):\n{diff}") + if not stderr_ok: + diff = _diff_summary(expected_stderr or "", actual_stderr) + msg_parts.append(f"stderr mismatch:\n{diff}") + print(f" FAIL [{cat}] {fid}:", file=sys.stderr) + for m in msg_parts: + print(f" {m}", file=sys.stderr) + failures.append(fid) + + if args.update and updated: + with FIXTURE_PATH.open("w", encoding="utf-8") as f: + for rec in records: + f.write(json.dumps(rec, ensure_ascii=False) + "\n") + print(f"\nUpdated {len(updated)} fixture(s) in {FIXTURE_PATH}") + return 0 + + total = len(records) + passed = total - len(failures) + print(f"\n{passed}/{total} fixtures passed") + + if failures: + print(f"FAILED: {failures}", file=sys.stderr) + return 1 + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/regression/test_pr1_tokenizer_baseline.py b/tests/regression/test_pr1_tokenizer_baseline.py new file mode 100644 index 0000000..57afe7c --- /dev/null +++ b/tests/regression/test_pr1_tokenizer_baseline.py @@ -0,0 +1,88 @@ +""" +PR-1 baseline: canonical 
_bm25.tokenize 와 4개 eval 사이트의 tokenize 결과 비교. + +발행 전: 4 사이트 모두 자체 정의 → delta 측정 +발행 후: 4 사이트 모두 canonical 호출 → 동일 (delta=0 또는 명시된 augmentation) +""" +import sys, re +sys.path.insert(0, '/Users/d9ng/privateProject/tunaCtx') +sys.path.insert(0, '/Users/d9ng/privateProject/tunaCtx/src/hooks') +from _bm25.tokenizer import tokenize as canonical_tokenize + + +SAMPLE_CORPUS = [ + "BM25 retrieval improves Recall@7 from 0.169 to 0.634 — 3.7x improvement.", + "한국어 토크나이저는 조사 분리(은/는/이/가/을/를)를 수행한다", + "iter 11 final R@5=0.595 (Flask 0.6462 / FastAPI 0.3870 / Requests 0.7526)", + "fix(hooks): Windows TCP loopback fallback for AF_UNIX-less CPython", + "decomposing bm25-memory.py into 11 sub-modules with 82 unit tests", +] +SAMPLE_QUERIES = [ + "BM25 recall improvement", + "한국어 검색 개선", + "Windows fallback hook", +] + +def site_g1_docs(text): + """Original tokenize from benchmarks/eval/g1_docs_bm25_eval.py:78""" + tokens = re.findall(r'\d+[-–]\d+|\d+\.\d+|\w+', text.lower()) + return [t for t in tokens if t] + +def site_g1_longterm(text): + """Original nested tokenize from g1_longterm_baseline_eval.py:267""" + return re.findall(r'\b\w+\b', text.lower()) + +KO_PARTICLES = re.compile(r'(와|과|이|가|은|는|을|를|의|에서|으로|에게|부터|까지|처럼|같이|보다|이나|며|에|로|도|만|나|고)$') +def site_g2_paraphrase(text): + """Original tokenize from g2_docs_paraphrase_eval.py:325""" + raw = re.findall(r'\d+[-–]\d+|\d+\.\d+|\w+', text.lower()) + result = [] + for tok in raw: + cleaned = KO_PARTICLES.sub('', tok) + if cleaned and cleaned != tok: + result.append(cleaned) + result.append(tok) + return list(dict.fromkeys(result)) + +def site_bm25_retriever(text): + """Original from src/retrieval/bm25_retriever.py:16 — identifier-only""" + raw = re.findall(r'[a-zA-Z_][a-zA-Z0-9_]*', text.lower()) + return [t for t in raw if len(t) > 1] + +def compare(name, fn, drop_stopwords=False): + """Compare site fn vs canonical.""" + diffs = 0 + additions = 0 # canonical produces more tokens (e.g. 
stem) + losses = 0 # canonical produces fewer tokens + for text in SAMPLE_CORPUS + SAMPLE_QUERIES: + site_tokens = set(fn(text)) + canon_tokens = set(canonical_tokenize(text, drop_stopwords=drop_stopwords)) + if site_tokens != canon_tokens: + diffs += 1 + additions += len(canon_tokens - site_tokens) + losses += len(site_tokens - canon_tokens) + print(f" {name}: {diffs}/{len(SAMPLE_CORPUS)+len(SAMPLE_QUERIES)} samples differ | canonical_adds={additions}, canonical_loses={losses}") + +print("=== PR-1 baseline: site vs canonical tokenize delta ===\n") +print("[g1_docs_bm25_eval.py:78 vs canonical]") +compare("g1_docs", site_g1_docs) +print("\n[g1_longterm_baseline_eval.py:267 vs canonical]") +compare("g1_longterm", site_g1_longterm) +print("\n[g2_docs_paraphrase_eval.py:325 vs canonical]") +compare("g2_paraphrase", site_g2_paraphrase) +print("\n[bm25_retriever.py:16 vs canonical (identifier post-filter)]") +def canonical_id_filtered(text): + # canonical + identifier post-filter (keep tokens matching identifier shape, len>1) + return [t for t in canonical_tokenize(text, drop_stopwords=False) if re.fullmatch(r'[a-zA-Z_][a-zA-Z0-9_]*', t) and len(t) > 1] +diffs = 0 +for text in SAMPLE_CORPUS: + site = set(site_bm25_retriever(text)) + canon = set(canonical_id_filtered(text)) + if site != canon: + diffs += 1 + miss = site - canon; add = canon - site + if miss or add: + print(f" sample: site_only={list(miss)[:5]}, canonical_only={list(add)[:5]}") +print(f" bm25_retriever: {diffs}/{len(SAMPLE_CORPUS)} samples differ from canonical+id-filter") + +print("\n=== Baseline complete ===") diff --git a/tests/regression/test_pr3_deterministic_sort.py b/tests/regression/test_pr3_deterministic_sort.py new file mode 100644 index 0000000..92335bd --- /dev/null +++ b/tests/regression/test_pr3_deterministic_sort.py @@ -0,0 +1,114 @@ +""" +PR-3 regression: ranker.py 3 sort sites are deterministic under shuffled inputs. 
+ +Verifies that equal-score items return in a stable order regardless of input +ordering. Without explicit tiebreak, Python's stable sort preserves input +order — meaning the same equal-score items in different input orders would +produce different output orders. + +Sites covered: + - dense_rank_decisions (L49) + - rrf_merge (L79) + - bm25_rank_decisions (L153) +""" +import sys +from pathlib import Path +import random + +PROJ = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(PROJ)) +sys.path.insert(0, str(PROJ / 'src/hooks')) + +from _bm25.ranker import rrf_merge, dense_rank_decisions # noqa: E402 + + +def _items(n, prefix='c'): + return [{'hash': f'{prefix}{i:03d}', 'text': f'item {i}', 'emb': []} for i in range(n)] + + +def test_rrf_merge_idempotent_same_input(): + """Same input → same output across repeat calls (no hidden randomness).""" + a = _items(20, 'a') + b = _items(20, 'b') + keys1 = [it['hash'] for it in rrf_merge(a, b)] + keys2 = [it['hash'] for it in rrf_merge(a, b)] + assert keys1 == keys2, f"rrf_merge non-idempotent: {keys1[:5]} vs {keys2[:5]}" + + +def test_rrf_merge_equal_rank_tiebreak_independent_of_list_input_order(): + """Items with identical RRF rank in both lists must order by hash — + independent of whether item X or item Y was inserted first into list_a. 
+ + This is the bug that hash tiebreak fixes: previously dict-insertion + order leaked into the output, so swapping list_a/list_b position of + equal-rank items would shuffle the result.""" + # Both items rank 1 in list_a, rank 1 in list_b → identical RRF + a1 = [{'hash': 'zzz_late', 'text': 'z'}] + b1 = [{'hash': 'aaa_early', 'text': 'a'}] + a2 = [{'hash': 'aaa_early', 'text': 'a'}] + b2 = [{'hash': 'zzz_late', 'text': 'z'}] + # Different list_a content → different dict insertion order + keys1 = [it['hash'] for it in rrf_merge(a1, b1)] + keys2 = [it['hash'] for it in rrf_merge(a2, b2)] + assert keys1 == keys2 == ['aaa_early', 'zzz_late'], ( + f"hash tiebreak failed:\n case1={keys1}\n case2={keys2}" + ) + + +def test_rrf_merge_equal_score_tiebreak_is_hash(): + """Items with identical RRF scores (appearing at same rank in both lists) + must order by hash key, not insertion order.""" + a = [{'hash': 'z_high', 'text': 'z'}, {'hash': 'a_low', 'text': 'a'}] + b = [{'hash': 'a_low', 'text': 'a'}, {'hash': 'z_high', 'text': 'z'}] + # Both items appear at rank 1 in one list and rank 2 in the other → equal RRF + out = rrf_merge(a, b) + keys = [it['hash'] for it in out] + # With tiebreak by hash ascending: 'a_low' < 'z_high' + assert keys == ['a_low', 'z_high'], f"hash tiebreak failed: got {keys}" + + +def test_dense_rank_decisions_no_emb_returns_empty(): + """Sanity check: vec-daemon down → []""" + corpus = _items(5) + # No emb in items, _vec_embed will likely return None → [] + result = dense_rank_decisions(corpus, "any query") + assert result == [] or all('hash' in it for it in result) + + +def test_bm25_rank_idx_tiebreak_via_orchestrator(): + """bm25_rank_decisions L153 tiebreak: equal scores → index ascending. + + Build a corpus where two entries are byte-identical except for index; + they get identical BM25 scores → tiebreak should pick lower index first. 
+ """ + from _bm25.ranker import bm25_rank_decisions, HAS_BM25 + if not HAS_BM25: + return # skip when rank_bm25 unavailable + corpus = [ + {'hash': f'h{i}', 'subject': 'identical text', 'text': 'identical text body for bm25'} + for i in range(5) + ] + result = bm25_rank_decisions(corpus, 'identical bm25', top_k=5, min_score=0.0, + adaptive_floor_ratio=0.0, + mmr_jaccard_threshold=1.01, # disable MMR + skip_rerank=True) + # All items have identical bm25 score → tiebreak by index → h0..h4 in order + hashes = [it['hash'] for it in result] + # Expectation: ascending index — but MMR cluster_sig may dedup. + # If only 1 returned, MMR collapsed correctly. Otherwise must be sorted. + if len(hashes) > 1: + assert hashes == sorted(hashes), f"index tiebreak broken: {hashes}" + + +if __name__ == '__main__': + test_rrf_merge_idempotent_same_input() + print("PASS: rrf_merge idempotent") + test_rrf_merge_equal_rank_tiebreak_independent_of_list_input_order() + print("PASS: rrf_merge equal-rank tiebreak independent of input order") + test_rrf_merge_equal_score_tiebreak_is_hash() + print("PASS: rrf_merge equal-score tiebreak by hash") + test_dense_rank_decisions_no_emb_returns_empty() + print("PASS: dense_rank_decisions no-emb sanity") + test_bm25_rank_idx_tiebreak_via_orchestrator() + print("PASS: bm25_rank_decisions index tiebreak") + print("\nAll PR-3 regression tests passed.") diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py new file mode 100644 index 0000000..d43dded --- /dev/null +++ b/tests/unit/conftest.py @@ -0,0 +1,121 @@ +"""conftest.py — shared fixtures for CTX unit tests. 
+ +Fixtures: + tmp_home — isolated home directory (simulates ~/.claude/) + tmp_project — isolated project directory with .omc/ structure + settings_path — path to a writable settings.json inside tmp_home + isolated_env — os.environ copy with HOME overridden to tmp_home +""" +from __future__ import annotations + +import json +import os +import subprocess +import sys +from pathlib import Path + +import pytest + + +# ─── Core fixtures ─────────────────────────────────────────────── + + +@pytest.fixture() +def tmp_home(tmp_path: Path) -> Path: + """An isolated home directory with ~/.claude/ structure. + + Use this instead of the real HOME to avoid touching the user's actual + ~/.claude/settings.json during tests. + """ + home = tmp_path / "home" + claude_dir = home / ".claude" + claude_dir.mkdir(parents=True) + (claude_dir / "hooks").mkdir() + return home + + +@pytest.fixture() +def settings_path(tmp_home: Path) -> Path: + """Path to settings.json inside the isolated tmp_home (file does not exist yet).""" + return tmp_home / ".claude" / "settings.json" + + +@pytest.fixture() +def tmp_project(tmp_path: Path) -> Path: + """An isolated project directory with .omc/ structure. + + Used for bm25-memory cache tests: provides a git repo-like structure + without touching the actual CTX working tree. + """ + project = tmp_path / "project" + project.mkdir() + (project / ".omc").mkdir() + + # Initialise a minimal git repo so git commands don't fail. + subprocess.run(["git", "init", str(project)], capture_output=True, check=True) + subprocess.run( + ["git", "-C", str(project), "config", "user.email", "test@test.com"], + capture_output=True, check=True, + ) + subprocess.run( + ["git", "-C", str(project), "config", "user.name", "Test User"], + capture_output=True, check=True, + ) + # Create an initial commit so HEAD resolves. 
+ init_file = project / "README.md" + init_file.write_text("# test repo") + subprocess.run(["git", "-C", str(project), "add", "."], capture_output=True, check=True) + subprocess.run( + ["git", "-C", str(project), "commit", "-m", "feat: initial commit"], + capture_output=True, check=True, + ) + return project + + +@pytest.fixture() +def isolated_env(tmp_home: Path) -> dict: + """A copy of os.environ with HOME set to the isolated tmp_home. + + Also clears variables that would make hooks touch the real system: + - CHAT_MEMORY_EXCLUDED_PROJECTS + - CTX_TELEMETRY / CTX_AB_DISABLE + """ + env = os.environ.copy() + env["HOME"] = str(tmp_home) + # Clear potentially interfering vars + for var in ( + "CHAT_MEMORY_EXCLUDED_PROJECTS", + "CHAT_MEMORY_SCOPE", + "CHAT_MEMORY_EXTRA_PROJECTS", + "CTX_TELEMETRY", + "CTX_AB_DISABLE", + "CTX_DISABLE_SEMANTIC_RERANK", + "CHAT_MEMORY_GLOBAL_FALLBACK", + ): + env.pop(var, None) + return env + + +# ─── Helper functions ───────────────────────────────────────────── + + +def run_hook(hook_path: str, stdin_data: dict, env: dict, timeout: int = 10) -> subprocess.CompletedProcess: + """Run a hook script via subprocess with JSON stdin. + + Args: + hook_path: Absolute path to the hook .py file. + stdin_data: Dict that gets JSON-encoded as stdin. + env: Environment variables dict (use `isolated_env` fixture). + timeout: Max seconds to wait (default 10). + + Returns: + CompletedProcess with stdout, stderr, returncode. + """ + return subprocess.run( + [sys.executable, hook_path], + input=json.dumps(stdin_data), + capture_output=True, + text=True, + env=env, + timeout=timeout, + ) diff --git a/tests/unit/test_bm25_init_reexport.py b/tests/unit/test_bm25_init_reexport.py new file mode 100644 index 0000000..f14d78d --- /dev/null +++ b/tests/unit/test_bm25_init_reexport.py @@ -0,0 +1,137 @@ +"""Unit tests for _bm25 package-level re-exports. + +Verifies that: + - All names in __all__ are importable from the package root. 
+ - Key functions are callable after import. + - Module-level state variables are intentionally NOT re-exported. + - No circular import occurs when importing the package. +""" +from __future__ import annotations + +import importlib +import sys +from pathlib import Path + +import pytest + +# Ensure the hooks directory is on sys.path (mirrors how bm25-memory.py operates). +_HOOKS_DIR = str(Path(__file__).parents[2] / "src" / "hooks") +if _HOOKS_DIR not in sys.path: + sys.path.insert(0, _HOOKS_DIR) + + +# ─── helpers ───────────────────────────────────────────────────────── + + +def _fresh_import(module_name: str): + """Import (or re-import) a module, bypassing the cache. + + Forces a clean import to test that the package-level __init__ triggers + no import-time errors on a cold load. + """ + for key in list(sys.modules.keys()): + if key == module_name or key.startswith(module_name + "."): + del sys.modules[key] + return importlib.import_module(module_name) + + +# ─── tests: basic importability ────────────────────────────────────── + + +def test_reexport_tokenize_callable(): + """tokenize must be importable and callable from package root.""" + from _bm25 import tokenize + assert callable(tokenize), "tokenize must be callable" + result = tokenize("hello world") + assert isinstance(result, list) + + +def test_reexport_score_corpus_bm25_callable(): + """score_corpus_bm25 must be importable and callable from package root.""" + from _bm25 import score_corpus_bm25 + assert callable(score_corpus_bm25), "score_corpus_bm25 must be callable" + + +def test_reexport_bm25_rank_decisions_callable(): + """bm25_rank_decisions must be importable and callable.""" + from _bm25 import bm25_rank_decisions + assert callable(bm25_rank_decisions) + + +# ─── tests: __all__ completeness ──────────────────────────────────── + + +def test_reexport_all_listed_functions(): + """Every name in __all__ must be importable from _bm25 and be callable.""" + import _bm25 + assert hasattr(_bm25, 
def test_no_circular_import():
    """A cold import of ``_bm25`` must not fail with ImportError or RecursionError."""
    import_failures = (ImportError, RecursionError)
    try:
        _fresh_import("_bm25")
    except import_failures as err:
        # Fail with a descriptive message rather than letting the raw error through.
        pytest.fail(f"Circular or broken import detected: {err}")
def test_submodule_import_still_works():
    """Importing by explicit submodule path must keep working.

    Each statement raises ImportError, and so fails the test, if the
    submodule or the named attribute is missing.
    """
    # tokenizer
    from _bm25.tokenizer import tokenize  # noqa: F401
    from _bm25.tokenizer import expand_query_tokens  # noqa: F401
    # ranker
    from _bm25.ranker import score_corpus_bm25  # noqa: F401
    from _bm25.ranker import rrf_merge  # noqa: F401
    # corpus / output
    from _bm25.corpus import get_decision_corpus  # noqa: F401
    from _bm25.output import emit_output  # noqa: F401
def _run_hook(project_dir: Path, env: dict, timeout: int = HOOK_TIMEOUT) -> subprocess.CompletedProcess:
    """Invoke bm25-memory.py with ``--rich`` against ``project_dir``.

    The project directory is passed three ways: the payload ``cwd`` field,
    the ``CLAUDE_PROJECT_DIR`` variable and the child's working directory.
    The caller's ``env`` mapping is copied, not mutated.
    """
    project = str(project_dir)
    hook_input = json.dumps({
        "prompt": "BM25 decisions recently?",
        "cwd": project,
    })
    child_env = dict(env)
    # NOTE(review): the hook is understood to resolve its root from
    # CLAUDE_PROJECT_DIR with a fallback to os.getcwd(); confirm against the hook.
    child_env["CLAUDE_PROJECT_DIR"] = project
    command = [_PYTHON, HOOK_PATH, "--rich"]
    return subprocess.run(
        command,
        input=hook_input,
        env=child_env,
        cwd=project,
        timeout=timeout,
        text=True,
        capture_output=True,
    )
+ (project_dir / "change.txt").write_text(f"change at {time.time()}") + subprocess.run(["git", "-C", str(project_dir), "add", "."], capture_output=True, check=True) + subprocess.run( + ["git", "-C", str(project_dir), "commit", "-m", message], + capture_output=True, check=True, + ) + result = subprocess.run( + ["git", "-C", str(project_dir), "rev-parse", "HEAD"], + capture_output=True, text=True, check=True, + ) + return result.stdout.strip() + + +def _read_cache(project_dir: Path) -> dict | None: + """Return parsed cache or None if missing/corrupt.""" + cache_path = project_dir / ".omc" / "decision_corpus.json" + if not cache_path.exists(): + return None + try: + return json.loads(cache_path.read_text()) + except json.JSONDecodeError: + return None + + +def _get_head(project_dir: Path) -> str: + result = subprocess.run( + ["git", "-C", str(project_dir), "rev-parse", "HEAD"], + capture_output=True, text=True, check=True, + ) + return result.stdout.strip() + + +# ─── fixtures ──────────────────────────────────────────────────── + + +@pytest.fixture() +def hook_env(tmp_home, tmp_project): + """Env for running bm25-memory in an isolated context.""" + env = os.environ.copy() + env["HOME"] = str(tmp_home) + env["CTX_DASHBOARD_INTERNAL"] = "1" # suppress telemetry writes + env.pop("CTX_TELEMETRY", None) + env.pop("CTX_AB_DISABLE", None) + return env + + +# ─── tests: cache path ──────────────────────────────────────────── + + +def test_cache_path_under_omc(tmp_project, hook_env): + """Cache must be written to /.omc/decision_corpus.json (regression guard). + + This test will FAIL if Task A changes the cache path, giving Task C/D + a clear signal to update accordingly. + """ + _run_hook(tmp_project, hook_env) + cache_path = tmp_project / ".omc" / "decision_corpus.json" + # The hook may not write the cache if there are no decision commits, + # but the path itself must be the canonical one. 
def test_cache_hit_when_head_same(tmp_project, hook_env):
    """When HEAD hasn't changed, the cache file must not be rewritten.

    tmp_project already has 'feat: initial commit', so no extra commit is needed.

    The cache's mtime is backdated before the second run rather than
    sleeping. A rewrite then always produces a different timestamp, even on
    filesystems with one- or two-second mtime resolution.
    """
    # Run once to warm the cache (tmp_project's initial commit is a decision commit).
    _run_hook(tmp_project, hook_env)

    cache_path = tmp_project / ".omc" / "decision_corpus.json"
    assert cache_path.exists(), (
        "Cache was not written after first hook run. "
        "Check CLAUDE_PROJECT_DIR injection or _is_decision() logic."
    )

    # Push the timestamp 10 s into the past, then read back what the
    # filesystem actually stored (it may round the value).
    backdated_ns = cache_path.stat().st_mtime_ns - 10 * 1_000_000_000
    os.utime(cache_path, ns=(backdated_ns, backdated_ns))
    mtime_first = cache_path.stat().st_mtime_ns

    # Run hook again WITHOUT any new commit; HEAD is unchanged.
    _run_hook(tmp_project, hook_env)

    mtime_second = cache_path.stat().st_mtime_ns
    assert mtime_first == mtime_second, (
        "Cache should not be rewritten when HEAD is unchanged "
        f"(mtime before={mtime_first}, after={mtime_second})"
    )
def test_hook_runs_in_non_git_directory(tmp_path, hook_env):
    """Hook must not crash when the project directory is not a git repository.

    The shared ``_run_hook`` helper is used so the child really runs against
    ``non_git``: it sets the working directory, ``CLAUDE_PROJECT_DIR`` and
    the payload ``cwd`` to that path. Without them the hook inherits pytest's
    working directory, which is normally the git checkout under test.
    """
    non_git = tmp_path / "not-a-git-repo"
    non_git.mkdir()

    result = _run_hook(non_git, hook_env)
    assert result.returncode == 0, (
        f"Hook crashed in non-git dir (exit {result.returncode}).\n"
        f"stderr: {result.stderr[:500]}"
    )
    assert "Traceback" not in result.stderr
def _run_hook(
    project_dir: Path,
    env: dict,
    prompt: str = "BM25 test query",
    extra_args: list[str] | None = None,
    timeout: int = HOOK_TIMEOUT,
) -> subprocess.CompletedProcess:
    """Invoke bm25-memory.py with ``--rich`` plus optional extra arguments.

    ``project_dir`` is supplied as the payload ``cwd``, as
    ``CLAUDE_PROJECT_DIR`` and as the child's working directory. The
    caller's ``env`` mapping is copied, not mutated.
    """
    project = str(project_dir)
    hook_input = json.dumps({
        "prompt": prompt,
        "session_id": "test-session",
        "cwd": project,
    })
    child_env = dict(env)
    child_env["CLAUDE_PROJECT_DIR"] = project
    command = [_PYTHON, HOOK_PATH, "--rich", *(extra_args or [])]
    return subprocess.run(
        command,
        input=hook_input,
        env=child_env,
        cwd=project,
        timeout=timeout,
        text=True,
        capture_output=True,
    )
tmp_home / ".claude" / "ctx-telemetry.jsonl" + if not log_path.exists(): + return [] + lines = [] + for raw in log_path.read_text(encoding="utf-8").strip().splitlines(): + try: + lines.append(json.loads(raw)) + except json.JSONDecodeError: + pass + return lines + + +def _count_before(tmp_home: Path) -> int: + return len(_jsonl_lines(tmp_home)) + + +def _new_lines(tmp_home: Path, before: int) -> list[dict]: + return _jsonl_lines(tmp_home)[before:] + + +def _build_env(tmp_home: Path, telemetry: bool = False) -> dict: + """Build isolated env. Never touches real ~/.claude/.""" + env = os.environ.copy() + env["HOME"] = str(tmp_home) + env["CTX_DASHBOARD_INTERNAL"] = "0" # allow telemetry to fire + # Clear potentially interfering vars + for var in ( + "CTX_TELEMETRY", "CTX_AB_DISABLE", "CTX_DISABLE_SEMANTIC_RERANK", + "CHAT_MEMORY_EXCLUDED_PROJECTS", "CHAT_MEMORY_SCOPE", + ): + env.pop(var, None) + if telemetry: + env["CTX_TELEMETRY"] = "1" + return env + + +def _init_git_project(project_dir: Path) -> None: + """Minimal git repo with one decision commit so G1 fires.""" + subprocess.run(["git", "init", str(project_dir)], capture_output=True, check=True) + subprocess.run( + ["git", "-C", str(project_dir), "config", "user.email", "t@t.com"], + capture_output=True, check=True, + ) + subprocess.run( + ["git", "-C", str(project_dir), "config", "user.name", "Test"], + capture_output=True, check=True, + ) + (project_dir / "README.md").write_text("# test") + subprocess.run(["git", "-C", str(project_dir), "add", "."], capture_output=True, check=True) + subprocess.run( + ["git", "-C", str(project_dir), "commit", "-m", "feat: initial BM25 decision"], + capture_output=True, check=True, + ) + + +# ─── fixtures ──────────────────────────────────────────────────────────────── + + +@pytest.fixture() +def ctx_home(tmp_path: Path) -> Path: + """Isolated home directory for telemetry tests.""" + home = tmp_path / "home" + claude_dir = home / ".claude" + claude_dir.mkdir(parents=True) + # 
def test_telemetry_disabled_no_jsonl_append(ctx_home, ctx_project):
    """When CTX_TELEMETRY is not set, no lines are appended to the jsonl log.

    Only the parsed line count is compared, so the check holds whether or
    not the log file existed before the hook ran.
    """
    env = _build_env(ctx_home, telemetry=False)

    before = _count_before(ctx_home)
    result = _run_hook(ctx_project, env)
    assert result.returncode == 0, f"Hook crashed: {result.stderr[:300]}"

    after = len(_jsonl_lines(ctx_home))
    assert after == before, (
        f"Expected no new lines when telemetry disabled, got {after - before} new lines."
    )
def test_telemetry_enabled_emits_fallback_reason(ctx_home, ctx_project):
    """With CTX_DISABLE_SEMANTIC_RERANK=1 the hook_complete event must report 'vec_daemon_down'."""
    env = _build_env(ctx_home, telemetry=True)
    env["CTX_DISABLE_SEMANTIC_RERANK"] = "1"

    # Remember how many events were already logged so only new ones are inspected.
    baseline = len(_jsonl_lines(ctx_home))
    result = _run_hook(ctx_project, env)
    assert result.returncode == 0, f"Hook crashed: {result.stderr[:300]}"

    fresh_events = _jsonl_lines(ctx_home)[baseline:]
    ev = next((e for e in fresh_events if e.get("type") == "hook_complete"), None)
    assert ev is not None, "No hook_complete event found."

    fallback = ev.get("fallback_reasons", "")
    assert "vec_daemon_down" in fallback, (
        f"Expected 'vec_daemon_down' in fallback_reasons, got: {fallback!r}\nEvent: {ev}"
    )
def test_telemetry_latency_overhead_under_5ms(ctx_home, ctx_project):
    """Latency overhead of enabled vs disabled telemetry must be ≤5ms (10-run average).

    Enabled and disabled runs are interleaved so that slow drift in the
    machine's state during the test affects both averages equally instead
    of being attributed to telemetry.
    """
    N = 10
    env_on = _build_env(ctx_home, telemetry=True)
    env_off = _build_env(ctx_home, telemetry=False)

    def measure(env: dict) -> float:
        """Return the wall-clock duration of one hook run in milliseconds."""
        t0 = time.perf_counter()
        r = _run_hook(ctx_project, env)
        elapsed = (time.perf_counter() - t0) * 1000
        assert r.returncode == 0, f"Hook crashed: {r.stderr[:200]}"
        return elapsed

    # Warm up (one run each to avoid cold-start skew)
    measure(env_on)
    measure(env_off)

    times_on: list[float] = []
    times_off: list[float] = []
    for _ in range(N):
        # Alternate the two configurations rather than timing them in batches.
        times_on.append(measure(env_on))
        times_off.append(measure(env_off))

    avg_on = sum(times_on) / N
    avg_off = sum(times_off) / N
    overhead = avg_on - avg_off

    # Allow up to 5ms overhead (very conservative — actual overhead ≤1ms).
    # Full subprocess round-trip is 300-600ms; 5ms is <2% relative overhead.
    assert overhead <= 5.0, (
        f"Telemetry overhead too high: {overhead:.1f}ms "
        f"(enabled={avg_on:.1f}ms, disabled={avg_off:.1f}ms, N={N})"
    )
@pytest.fixture()
def base_env(tmp_home):
    """Isolated environment for chat-memory runs, with HOME set to ``tmp_home``.

    ``tmp_home`` is freshly created, and the tests in this module expect the
    vault database and vec-daemon socket under
    ``HOME/.local/share/claude-vault``. Redirecting HOME is therefore what
    makes both appear absent; no separate vault setting is needed.
    """
    env = os.environ.copy()
    env["HOME"] = str(tmp_home)
    # Clear interference vars.
    for var in (
        "CHAT_MEMORY_EXCLUDED_PROJECTS",
        "CHAT_MEMORY_SCOPE",
        "CHAT_MEMORY_EXTRA_PROJECTS",
        "CHAT_MEMORY_GLOBAL_FALLBACK",
        "CTX_TELEMETRY",
        "CTX_AB_DISABLE",
    ):
        env.pop(var, None)
    return env
def test_chat_memory_no_vault_db_outputs_nothing(base_env, tmp_home):
    """Without a vault.db the hook must exit 0 and leave stdout empty (no injection)."""
    hook_input = {
        "prompt": "Tell me about recent BM25 decisions",
        "cwd": str(tmp_home),
    }
    completed = _run_hook(hook_input, base_env)
    assert completed.returncode == 0

    # Whitespace-only output counts as "nothing injected".
    stdout = completed.stdout.strip()
    assert not stdout, (
        f"Hook unexpectedly produced output without a vault.db:\n{stdout[:300]}"
    )
def test_chat_memory_handles_truncated_stdin(base_env):
    """Truncated JSON (incomplete) → graceful exit (0), no traceback.

    Applies the same two checks as the malformed-stdin test: a truncated
    document is also unparseable JSON, so it should be handled just as quietly.
    """
    result = _run_hook('{"prompt": "BM25 decisions"', base_env)  # missing closing }
    assert result.returncode == 0, (
        f"Hook should exit 0 on truncated stdin, got {result.returncode}.\n"
        f"stderr: {result.stderr[:500]}"
    )
    # Must not print Python traceback.
    assert "Traceback" not in result.stderr, (
        "Unexpected traceback on truncated stdin:\n" + result.stderr[:500]
    )
def test_chat_memory_no_crash_on_missing_sqlite_vec(base_env):
    """If sqlite_vec is not importable, hook must exit 0 with a warning — no traceback.

    A stub ``sqlite_vec.py`` that raises ImportError is placed first on
    PYTHONPATH so it shadows any installed copy. Any PYTHONPATH already in
    the environment is kept after it, so the hook's other imports resolve
    exactly as they do in the remaining tests.
    """
    import tempfile

    with tempfile.TemporaryDirectory() as td:
        fake_mod = Path(td) / "sqlite_vec.py"
        fake_mod.write_text("raise ImportError('no sqlite_vec for test')\n")

        # Prepend rather than replace: earlier PYTHONPATH entries win.
        inherited = base_env.get("PYTHONPATH")
        python_path = os.pathsep.join([td, inherited]) if inherited else td
        env = {**base_env, "PYTHONPATH": python_path}

        result = _run_hook(
            {"prompt": "What BM25 decisions did we make about retrieval scoring?"},
            env,
        )
    # Must exit cleanly (graceful fallback to BM25-only mode).
    assert result.returncode == 0, (
        f"Hook crashed (exit {result.returncode}) when sqlite_vec is missing.\n"
        f"stderr: {result.stderr[:500]}"
    )
    # Must emit the ⚠ warning to stderr (not silently swallow the import error).
    assert "sqlite_vec missing" in result.stderr, (
        "Expected '⚠ sqlite_vec missing' warning in stderr, got:\n"
        + result.stderr[:500]
    )
    # Must not print a Python traceback.
    assert "Traceback" not in result.stderr, (
        "Unexpected Python traceback when sqlite_vec is missing:\n"
        + result.stderr[:500]
    )
+""" +from __future__ import annotations + +import subprocess +import sys +from pathlib import Path +from unittest.mock import patch, MagicMock + +import pytest + +# Ensure src package is importable regardless of PYTHONPATH +_ROOT = Path(__file__).parent.parent.parent +sys.path.insert(0, str(_ROOT / "src" / "hooks")) + +from _bm25.code_search import search_files_by_grep + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_grep_stdout(entries: list[tuple[str, int]]) -> str: + """Build fake `git grep -c` stdout from (path, count) pairs.""" + return "\n".join(f"{path}:{count}" for path, count in entries) + "\n" + + +def _run_with_mock_grep(entries: list[tuple[str, int]], keywords: list[str], limit: int = 5) -> list[str]: + """Run search_files_by_grep with mocked subprocess output.""" + fake_stdout = _make_grep_stdout(entries) + mock_result = MagicMock() + mock_result.returncode = 0 + mock_result.stdout = fake_stdout + + with patch("subprocess.run", return_value=mock_result): + return search_files_by_grep("/fake/project", keywords, limit=limit) + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + +class TestSearchFilesByGrepSort: + """search_files_by_grep returns deterministic (-count, path) order.""" + + def test_higher_count_ranks_first(self): + """Files with more matches appear before files with fewer matches.""" + entries = [ + ("src/retrieval/adaptive_trigger.py", 3), + ("src/hooks/bm25-memory.py", 10), + ("tests/unit/test_bm25.py", 1), + ] + result = _run_with_mock_grep(entries, ["bm25"]) + assert result[0] == "src/hooks/bm25-memory.py" + assert result[1] == "src/retrieval/adaptive_trigger.py" + assert result[2] == "tests/unit/test_bm25.py" + + def test_ties_broken_by_path_lexicographic(self): + """When two files 
have the same count, alphabetically earlier path ranks first.""" + entries = [ + ("src/zoo.py", 5), + ("src/alpha.py", 5), + ("src/middle.py", 5), + ] + result = _run_with_mock_grep(entries, ["bm25"]) + assert result == ["src/alpha.py", "src/middle.py", "src/zoo.py"] + + def test_mixed_count_and_tie(self): + """Combined: primary sort by count desc, secondary by path asc.""" + entries = [ + ("b/high2.py", 10), + ("a/high1.py", 10), + ("c/low.py", 2), + ("a/mid.py", 5), + ] + result = _run_with_mock_grep(entries, ["search"], limit=10) + assert result[0] == "a/high1.py" # count=10, alpha first + assert result[1] == "b/high2.py" # count=10, alpha second + assert result[2] == "a/mid.py" # count=5 + assert result[3] == "c/low.py" # count=2 + + def test_deterministic_repeated_calls(self): + """Five consecutive calls with the same input return the same result.""" + entries = [ + ("src/z_file.py", 4), + ("src/a_file.py", 4), + ("src/m_file.py", 7), + ("src/b_file.py", 4), + ] + first = _run_with_mock_grep(entries, ["token"]) + for _ in range(4): + subsequent = _run_with_mock_grep(entries, ["token"]) + assert subsequent == first, "Results must be identical across calls" + + def test_limit_respected(self): + """Result length does not exceed the requested limit.""" + entries = [(f"src/file{i}.py", i) for i in range(1, 11)] + result = _run_with_mock_grep(entries, ["bm25"], limit=3) + assert len(result) == 3 + + def test_empty_grep_returns_empty(self): + """If grep returns nothing, result is an empty list.""" + mock_result = MagicMock() + mock_result.returncode = 1 # non-zero → no matches + mock_result.stdout = "" + with patch("subprocess.run", return_value=mock_result): + result = search_files_by_grep("/fake/project", ["bm25"]) + assert result == [] + + def test_short_keywords_filtered(self): + """Keywords shorter than 4 chars are ignored (long_kws filter).""" + mock_result = MagicMock() + mock_result.returncode = 0 + mock_result.stdout = "" + with patch("subprocess.run") as 
mock_run: + result = search_files_by_grep("/fake/project", ["ab", "x"]) + # subprocess.run should NOT be called because no long keywords + mock_run.assert_not_called() + assert result == [] diff --git a/tests/unit/test_install_cli.py b/tests/unit/test_install_cli.py new file mode 100644 index 0000000..eb9cb6f --- /dev/null +++ b/tests/unit/test_install_cli.py @@ -0,0 +1,636 @@ +"""Unit tests for src/cli/install.py. + +Coverage targets: + - _new_hooks_block() produces correct event/matcher structure + - UserPromptSubmit hooks all present + - PostToolUse has Grep matcher for g2-fallback + - --dry-run flag: no file changes + - Install to empty/missing settings.json + - Install merges with pre-existing hooks from other tools + - Idempotent install (two runs, no duplicate entries) + - Uninstall removes only CTX hooks +""" +from __future__ import annotations + +import json +import os +import subprocess +import sys +from pathlib import Path +from unittest.mock import patch, MagicMock + +import pytest + +# Allow import without package install. 
+sys.path.insert(0, str(Path(__file__).parents[2] / "src" / "cli")) +import install as install_mod +from install import _new_hooks_block, _hook_entry, CTX_HOOKS +from settings_patcher import patch_settings, unpatch_settings + + +# ─── helpers ───────────────────────────────────────────────────── + + +def _all_commands_in_block(block: dict) -> list[str]: + """Flatten all command strings from a hooks block.""" + cmds = [] + for entries in block.values(): + for entry in entries: + for hook in entry.get("hooks", []): + cmd = hook.get("command", "") + if cmd: + cmds.append(cmd) + return cmds + + +def _all_commands_in_settings(settings: dict) -> list[str]: + return _all_commands_in_block(settings.get("hooks", {})) + + +# ─── tests: _new_hooks_block ───────────────────────────────────── + + +def test_new_hooks_block_includes_all_required(): + """All 5 CTX hooks must appear in _new_hooks_block().""" + block = _new_hooks_block() + cmds = _all_commands_in_block(block) + # Every hook filename from CTX_HOOKS must appear in some command. 
+ for spec in CTX_HOOKS: + fname = spec[0] + assert any(fname in cmd for cmd in cmds), ( + f"Hook '{fname}' missing from _new_hooks_block()" + ) + + +def test_new_hooks_block_user_prompt_submit_events(): + """UserPromptSubmit entries must cover chat-memory, bm25-memory, memory-keyword-trigger.""" + block = _new_hooks_block() + ups_entries = block.get("UserPromptSubmit", []) + ups_cmds = [ + hook["command"] + for entry in ups_entries + for hook in entry.get("hooks", []) + ] + for fname in ["chat-memory.py", "bm25-memory.py", "memory-keyword-trigger.py"]: + assert any(fname in cmd for cmd in ups_cmds), ( + f"'{fname}' missing from UserPromptSubmit entries" + ) + + +def test_new_hooks_block_post_tool_use_grep_matcher(): + """g2-fallback.py must be registered under PostToolUse with matcher='Grep'.""" + block = _new_hooks_block() + post_entries = block.get("PostToolUse", []) + grep_entries = [e for e in post_entries if e.get("matcher") == "Grep"] + assert len(grep_entries) > 0, "No Grep matcher entry in PostToolUse" + + grep_cmds = [ + hook["command"] + for entry in grep_entries + for hook in entry.get("hooks", []) + ] + assert any("g2-fallback.py" in cmd for cmd in grep_cmds), ( + "g2-fallback.py not found in Grep matcher group" + ) + + +def test_new_hooks_block_bm25_includes_rich_flag(): + """bm25-memory.py command must include the '--rich' argument.""" + block = _new_hooks_block() + cmds = _all_commands_in_block(block) + bm25_cmds = [c for c in cmds if "bm25-memory.py" in c] + assert len(bm25_cmds) > 0 + assert any("--rich" in cmd for cmd in bm25_cmds), ( + "bm25-memory.py must include '--rich' flag" + ) + + +def test_hook_entry_command_format(): + """_hook_entry returns dict with correct 'type' and 'command' keys.""" + entry = _hook_entry("chat-memory.py") + assert entry["type"] == "command" + assert "chat-memory.py" in entry["command"] + assert "$HOME/.claude/hooks/" in entry["command"] + + +def test_hook_entry_with_extra_args(): + """_hook_entry with extra_args 
appends them to the command.""" + entry = _hook_entry("bm25-memory.py", ["--rich"]) + assert "--rich" in entry["command"] + + +# ─── tests: install to empty/missing settings ───────────────────── + + +def test_install_to_empty_settings(tmp_path): + """patch_settings on a non-existent file creates it with all CTX hooks.""" + settings_file = tmp_path / "settings.json" + block = _new_hooks_block() + result = patch_settings(settings_file, block) + assert result.ok + assert len(result.added) > 0 + saved = json.loads(settings_file.read_text()) + cmds = _all_commands_in_settings(saved) + for spec in CTX_HOOKS: + fname = spec[0] + assert any(fname in cmd for cmd in cmds), f"'{fname}' missing from saved settings" + + +def test_install_merges_with_existing_hooks(tmp_path): + """Install should preserve hooks from other tools already in settings.json.""" + settings_file = tmp_path / "settings.json" + existing = { + "hooks": { + "UserPromptSubmit": [ + {"hooks": [{"type": "command", "command": "python3 /other/foreign-hook.py"}]} + ] + } + } + settings_file.write_text(json.dumps(existing), encoding="utf-8") + + block = _new_hooks_block() + result = patch_settings(settings_file, block) + assert result.ok + + saved = json.loads(settings_file.read_text()) + cmds = _all_commands_in_settings(saved) + assert "python3 /other/foreign-hook.py" in cmds, "Foreign hook must be preserved" + # CTX hooks must also be present. 
+ for spec in CTX_HOOKS: + fname = spec[0] + assert any(fname in cmd for cmd in cmds), f"'{fname}' missing after merge" + + +def test_install_idempotent(tmp_path): + """Running patch_settings twice produces no duplicate commands.""" + settings_file = tmp_path / "settings.json" + block = _new_hooks_block() + + patch_settings(settings_file, block) # first install + result2 = patch_settings(settings_file, block) # second install + + assert result2.ok + assert len(result2.added) == 0, "Second install should add nothing" + assert len(result2.skipped) > 0, "Second install should skip all CTX hooks" + + saved = json.loads(settings_file.read_text()) + cmds = _all_commands_in_settings(saved) + assert len(cmds) == len(set(cmds)), "Duplicate commands detected after second install" + + +# ─── tests: dry_run flag via cmd_install ───────────────────────── + + +def test_dry_run_prints_summary_no_write(tmp_path): + """cmd_install with --dry-run writes nothing but prints a summary.""" + settings_file = tmp_path / "settings.json" + + # Patch all step functions that touch the file system. + with ( + patch.object(install_mod, "CLAUDE_SETTINGS", settings_file), + patch.object(install_mod, "CLAUDE_HOOKS_DIR", tmp_path / ".claude" / "hooks"), + patch("install.step_copy_hooks", return_value=(3, 0, 0, [])), + patch("install.step_copy_daemons", return_value=(0, 0, [])), + patch("install.step_verify_hooks_present", return_value=(True, ["chat-memory.py"], [])), + patch("install.step_smoke_test", return_value=(True, "smoke OK")), + ): + import argparse + args = argparse.Namespace(dry_run=True, uninstall=False, command=None, + force_hooks=False, no_update_hooks=False) + rc = install_mod.cmd_install(args) + + # Dry run always returns 0 and must not create the settings file. 
+ assert rc == 0 + assert not settings_file.exists(), "dry_run must not create settings.json" + + +# ─── tests: uninstall ───────────────────────────────────────────── + + +def test_uninstall_removes_ctx_hooks_only(tmp_path): + """cmd_uninstall removes CTX-registered hooks and leaves foreign hooks intact.""" + settings_file = tmp_path / "settings.json" + # Build a settings file with CTX hooks + a foreign hook. + block = _new_hooks_block() + settings = {"hooks": {}} + for event, entries in block.items(): + settings["hooks"].setdefault(event, []).extend(entries) + settings["hooks"].setdefault("UserPromptSubmit", []).append( + {"hooks": [{"type": "command", "command": "python3 /foreign/tool.py"}]} + ) + settings_file.write_text(json.dumps(settings), encoding="utf-8") + + # Build remove list the same way cmd_uninstall does. + remove = [] + for spec in CTX_HOOKS: + filename = spec[0] + extra = spec[3] if len(spec) >= 4 else None + remove.append(_hook_entry(filename, extra)["command"]) + + result = unpatch_settings(settings_file, remove) + assert result.ok + + saved = json.loads(settings_file.read_text()) + cmds = _all_commands_in_settings(saved) + # Foreign hook must survive. + assert "python3 /foreign/tool.py" in cmds + # All CTX hooks must be gone. 
+ for cmd in remove: + assert cmd not in cmds, f"CTX hook still present after uninstall: {cmd}" + + +# ─── tests: step functions ─────────────────────────────────────── + + +def test_step_copy_hooks_no_package_returns_error(): + """step_copy_hooks returns (0, 0, 0, [error]) when package hooks dir not found.""" + with patch("install._pkg_hooks_dir", return_value=None): + copied, updated, skipped, errors = install_mod.step_copy_hooks() + assert copied == 0 + assert updated == 0 + assert skipped == 0 + assert len(errors) == 1 + assert "ctx-retriever" in errors[0].lower() or "not found" in errors[0].lower() + + +def test_step_copy_hooks_dry_run_counts_but_no_write(tmp_path): + """step_copy_hooks with dry_run=True counts files but copies nothing.""" + # Create a fake source dir with a hook file. + fake_src = tmp_path / "pkg_hooks" + fake_src.mkdir() + (fake_src / "chat-memory.py").write_text("# fake hook") + + fake_dst = tmp_path / "claude_hooks" + fake_dst.mkdir() + + with ( + patch("install._pkg_hooks_dir", return_value=fake_src), + patch.object(install_mod, "CLAUDE_HOOKS_DIR", fake_dst), + ): + copied, updated, skipped, errors = install_mod.step_copy_hooks(dry_run=True) + + assert copied > 0 + assert errors == [] + # In dry_run, no file should exist in fake_dst. + assert not (fake_dst / "chat-memory.py").exists() + + +def test_step_copy_hooks_skips_already_present(tmp_path): + """step_copy_hooks with identical content reports unchanged (skipped) and does not copy.""" + fake_src = tmp_path / "pkg_hooks" + fake_src.mkdir() + content = "# fake bm25 — identical content" + (fake_src / "bm25-memory.py").write_text(content) + + fake_dst = tmp_path / "claude_hooks" + fake_dst.mkdir() + # Pre-create the destination file with identical content (same hash → unchanged). 
+ (fake_dst / "bm25-memory.py").write_text(content) + + with ( + patch("install._pkg_hooks_dir", return_value=fake_src), + patch.object(install_mod, "CLAUDE_HOOKS_DIR", fake_dst), + ): + copied, updated, skipped, errors = install_mod.step_copy_hooks() + + assert skipped > 0 + assert copied == 0 + assert updated == 0 + + +def test_step_copy_hooks_updates_changed_file(tmp_path): + """step_copy_hooks updates an existing file when hash differs (creates backup).""" + fake_src = tmp_path / "pkg_hooks" + fake_src.mkdir() + (fake_src / "bm25-memory.py").write_text("# NEW VERSION") + + fake_dst = tmp_path / "claude_hooks" + fake_dst.mkdir() + # Pre-create the destination file with different content. + (fake_dst / "bm25-memory.py").write_text("# OLD VERSION") + + with ( + patch("install._pkg_hooks_dir", return_value=fake_src), + patch.object(install_mod, "CLAUDE_HOOKS_DIR", fake_dst), + ): + copied, updated, skipped, errors = install_mod.step_copy_hooks() + + assert updated > 0 + assert copied == 0 + assert errors == [] + # Destination should now have new content. + assert (fake_dst / "bm25-memory.py").read_text() == "# NEW VERSION" + # A backup should have been created. + backups = list(fake_dst.glob("bm25-memory.backup_*.py")) + assert len(backups) == 1 + assert backups[0].read_text() == "# OLD VERSION" + + +def test_step_copy_hooks_no_update_skips_changed(tmp_path): + """--no-update-hooks skips even when hash differs.""" + fake_src = tmp_path / "pkg_hooks" + fake_src.mkdir() + (fake_src / "bm25-memory.py").write_text("# NEW VERSION") + + fake_dst = tmp_path / "claude_hooks" + fake_dst.mkdir() + (fake_dst / "bm25-memory.py").write_text("# OLD VERSION") + + with ( + patch("install._pkg_hooks_dir", return_value=fake_src), + patch.object(install_mod, "CLAUDE_HOOKS_DIR", fake_dst), + ): + copied, updated, skipped, errors = install_mod.step_copy_hooks(no_update=True) + + assert skipped > 0 + assert updated == 0 + # File must remain unchanged. 
+ assert (fake_dst / "bm25-memory.py").read_text() == "# OLD VERSION" + + +def test_step_copy_hooks_force_overwrites(tmp_path): + """--force-hooks overwrites even when hash is identical.""" + fake_src = tmp_path / "pkg_hooks" + fake_src.mkdir() + content = "# same content" + (fake_src / "bm25-memory.py").write_text(content) + + fake_dst = tmp_path / "claude_hooks" + fake_dst.mkdir() + (fake_dst / "bm25-memory.py").write_text(content) + + with ( + patch("install._pkg_hooks_dir", return_value=fake_src), + patch.object(install_mod, "CLAUDE_HOOKS_DIR", fake_dst), + ): + copied, updated, skipped, errors = install_mod.step_copy_hooks(force=True) + + # force=True → "updated" path even if hashes match + assert updated > 0 + + +def test_step_verify_hooks_present_detects_missing(tmp_path): + """step_verify_hooks_present reports missing files correctly.""" + fake_hooks = tmp_path / "hooks" + fake_hooks.mkdir() + # Put only ONE hook file; others will be missing. + (fake_hooks / "chat-memory.py").write_text("# present") + + with patch.object(install_mod, "CLAUDE_HOOKS_DIR", fake_hooks): + ok, found, missing = install_mod.step_verify_hooks_present() + + assert not ok + assert "chat-memory.py" in found + assert len(missing) > 0 + + +def test_step_verify_hooks_present_all_ok(tmp_path): + """step_verify_hooks_present returns True when all hooks are in place.""" + fake_hooks = tmp_path / "hooks" + fake_hooks.mkdir() + for spec in CTX_HOOKS: + (fake_hooks / spec[0]).write_text("# stub") + + with patch.object(install_mod, "CLAUDE_HOOKS_DIR", fake_hooks): + ok, found, missing = install_mod.step_verify_hooks_present() + + assert ok + assert missing == [] + + +def test_step_smoke_test_missing_hook(tmp_path): + """step_smoke_test returns False when bm25-memory.py is missing.""" + fake_hooks = tmp_path / "hooks" + fake_hooks.mkdir() + # Deliberately do NOT create bm25-memory.py. 
+ + with patch.object(install_mod, "CLAUDE_HOOKS_DIR", fake_hooks): + ok, msg = install_mod.step_smoke_test() + + assert not ok + assert "missing" in msg.lower() or "bm25" in msg.lower() + + +def test_cmd_status_runs_without_crash(tmp_path, capsys): + """cmd_status completes without raising; printed output is non-empty.""" + fake_settings = tmp_path / "settings.json" + fake_settings.write_text(json.dumps({"hooks": {}}), encoding="utf-8") + fake_hooks = tmp_path / "hooks" + fake_hooks.mkdir() + fake_vault = tmp_path / "claude-vault" + fake_vault.mkdir() + + with ( + patch.object(install_mod, "CLAUDE_SETTINGS", fake_settings), + patch.object(install_mod, "CLAUDE_HOOKS_DIR", fake_hooks), + patch.object(install_mod, "CLAUDE_VAULT_DIR", fake_vault), + ): + import argparse + args = argparse.Namespace(dry_run=False, uninstall=False, command="status") + rc = install_mod.cmd_status(args) + + assert rc == 0 + captured = capsys.readouterr() + assert "status" in captured.out.lower() or "hooks" in captured.out.lower() + + +def test_cmd_uninstall_dry_run(tmp_path, capsys): + """cmd_uninstall with --dry-run does not modify settings.json.""" + settings_file = tmp_path / "settings.json" + block = _new_hooks_block() + settings = {"hooks": {}} + for event, entries in block.items(): + settings["hooks"].setdefault(event, []).extend(entries) + settings_file.write_text(json.dumps(settings), encoding="utf-8") + mtime_before = settings_file.stat().st_mtime + + import time; time.sleep(0.05) + with patch.object(install_mod, "CLAUDE_SETTINGS", settings_file): + import argparse + args = argparse.Namespace(dry_run=True, uninstall=True, command=None) + rc = install_mod.cmd_uninstall(args) + + assert rc == 0 + assert settings_file.stat().st_mtime == mtime_before + + +def test_step_copy_daemons_no_package(): + """step_copy_daemons returns (0, 0, []) when package hooks dir not found.""" + with patch("install._pkg_hooks_dir", return_value=None): + copied, skipped, errors = 
install_mod.step_copy_daemons() + assert copied == 0 + assert skipped == 0 + assert errors == [] # non-fatal: daemons are optional + + +def test_step_copy_daemons_dry_run(tmp_path): + """step_copy_daemons with dry_run=True counts would-copy but doesn't write.""" + fake_src = tmp_path / "pkg" + fake_src.mkdir() + (fake_src / "vec-daemon.py").write_text("# fake vec-daemon") + + fake_vault = tmp_path / "claude-vault" + fake_vault.mkdir() + + with ( + patch("install._pkg_hooks_dir", return_value=fake_src), + patch.object(install_mod, "CLAUDE_VAULT_DIR", fake_vault), + ): + copied, skipped, errors = install_mod.step_copy_daemons(dry_run=True) + + assert copied > 0 + assert errors == [] + assert not (fake_vault / "vec-daemon.py").exists() + + +def test_step_copy_daemons_actual_copy(tmp_path): + """step_copy_daemons actually copies daemon files when dry_run=False.""" + fake_src = tmp_path / "pkg" + fake_src.mkdir() + (fake_src / "vec-daemon.py").write_text("# fake vec-daemon") + + fake_vault = tmp_path / "claude-vault" + fake_vault.mkdir() + + with ( + patch("install._pkg_hooks_dir", return_value=fake_src), + patch.object(install_mod, "CLAUDE_VAULT_DIR", fake_vault), + ): + copied, skipped, errors = install_mod.step_copy_daemons(dry_run=False) + + assert copied > 0 + assert errors == [] + assert (fake_vault / "vec-daemon.py").exists() + + +def test_step_copy_daemons_skips_existing(tmp_path): + """step_copy_daemons skips daemons that already exist in vault dir.""" + fake_src = tmp_path / "pkg" + fake_src.mkdir() + (fake_src / "vec-daemon.py").write_text("# new") + + fake_vault = tmp_path / "claude-vault" + fake_vault.mkdir() + (fake_vault / "vec-daemon.py").write_text("# already there") + + with ( + patch("install._pkg_hooks_dir", return_value=fake_src), + patch.object(install_mod, "CLAUDE_VAULT_DIR", fake_vault), + ): + copied, skipped, errors = install_mod.step_copy_daemons() + + assert skipped > 0 + assert copied == 0 + + +def test_step_copy_hooks_actual_copy(tmp_path): 
+ """step_copy_hooks copies a hook file and sets it executable.""" + fake_src = tmp_path / "pkg" + fake_src.mkdir() + (fake_src / "chat-memory.py").write_text("#!/usr/bin/env python3\n# hook") + + fake_dst = tmp_path / "hooks" + fake_dst.mkdir() + + with ( + patch("install._pkg_hooks_dir", return_value=fake_src), + patch.object(install_mod, "CLAUDE_HOOKS_DIR", fake_dst), + ): + copied, updated, skipped, errors = install_mod.step_copy_hooks(dry_run=False) + + assert copied > 0 + assert errors == [] + dst_file = fake_dst / "chat-memory.py" + assert dst_file.exists() + # Check executable bit is set. + assert dst_file.stat().st_mode & 0o111 + + +def test_cmd_install_smoke_test_fail_returns_4(tmp_path, capsys): + """cmd_install returns exit code 4 when smoke test fails.""" + settings_file = tmp_path / "settings.json" + fake_hooks = tmp_path / "hooks" + fake_hooks.mkdir() + fake_vault = tmp_path / "claude-vault" + fake_vault.mkdir() + + with ( + patch.object(install_mod, "CLAUDE_SETTINGS", settings_file), + patch.object(install_mod, "CLAUDE_HOOKS_DIR", fake_hooks), + patch.object(install_mod, "CLAUDE_VAULT_DIR", fake_vault), + patch("install.step_copy_hooks", return_value=(2, 0, 0, [])), + patch("install.step_copy_daemons", return_value=(0, 0, [])), + patch("install.step_verify_hooks_present", return_value=(True, list(spec[0] for spec in CTX_HOOKS), [])), + patch("install.step_smoke_test", return_value=(False, "bm25-memory.py missing")), + ): + import argparse + args = argparse.Namespace(dry_run=False, uninstall=False, command=None, + force_hooks=False, no_update_hooks=False) + rc = install_mod.cmd_install(args) + + assert rc == 4 + + +def test_cmd_install_hook_copy_failure_returns_2(tmp_path, capsys): + """cmd_install returns exit code 2 when hook copy errors out.""" + settings_file = tmp_path / "settings.json" + fake_hooks = tmp_path / "hooks" + fake_hooks.mkdir() + + with ( + patch.object(install_mod, "CLAUDE_SETTINGS", settings_file), + patch.object(install_mod, 
"CLAUDE_HOOKS_DIR", fake_hooks), + patch("install.step_copy_hooks", return_value=(0, 0, 0, ["copy chat-memory.py: Permission denied"])), + patch("install.step_copy_daemons", return_value=(0, 0, [])), + patch("install.step_verify_hooks_present", return_value=(True, [], [])), + ): + import argparse + args = argparse.Namespace(dry_run=False, uninstall=False, command=None, + force_hooks=False, no_update_hooks=False) + rc = install_mod.cmd_install(args) + + assert rc == 2 + + +def test_cmd_install_missing_hooks_after_copy_returns_2(tmp_path, capsys): + """cmd_install returns 2 when hooks are still missing after copy step.""" + settings_file = tmp_path / "settings.json" + fake_hooks = tmp_path / "hooks" + fake_hooks.mkdir() + + with ( + patch.object(install_mod, "CLAUDE_SETTINGS", settings_file), + patch.object(install_mod, "CLAUDE_HOOKS_DIR", fake_hooks), + patch("install.step_copy_hooks", return_value=(0, 0, 0, [])), + patch("install.step_copy_daemons", return_value=(0, 0, [])), + patch("install.step_verify_hooks_present", return_value=(False, [], ["bm25-memory.py"])), + ): + import argparse + args = argparse.Namespace(dry_run=False, uninstall=False, command=None, + force_hooks=False, no_update_hooks=False) + rc = install_mod.cmd_install(args) + + assert rc == 2 + + +def test_cmd_install_success_returns_0(tmp_path, capsys): + """cmd_install returns 0 on full success.""" + settings_file = tmp_path / "settings.json" + fake_hooks = tmp_path / "hooks" + fake_hooks.mkdir() + + with ( + patch.object(install_mod, "CLAUDE_SETTINGS", settings_file), + patch.object(install_mod, "CLAUDE_HOOKS_DIR", fake_hooks), + patch.object(install_mod, "CLAUDE_VAULT_DIR", tmp_path / "vault"), + patch("install.step_copy_hooks", return_value=(5, 0, 0, [])), + patch("install.step_copy_daemons", return_value=(2, 0, [])), + patch("install.step_verify_hooks_present", return_value=(True, list(spec[0] for spec in CTX_HOOKS), [])), + patch("install.step_smoke_test", return_value=(True, "hook fired 
OK")), + ): + import argparse + args = argparse.Namespace(dry_run=False, uninstall=False, command=None, + force_hooks=False, no_update_hooks=False) + rc = install_mod.cmd_install(args) + + assert rc == 0 diff --git a/tests/unit/test_settings_patcher.py b/tests/unit/test_settings_patcher.py new file mode 100644 index 0000000..a0e9e91 --- /dev/null +++ b/tests/unit/test_settings_patcher.py @@ -0,0 +1,441 @@ +"""Unit tests for src/cli/settings_patcher.py. + +Coverage targets: + - atomic write (temp + os.replace) + - timestamped backup creation + - idempotency (two-patch dedup) + - dry-run (no write) + - unpatch removes only specified commands + - corrupted JSON handled as {} + - partial-write safety +""" +from __future__ import annotations + +import json +import os +import sys +import time +from pathlib import Path +from unittest.mock import patch, MagicMock + +import pytest + +# Allow import of src/cli/settings_patcher.py without package install. +sys.path.insert(0, str(Path(__file__).parents[2] / "src" / "cli")) +from settings_patcher import ( + patch_settings, + unpatch_settings, + _load, + _save_atomic, + _cmd_in_settings, + PatchResult, +) + + +# ─── fixtures ──────────────────────────────────────────────────── + + +def _minimal_hooks_block() -> dict: + """A small hooks dict that mirrors the format install.py produces.""" + return { + "UserPromptSubmit": [ + {"hooks": [{"type": "command", "command": "python3 $HOME/.claude/hooks/chat-memory.py"}]}, + {"hooks": [{"type": "command", "command": "python3 $HOME/.claude/hooks/bm25-memory.py --rich"}]}, + ], + "PostToolUse": [ + {"matcher": "Grep", "hooks": [{"type": "command", "command": "python3 $HOME/.claude/hooks/g2-fallback.py"}]}, + ], + } + + +def _other_tool_hooks_block() -> dict: + """Simulates an existing hook from a completely different tool.""" + return { + "UserPromptSubmit": [ + {"hooks": [{"type": "command", "command": "python3 /some/other/tool/hook.py"}]}, + ], + } + + +# ─── tests: _load 
──────────────────────────────────────────────── + + +def test_load_missing_file_returns_empty(tmp_path): + p = tmp_path / "nonexistent.json" + result = _load(p) + assert result == {} + + +def test_load_valid_json(tmp_path): + p = tmp_path / "settings.json" + p.write_text('{"hooks": {}}', encoding="utf-8") + assert _load(p) == {"hooks": {}} + + +def test_load_corrupted_json_returns_empty(tmp_path): + p = tmp_path / "settings.json" + p.write_text("{ this is: broken json }", encoding="utf-8") + result = _load(p) + assert result == {} + + +def test_load_empty_file_returns_empty(tmp_path): + p = tmp_path / "settings.json" + p.write_text("", encoding="utf-8") + result = _load(p) + assert result == {} + + +# ─── tests: _save_atomic ───────────────────────────────────────── + + +def test_atomic_write_temp_then_replace(tmp_path): + """Atomic write: temp file should not persist; final file should contain data.""" + p = tmp_path / "settings.json" + data = {"hooks": {"UserPromptSubmit": []}} + + _save_atomic(p, data) + + # Final file must exist and be valid JSON. + assert p.exists() + assert json.loads(p.read_text()) == data + + # No .tmp_ctx residual file should remain. + tmp_file = p.with_suffix(".tmp_ctx") + assert not tmp_file.exists() + + +def test_atomic_write_uses_os_replace(tmp_path): + """Verify os.replace is called (not shutil.move or direct write).""" + p = tmp_path / "settings.json" + data = {"test": True} + + with patch("settings_patcher.os.replace", wraps=os.replace) as mock_replace: + _save_atomic(p, data) + assert mock_replace.called, "os.replace should be called for atomic rename" + + +def test_backup_created_with_timestamp(tmp_path): + """When file already exists, a timestamped backup is created.""" + p = tmp_path / "settings.json" + original_data = {"original": True} + p.write_text(json.dumps(original_data), encoding="utf-8") + + new_data = {"updated": True} + _save_atomic(p, new_data) + + # Find backup files in the directory. 
+ backups = list(tmp_path.glob("*.backup_*.json")) + assert len(backups) == 1, f"Expected 1 backup, found {len(backups)}: {backups}" + # Backup should contain the original content. + backup_content = json.loads(backups[0].read_text()) + assert backup_content == original_data + + +def test_no_backup_when_file_missing(tmp_path): + """No backup when settings.json doesn't exist yet.""" + p = tmp_path / "settings.json" + _save_atomic(p, {"new": True}) + backups = list(tmp_path.glob("*.backup_*.json")) + assert len(backups) == 0 + + +def test_save_atomic_returns_empty_string_for_new_file(tmp_path): + """_save_atomic returns '' (not a path) when creating a brand-new file.""" + p = tmp_path / "settings.json" + # File must NOT exist before the call. + assert not p.exists() + result = _save_atomic(p, {"key": "value"}) + assert result == "", f"Expected empty string for new file, got: {result!r}" + assert p.exists(), "File should have been created" + + +def test_save_atomic_returns_backup_path_for_existing_file(tmp_path): + """_save_atomic returns the backup path string when updating an existing file.""" + p = tmp_path / "settings.json" + p.write_text(json.dumps({"original": True}), encoding="utf-8") + result = _save_atomic(p, {"updated": True}) + assert result != "", "Expected backup path for existing file, got empty string" + assert result.endswith(".json") + backup = Path(result) + assert backup.exists(), f"Backup file should exist at {backup}" + assert json.loads(backup.read_text()) == {"original": True} + + +def test_parent_dirs_created_if_missing(tmp_path): + """_save_atomic creates parent directories if they don't exist.""" + p = tmp_path / "deep" / "nested" / "settings.json" + _save_atomic(p, {"ok": True}) + assert p.exists() + + +# ─── tests: patch_settings ─────────────────────────────────────── + + +def test_patch_to_empty_settings(tmp_path): + """Patching when settings.json does not exist creates the file.""" + p = tmp_path / "settings.json" + hooks = 
_minimal_hooks_block() + result = patch_settings(p, hooks) + assert result.ok + assert len(result.added) > 0 + # File must now be valid JSON containing the hooks. + saved = json.loads(p.read_text()) + assert "hooks" in saved + + +def test_patch_preserves_other_hooks(tmp_path): + """Existing hooks from other tools are never removed.""" + p = tmp_path / "settings.json" + existing = {"hooks": _other_tool_hooks_block()} + p.write_text(json.dumps(existing), encoding="utf-8") + + result = patch_settings(p, _minimal_hooks_block()) + assert result.ok + + saved = json.loads(p.read_text()) + # Other tool's hook must still be present. + all_cmds = [ + hook.get("command") + for entries in saved["hooks"].values() + for entry in entries + for hook in entry.get("hooks", []) + ] + assert "python3 /some/other/tool/hook.py" in all_cmds + + +def test_idempotent_patch(tmp_path): + """Running patch_settings twice does not duplicate entries.""" + p = tmp_path / "settings.json" + hooks = _minimal_hooks_block() + + result1 = patch_settings(p, hooks) + result2 = patch_settings(p, hooks) + + assert result1.ok + assert result2.ok + # Second run: all entries should be in skipped, none in added. + assert len(result2.added) == 0 + assert len(result2.skipped) > 0 + + # Confirm no duplicate commands in the saved file. + saved = json.loads(p.read_text()) + all_cmds = [ + hook.get("command") + for entries in saved["hooks"].values() + for entry in entries + for hook in entry.get("hooks", []) + ] + assert len(all_cmds) == len(set(all_cmds)), "Duplicate hook commands found!" + + +def test_dry_run_no_write(tmp_path): + """dry_run=True: reports what would change but does not write any file.""" + p = tmp_path / "settings.json" + result = patch_settings(p, _minimal_hooks_block(), dry_run=True) + assert result.ok + # File must not have been created. + assert not p.exists(), "dry_run must not create the settings file" + # Result still reports what would be added. 
+ assert len(result.added) > 0 + + +def test_dry_run_existing_file_unchanged(tmp_path): + """dry_run=True on existing file leaves it byte-for-byte identical.""" + p = tmp_path / "settings.json" + original = {"hooks": {}, "other": "keep"} + p.write_text(json.dumps(original), encoding="utf-8") + original_mtime = p.stat().st_mtime + + # Small sleep to make mtime distinguishable if file is touched. + time.sleep(0.05) + patch_settings(p, _minimal_hooks_block(), dry_run=True) + + assert p.stat().st_mtime == original_mtime, "dry_run should not modify the file" + + +def test_corrupted_json_treated_as_empty(tmp_path): + """Corrupted settings.json is treated as {} — new hooks are added cleanly.""" + p = tmp_path / "settings.json" + p.write_text("NOT JSON AT ALL }{", encoding="utf-8") + + result = patch_settings(p, _minimal_hooks_block()) + assert result.ok + assert len(result.added) > 0 + # File should now be valid JSON. + saved = json.loads(p.read_text()) + assert "hooks" in saved + + +def test_post_tool_use_matcher_merge(tmp_path): + """PostToolUse entries with the same matcher are merged, not duplicated.""" + p = tmp_path / "settings.json" + # First install: adds a Grep-matcher entry. + hooks1 = { + "PostToolUse": [ + {"matcher": "Grep", "hooks": [{"type": "command", "command": "python3 /hook_a.py"}]}, + ] + } + # Second install: another Grep-matcher entry. + hooks2 = { + "PostToolUse": [ + {"matcher": "Grep", "hooks": [{"type": "command", "command": "python3 /hook_b.py"}]}, + ] + } + patch_settings(p, hooks1) + patch_settings(p, hooks2) + + saved = json.loads(p.read_text()) + post_tool_entries = saved["hooks"].get("PostToolUse", []) + grep_entries = [e for e in post_tool_entries if e.get("matcher") == "Grep"] + # Both hooks should be in the same matcher group, not two separate groups. 
+ all_grep_cmds = [h["command"] for e in grep_entries for h in e.get("hooks", [])] + assert "python3 /hook_a.py" in all_grep_cmds + assert "python3 /hook_b.py" in all_grep_cmds + + +# ─── tests: unpatch_settings ───────────────────────────────────── + + +def test_unpatch_removes_only_specified(tmp_path): + """unpatch_settings removes only the listed commands; others remain.""" + p = tmp_path / "settings.json" + # Build a settings file with CTX hooks + a foreign hook. + settings = { + "hooks": { + "UserPromptSubmit": [ + {"hooks": [{"type": "command", "command": "python3 $HOME/.claude/hooks/chat-memory.py"}]}, + {"hooks": [{"type": "command", "command": "python3 /other/tool.py"}]}, + ] + } + } + p.write_text(json.dumps(settings), encoding="utf-8") + + to_remove = ["python3 $HOME/.claude/hooks/chat-memory.py"] + result = unpatch_settings(p, to_remove) + assert result.ok + + saved = json.loads(p.read_text()) + all_cmds = [ + hook.get("command") + for entries in saved["hooks"].values() + for entry in entries + for hook in entry.get("hooks", []) + ] + assert "python3 $HOME/.claude/hooks/chat-memory.py" not in all_cmds, "Should have been removed" + assert "python3 /other/tool.py" in all_cmds, "Foreign hook should be preserved" + + +def test_unpatch_dry_run_no_write(tmp_path): + """unpatch dry_run: reports removals without modifying the file.""" + p = tmp_path / "settings.json" + settings = { + "hooks": { + "UserPromptSubmit": [ + {"hooks": [{"type": "command", "command": "python3 $HOME/.claude/hooks/chat-memory.py"}]}, + ] + } + } + p.write_text(json.dumps(settings), encoding="utf-8") + mtime_before = p.stat().st_mtime + + time.sleep(0.05) + result = unpatch_settings(p, ["python3 $HOME/.claude/hooks/chat-memory.py"], dry_run=True) + assert result.ok + assert p.stat().st_mtime == mtime_before, "dry_run unpatch must not modify the file" + + +def test_unpatch_nonexistent_command_is_not_found(tmp_path): + """Trying to unpatch a command that isn't present is not an error per 
se.""" + p = tmp_path / "settings.json" + settings = {"hooks": {}} + p.write_text(json.dumps(settings), encoding="utf-8") + + result = unpatch_settings(p, ["python3 /does-not-exist.py"]) + # Should succeed without crashing; removed list should be empty. + assert result.ok or result.error is not None # either is acceptable + assert len(result.added) == 0 # "added" in unpatch context = removed + + +# ─── tests: save_atomic partial-write safety ───────────────────── + + +def test_save_atomic_preserves_original_on_write_error(tmp_path): + """If the write raises an OSError, the original file must remain intact. + + NOTE: This test mocks os.replace to raise an error AFTER the tmp file + is written. The original content should be preserved because atomic + rename never completed. + """ + p = tmp_path / "settings.json" + original_content = {"original": "keep me"} + p.write_text(json.dumps(original_content), encoding="utf-8") + + def bad_replace(src, dst): + # Remove the temp file to simulate cleanup, then raise. + try: + os.unlink(src) + except FileNotFoundError: + pass + raise OSError("Simulated disk full") + + with patch("settings_patcher.os.replace", side_effect=bad_replace): + try: + _save_atomic(p, {"should": "not appear"}) + except OSError: + pass # Expected + + # Original must still be intact. + saved = json.loads(p.read_text()) + assert saved == original_content, "Original file must not be corrupted on write error" + + +def test_atomic_write_real_filesystem_rename(tmp_path): + """Real-disk atomic write: final file correct, no .tmp_ctx residual, backup present.""" + p = tmp_path / "settings.json" + original_data = {"original": True, "key": "old_value"} + p.write_text(json.dumps(original_data), encoding="utf-8") + + new_data = {"updated": True, "key": "new_value"} + backup_path_str = _save_atomic(p, new_data) + + # Final file must exist and contain the new data. 
+ assert p.exists(), "settings.json must exist after atomic write" + assert json.loads(p.read_text()) == new_data, "Final file must contain new data" + + # No .tmp_ctx residual file should remain (real rename must have completed). + tmp_residual = p.with_suffix(".tmp_ctx") + assert not tmp_residual.exists(), "Temp file must not remain after atomic rename" + + # A timestamped backup must have been created with original content. + assert backup_path_str, "Backup path must be non-empty for existing-file update" + backup = Path(backup_path_str) + assert backup.exists(), f"Backup file must exist at {backup}" + assert json.loads(backup.read_text()) == original_data, "Backup must contain original data" + + +def test_atomic_write_no_tmp_residual_on_new_file(tmp_path): + """For a new file (no prior content), no .tmp_ctx should remain after write.""" + p = tmp_path / "settings_new.json" + assert not p.exists() + + _save_atomic(p, {"brand": "new"}) + + assert p.exists() + tmp_residual = p.with_suffix(".tmp_ctx") + assert not tmp_residual.exists(), "Temp file must not remain after creating new file" + + +def test_atomic_write_backup_name_contains_timestamp(tmp_path): + """Backup filename must embed a timestamp (YYYYMMDD_HHMMSS pattern).""" + import re + p = tmp_path / "settings.json" + p.write_text(json.dumps({"v": 1}), encoding="utf-8") + + backup_path_str = _save_atomic(p, {"v": 2}) + + backup = Path(backup_path_str) + # e.g. settings.backup_20260505_123456.json + assert re.search(r"backup_\d{8}_\d{6}", backup.name), ( + f"Backup filename must contain timestamp: {backup.name}" + ) diff --git a/tests/unit/test_uninstall_cleanup.py b/tests/unit/test_uninstall_cleanup.py new file mode 100644 index 0000000..db006ae --- /dev/null +++ b/tests/unit/test_uninstall_cleanup.py @@ -0,0 +1,286 @@ +"""Unit tests for --uninstall file cleanup logic. 
+ +Covers _cleanup_hook_files() and cmd_uninstall() in src/cli/install.py: + - Normal uninstall: hook files and _bm25/ removed when hashes match. + - User-modified file: kept (not removed) unless --force. + - _bm25/ with extra files: kept unless --force. + - --force: removes everything regardless of hash. + - dry_run: nothing deleted. + - not_found: missing files reported cleanly. +""" +from __future__ import annotations + +import hashlib +import json +import shutil +import sys +from pathlib import Path +from types import SimpleNamespace +from unittest.mock import patch, MagicMock + +import pytest + +# Make install.py importable without package install. +_CLI_DIR = str(Path(__file__).parents[2] / "src" / "cli") +if _CLI_DIR not in sys.path: + sys.path.insert(0, _CLI_DIR) + +import install as _install + + +# ─── helpers ───────────────────────────────────────────────────────── + + +def _sha256_bytes(data: bytes) -> str: + return hashlib.sha256(data).hexdigest() + + +def _write(path: Path, content: bytes = b"# hook content") -> Path: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_bytes(content) + return path + + +def _make_args(**kwargs) -> SimpleNamespace: + defaults = {"dry_run": False, "force": False} + defaults.update(kwargs) + return SimpleNamespace(**defaults) + + +def _override_hooks_dir(monkeypatch, tmp_home: Path): + """Redirect CLAUDE_HOOKS_DIR to an isolated tmp location.""" + hooks_dir = tmp_home / ".claude" / "hooks" + hooks_dir.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(_install, "CLAUDE_HOOKS_DIR", hooks_dir) + monkeypatch.setattr(_install, "CLAUDE_SETTINGS", tmp_home / ".claude" / "settings.json") + return hooks_dir + + +# ─── tests: _cleanup_hook_files ────────────────────────────────────── + + +def test_cleanup_removes_matching_files(tmp_path, monkeypatch): + """Files with matching hash against package source are removed.""" + hooks_dir = _override_hooks_dir(monkeypatch, tmp_path) + + # Write a hook file whose content 
matches what we'll pretend is the source. + content = b"# canonical hook\n" + hook_file = hooks_dir / "chat-memory.py" + _write(hook_file, content) + + # Build a fake package source directory with the same file. + src_dir = tmp_path / "pkg_hooks" + src_dir.mkdir() + (src_dir / "chat-memory.py").write_bytes(content) + + # Redirect only the hook we care about for this test. + monkeypatch.setattr(_install, "CTX_HOOKS", [ + ("chat-memory.py", "UserPromptSubmit", False), + ]) + + with patch.object(_install, "_pkg_hooks_dir", return_value=src_dir): + result = _install._cleanup_hook_files(force=False, dry_run=False) + + assert "chat-memory.py" in result["removed"], "Matching-hash file must be removed" + assert not hook_file.exists(), "File must be deleted from disk" + + +def test_cleanup_keeps_user_modified_file(tmp_path, monkeypatch): + """User-modified files (hash mismatch) must NOT be deleted when force=False.""" + hooks_dir = _override_hooks_dir(monkeypatch, tmp_path) + + original_content = b"# original hook\n" + modified_content = b"# user modified this\n" + + hook_file = hooks_dir / "chat-memory.py" + _write(hook_file, modified_content) + + src_dir = tmp_path / "pkg_hooks" + src_dir.mkdir() + (src_dir / "chat-memory.py").write_bytes(original_content) + + monkeypatch.setattr(_install, "CTX_HOOKS", [ + ("chat-memory.py", "UserPromptSubmit", False), + ]) + + with patch.object(_install, "_pkg_hooks_dir", return_value=src_dir): + result = _install._cleanup_hook_files(force=False, dry_run=False) + + assert "chat-memory.py" in result["kept"], "Modified file must be kept" + assert hook_file.exists(), "Modified file must not be deleted" + assert "chat-memory.py" not in result["removed"] + + +def test_cleanup_force_removes_modified_file(tmp_path, monkeypatch): + """--force removes user-modified files without hash check.""" + hooks_dir = _override_hooks_dir(monkeypatch, tmp_path) + + hook_file = hooks_dir / "chat-memory.py" + _write(hook_file, b"# user modified\n") + + src_dir 
= tmp_path / "pkg_hooks" + src_dir.mkdir() + (src_dir / "chat-memory.py").write_bytes(b"# original\n") + + monkeypatch.setattr(_install, "CTX_HOOKS", [ + ("chat-memory.py", "UserPromptSubmit", False), + ]) + + with patch.object(_install, "_pkg_hooks_dir", return_value=src_dir): + result = _install._cleanup_hook_files(force=True, dry_run=False) + + assert "chat-memory.py" in result["removed"] + assert not hook_file.exists(), "--force must delete even user-modified files" + + +def test_cleanup_dry_run_does_not_delete(tmp_path, monkeypatch): + """dry_run=True must not actually delete any files.""" + hooks_dir = _override_hooks_dir(monkeypatch, tmp_path) + + content = b"# hook\n" + hook_file = hooks_dir / "chat-memory.py" + _write(hook_file, content) + + src_dir = tmp_path / "pkg_hooks" + src_dir.mkdir() + (src_dir / "chat-memory.py").write_bytes(content) + + monkeypatch.setattr(_install, "CTX_HOOKS", [ + ("chat-memory.py", "UserPromptSubmit", False), + ]) + + with patch.object(_install, "_pkg_hooks_dir", return_value=src_dir): + result = _install._cleanup_hook_files(force=False, dry_run=True) + + # Reported as removed but file must still exist. + assert "chat-memory.py" in result["removed"] + assert hook_file.exists(), "dry_run must not delete files" + + +def test_cleanup_not_found_reported(tmp_path, monkeypatch): + """Missing files are reported in 'not_found', not 'removed' or 'kept'.""" + hooks_dir = _override_hooks_dir(monkeypatch, tmp_path) + # Do not create any hook file. 
+ monkeypatch.setattr(_install, "CTX_HOOKS", [ + ("chat-memory.py", "UserPromptSubmit", False), + ]) + + src_dir = tmp_path / "pkg_hooks" + src_dir.mkdir() + + with patch.object(_install, "_pkg_hooks_dir", return_value=src_dir): + result = _install._cleanup_hook_files(force=False, dry_run=False) + + assert "chat-memory.py" in result["not_found"] + assert "chat-memory.py" not in result["removed"] + + +def test_cleanup_bm25_dir_removed_when_clean(tmp_path, monkeypatch): + """_bm25/ directory is removed when all files match package source.""" + hooks_dir = _override_hooks_dir(monkeypatch, tmp_path) + monkeypatch.setattr(_install, "CTX_HOOKS", []) + + content = b"# bm25 module\n" + bm25_dst = hooks_dir / "_bm25" + bm25_dst.mkdir() + (bm25_dst / "tokenizer.py").write_bytes(content) + + src_dir = tmp_path / "pkg_hooks" + src_bm25 = src_dir / "_bm25" + src_bm25.mkdir(parents=True) + (src_bm25 / "tokenizer.py").write_bytes(content) + + with patch.object(_install, "_pkg_hooks_dir", return_value=src_dir): + result = _install._cleanup_hook_files(force=False, dry_run=False) + + assert "_bm25/" in result["removed"] + assert not bm25_dst.exists(), "_bm25/ must be deleted when all files match" + + +def test_cleanup_bm25_dir_kept_when_extra_files(tmp_path, monkeypatch): + """_bm25/ with extra files (not from CTX) is kept when force=False.""" + hooks_dir = _override_hooks_dir(monkeypatch, tmp_path) + monkeypatch.setattr(_install, "CTX_HOOKS", []) + + content = b"# bm25 module\n" + bm25_dst = hooks_dir / "_bm25" + bm25_dst.mkdir() + (bm25_dst / "tokenizer.py").write_bytes(content) + (bm25_dst / "user_extra.py").write_bytes(b"# user added this\n") + + src_dir = tmp_path / "pkg_hooks" + src_bm25 = src_dir / "_bm25" + src_bm25.mkdir(parents=True) + (src_bm25 / "tokenizer.py").write_bytes(content) + + with patch.object(_install, "_pkg_hooks_dir", return_value=src_dir): + result = _install._cleanup_hook_files(force=False, dry_run=False) + + assert "_bm25/" in result["kept"], "_bm25/ with 
extra files must be kept" + assert bm25_dst.exists(), "_bm25/ must not be deleted when it has extra files" + + +def test_cleanup_bm25_dir_force_removes_with_extra_files(tmp_path, monkeypatch): + """--force removes _bm25/ even when extra files exist.""" + hooks_dir = _override_hooks_dir(monkeypatch, tmp_path) + monkeypatch.setattr(_install, "CTX_HOOKS", []) + + bm25_dst = hooks_dir / "_bm25" + bm25_dst.mkdir() + (bm25_dst / "user_extra.py").write_bytes(b"# user file\n") + + src_dir = tmp_path / "pkg_hooks" + src_bm25 = src_dir / "_bm25" + src_bm25.mkdir(parents=True) + + with patch.object(_install, "_pkg_hooks_dir", return_value=src_dir): + result = _install._cleanup_hook_files(force=True, dry_run=False) + + assert "_bm25/" in result["removed"] + assert not bm25_dst.exists(), "--force must remove _bm25/ unconditionally" + + +# ─── tests: cmd_uninstall integration ──────────────────────────────── + + +def test_cmd_uninstall_calls_cleanup(tmp_path, monkeypatch): + """cmd_uninstall must call _cleanup_hook_files (integration smoke test).""" + _override_hooks_dir(monkeypatch, tmp_path) + + # Mock unpatch_settings to succeed immediately. 
+ mock_result = MagicMock() + mock_result.ok = True + mock_result.summary.return_value = " removed 0 command(s)" + + with ( + patch.object(_install, "unpatch_settings", return_value=mock_result), + patch.object(_install, "_cleanup_hook_files", return_value={ + "removed": [], "kept": [], "not_found": [], + }) as mock_cleanup, + ): + args = _make_args() + ret = _install.cmd_uninstall(args) + + assert ret == 0 + mock_cleanup.assert_called_once_with(force=False, dry_run=False) + + +def test_cmd_uninstall_force_flag_passed(tmp_path, monkeypatch): + """--force flag is forwarded to _cleanup_hook_files.""" + _override_hooks_dir(monkeypatch, tmp_path) + + mock_result = MagicMock() + mock_result.ok = True + mock_result.summary.return_value = " removed 0" + + with ( + patch.object(_install, "unpatch_settings", return_value=mock_result), + patch.object(_install, "_cleanup_hook_files", return_value={ + "removed": [], "kept": [], "not_found": [], + }) as mock_cleanup, + ): + args = _make_args(force=True) + ret = _install.cmd_uninstall(args) + + assert ret == 0 + mock_cleanup.assert_called_once_with(force=True, dry_run=False)