18 changes: 12 additions & 6 deletions benchmarks/eval/g1_docs_bm25_eval.py
@@ -20,9 +20,21 @@
from typing import List, Optional, Tuple

import anthropic
import importlib.util
from rank_bm25 import BM25Okapi


# ── Canonical tokenizer (PR #3 unification) ────────────────────────────────
# The upstream BM25 tokenizer lives inside `src/hooks/bm25-memory.py` (a
# hyphenated filename rules out a normal `import`). Load it dynamically so eval
# pipelines share the exact production tokenization (`tokenize`).
_MONOLITH = Path(__file__).resolve().parents[2] / "src" / "hooks" / "bm25-memory.py"
_spec = importlib.util.spec_from_file_location("bm25_memory", _MONOLITH)
_bm25_memory = importlib.util.module_from_spec(_spec)
_spec.loader.exec_module(_bm25_memory)
tokenize = _bm25_memory.tokenize # noqa: E305 canonical entry point


# ──────────────────────────────────────────────────────────────────────────────
# QA Pairs (same as g1_docs_memory_eval.py)
# ──────────────────────────────────────────────────────────────────────────────
@@ -75,12 +87,6 @@
# Step 1: Build BM25 index over doc chunks
# ──────────────────────────────────────────────────────────────────────────────

def tokenize(text: str) -> List[str]:
    """Lowercase; preserve decimal numbers (0.724) and numeric ranges (7-30)."""
    tokens = re.findall(r'\d+[-\u2013]\d+|\d+\.\d+|\w+', text.lower())
    return [t for t in tokens if t]


def chunk_document(filename: str, content: str) -> List[str]:
    """Split a document by ## section headers. Each chunk = filename § header\ncontent."""
    chunks = []
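For context, a minimal sketch of what the dynamic loading added above does when run from `benchmarks/eval/`, assuming the repo layout in this PR and that the canonical `tokenize` behaves as the docstring of the deleted local copy describes (lowercasing, with decimals such as 0.724 and numeric ranges such as 7-30 kept as single tokens); the sample sentence and expected output are illustrative, not taken from the eval.

# Illustrative only; mirrors the loader introduced in this diff.
import importlib.util
from pathlib import Path

repo_root = Path(__file__).resolve().parents[2]            # assumes this script sits under benchmarks/eval/
monolith = repo_root / "src" / "hooks" / "bm25-memory.py"

spec = importlib.util.spec_from_file_location("bm25_memory", monolith)
bm25_memory = importlib.util.module_from_spec(spec)
spec.loader.exec_module(bm25_memory)

tokens = bm25_memory.tokenize("MRR improved to 0.724 over the 7-30 day window")
# Expected, if the canonical regex matches the removed local one:
# ['mrr', 'improved', 'to', '0.724', 'over', 'the', '7-30', 'day', 'window']
print(tokens)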
15 changes: 12 additions & 3 deletions benchmarks/eval/g1_longterm_baseline_eval.py
@@ -18,8 +18,19 @@
import subprocess
import sys
from pathlib import Path
import importlib.util
from typing import Dict, List, Optional, Tuple


# Canonical tokenizer (PR #3 unification): load `bm25-memory.py` dynamically
# (hyphen rules out a normal import) so eval and production share the exact
# same tokenize() implementation.
_MONOLITH = Path(__file__).resolve().parents[2] / "src" / "hooks" / "bm25-memory.py"
_spec = importlib.util.spec_from_file_location("bm25_memory", _MONOLITH)
_bm25_memory = importlib.util.module_from_spec(_spec)
_spec.loader.exec_module(_bm25_memory)
_canonical_tokenize = _bm25_memory.tokenize

# ── LLM client ───────────────────────────────────────────────────────────────

def get_llm_client():
@@ -264,9 +275,7 @@ def get_bm25_context(query: str, commit_corpus: List[Dict], top_k: int = 7) -> T
    if not commit_corpus:
        return "[Empty corpus]", 0

    def tokenize(text: str) -> List[str]:
        return re.findall(r'\b\w+\b', text.lower())

    tokenize = _canonical_tokenize  # PR #3: was local re.findall(r'\b\w+\b'); now canonical
    subjects = [c.get('subject', '') for c in commit_corpus]
    tokenized = [tokenize(s) for s in subjects]
    bm25 = BM25Okapi(tokenized)
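To make the swap at this hunk concrete: the deleted nested tokenizer split commit subjects on word characters only, so decimals and numeric ranges fragmented into separate tokens, while the canonical tokenizer keeps them whole. A small contrast sketch, assuming the canonical regex matches the local copy removed from g1_docs_bm25_eval.py in this same PR; the sample subject is made up.

import re
from typing import List

def old_tokenize(text: str) -> List[str]:
    # The nested helper deleted in this hunk: word characters only.
    return re.findall(r'\b\w+\b', text.lower())

def canonical_like_tokenize(text: str) -> List[str]:
    # Approximation of the canonical tokenizer, assuming it uses the same
    # regex as the local copy removed from g1_docs_bm25_eval.py.
    return [t for t in re.findall(r'\d+[-\u2013]\d+|\d+\.\d+|\w+', text.lower()) if t]

subject = "Raise retention window to 7-30 days (MRR 0.724)"
print(old_tokenize(subject))
# ['raise', 'retention', 'window', 'to', '7', '30', 'days', 'mrr', '0', '724']
print(canonical_like_tokenize(subject))
# ['raise', 'retention', 'window', 'to', '7-30', 'days', 'mrr', '0.724']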
29 changes: 12 additions & 17 deletions benchmarks/eval/g2_docs_paraphrase_eval.py
@@ -16,6 +16,7 @@
python3 benchmarks/eval/g2_docs_paraphrase_eval.py 2>&1
"""

import importlib.util
import json
import re
import time
@@ -25,6 +26,17 @@
from rank_bm25 import BM25Okapi


# ── Canonical tokenizer (PR #3 unification) ──────────────────────────────────
# The upstream BM25 tokenizer lives inside `src/hooks/bm25-memory.py` (a
# hyphenated filename rules out a normal `import`). Load it dynamically so eval
# pipelines share the exact production tokenization (`tokenize`).
_MONOLITH = Path(__file__).resolve().parents[2] / "src" / "hooks" / "bm25-memory.py"
_spec = importlib.util.spec_from_file_location("bm25_memory", _MONOLITH)
_bm25_memory = importlib.util.module_from_spec(_spec)
_spec.loader.exec_module(_bm25_memory)
tokenize = _bm25_memory.tokenize # noqa: E305 canonical entry point


# ──────────────────────────────────────────────────────────────────────────────
# 30 Paraphrase QA Pairs
# Each query uses different vocabulary from the answer_keywords.
@@ -317,23 +329,6 @@
# BM25 index construction
# ──────────────────────────────────────────────────────────────────────────────

_KO_PARTICLES = re.compile(
    r'(와|과|이|가|은|는|을|를|의|에서|으로|에게|부터|까지|처럼|같이|보다|이나|며|에|로|도|만|나|고)$'
)


def tokenize(text: str) -> List[str]:
    """Preserve decimal numbers and numeric ranges. Strip Korean particles from mixed tokens."""
    raw = re.findall(r'\d+[-\u2013]\d+|\d+\.\d+|\w+', text.lower())
    result = []
    for tok in raw:
        cleaned = _KO_PARTICLES.sub('', tok)
        if cleaned and cleaned != tok:
            result.append(cleaned)
        result.append(tok)
    return list(dict.fromkeys(result))


def chunk_document(filename: str, content: str) -> List[str]:
    """Split by ## section headers."""
    chunks = []
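The particle-stripping logic deleted above indexed both the particle-stripped and the surface form of each Korean token. A reference sketch reproducing that removed behavior on a made-up query; whether the canonical `tokenize` in `bm25-memory.py` performs equivalent particle handling is not visible in this diff.

import re
from typing import List

# Reproduces the logic deleted above, for reference only.
_KO_PARTICLES = re.compile(
    r'(와|과|이|가|은|는|을|를|의|에서|으로|에게|부터|까지|처럼|같이|보다|이나|며|에|로|도|만|나|고)$'
)

def removed_tokenize(text: str) -> List[str]:
    raw = re.findall(r'\d+[-\u2013]\d+|\d+\.\d+|\w+', text.lower())
    result = []
    for tok in raw:
        cleaned = _KO_PARTICLES.sub('', tok)
        if cleaned and cleaned != tok:
            result.append(cleaned)   # particle-stripped form, indexed alongside...
        result.append(tok)           # ...the original surface form
    return list(dict.fromkeys(result))

print(removed_tokenize("메모리를 7-30일 유지"))   # roughly: "keep memory for 7-30 days"
# ['메모리', '메모리를', '7-30', '일', '유지']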