18 changes: 12 additions & 6 deletions benchmarks/eval/g1_docs_bm25_eval.py
@@ -20,9 +20,21 @@
from typing import List, Optional, Tuple

import anthropic
import importlib.util
from rank_bm25 import BM25Okapi


# ── Canonical tokenizer (PR #3 unification) ────────────────────────────────
# The upstream BM25 tokenizer lives inside `src/hooks/bm25-memory.py` (a
# hyphenated filename rules out a normal `import`). Load it dynamically so eval
# pipelines share the exact production tokenization (`tokenize`).
_MONOLITH = Path(__file__).resolve().parents[2] / "src" / "hooks" / "bm25-memory.py"
_spec = importlib.util.spec_from_file_location("bm25_memory", _MONOLITH)
_bm25_memory = importlib.util.module_from_spec(_spec)
_spec.loader.exec_module(_bm25_memory)
tokenize = _bm25_memory.tokenize # noqa: E305 canonical entry point


# ──────────────────────────────────────────────────────────────────────────────
# QA Pairs (same as g1_docs_memory_eval.py)
# ──────────────────────────────────────────────────────────────────────────────
@@ -75,12 +87,6 @@
# Step 1: Build BM25 index over doc chunks
# ──────────────────────────────────────────────────────────────────────────────

def tokenize(text: str) -> List[str]:
    """Lowercase; preserve decimal numbers (0.724) and numeric ranges (7-30)."""
    tokens = re.findall(r'\d+[-\u2013]\d+|\d+\.\d+|\w+', text.lower())
    return [t for t in tokens if t]


def chunk_document(filename: str, content: str) -> List[str]:
    """Split a document by ## section headers. Each chunk = filename § header\ncontent."""
    chunks = []
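For context, a minimal sketch of what the dynamic loading added above does when run from `benchmarks/eval/`, assuming the repo layout in this PR and that the canonical `tokenize` behaves as the docstring of the deleted local copy describes (lowercasing, with decimals such as 0.724 and numeric ranges such as 7-30 kept as single tokens); the sample sentence and expected output are illustrative, not taken from the eval.

# Illustrative only; mirrors the loader introduced in this diff.
import importlib.util
from pathlib import Path

repo_root = Path(__file__).resolve().parents[2]            # assumes this script sits under benchmarks/eval/
monolith = repo_root / "src" / "hooks" / "bm25-memory.py"

spec = importlib.util.spec_from_file_location("bm25_memory", monolith)
bm25_memory = importlib.util.module_from_spec(spec)
spec.loader.exec_module(bm25_memory)

tokens = bm25_memory.tokenize("MRR improved to 0.724 over the 7-30 day window")
# Expected, if the canonical regex matches the removed local one:
# ['mrr', 'improved', 'to', '0.724', 'over', 'the', '7-30', 'day', 'window']
print(tokens)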
15 changes: 12 additions & 3 deletions benchmarks/eval/g1_longterm_baseline_eval.py
@@ -18,8 +18,19 @@
import subprocess
import sys
from pathlib import Path
import importlib.util
from typing import Dict, List, Optional, Tuple


# Canonical tokenizer (PR #3 unification): load `bm25-memory.py` dynamically
# (hyphen rules out a normal import) so eval and production share the exact
# same tokenize() implementation.
_MONOLITH = Path(__file__).resolve().parents[2] / "src" / "hooks" / "bm25-memory.py"
_spec = importlib.util.spec_from_file_location("bm25_memory", _MONOLITH)
_bm25_memory = importlib.util.module_from_spec(_spec)
_spec.loader.exec_module(_bm25_memory)
_canonical_tokenize = _bm25_memory.tokenize

# ── LLM client ───────────────────────────────────────────────────────────────

def get_llm_client():
@@ -264,9 +275,7 @@ def get_bm25_context(query: str, commit_corpus: List[Dict], top_k: int = 7) -> T
    if not commit_corpus:
        return "[Empty corpus]", 0

    def tokenize(text: str) -> List[str]:
        return re.findall(r'\b\w+\b', text.lower())

    tokenize = _canonical_tokenize  # PR #3: was local re.findall(r'\b\w+\b'); now canonical
    subjects = [c.get('subject', '') for c in commit_corpus]
    tokenized = [tokenize(s) for s in subjects]
    bm25 = BM25Okapi(tokenized)
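To make the swap at this hunk concrete: the deleted nested tokenizer split commit subjects on word characters only, so decimals and numeric ranges fragmented into separate tokens, while the canonical tokenizer keeps them whole. A small contrast sketch, assuming the canonical regex matches the local copy removed from g1_docs_bm25_eval.py in this same PR; the sample subject is made up.

import re
from typing import List

def old_tokenize(text: str) -> List[str]:
    # The nested helper deleted in this hunk: word characters only.
    return re.findall(r'\b\w+\b', text.lower())

def canonical_like_tokenize(text: str) -> List[str]:
    # Approximation of the canonical tokenizer, assuming it uses the same
    # regex as the local copy removed from g1_docs_bm25_eval.py.
    return [t for t in re.findall(r'\d+[-\u2013]\d+|\d+\.\d+|\w+', text.lower()) if t]

subject = "Raise retention window to 7-30 days (MRR 0.724)"
print(old_tokenize(subject))
# ['raise', 'retention', 'window', 'to', '7', '30', 'days', 'mrr', '0', '724']
print(canonical_like_tokenize(subject))
# ['raise', 'retention', 'window', 'to', '7-30', 'days', 'mrr', '0.724']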
29 changes: 12 additions & 17 deletions benchmarks/eval/g2_docs_paraphrase_eval.py
@@ -16,6 +16,7 @@
python3 benchmarks/eval/g2_docs_paraphrase_eval.py 2>&1
"""

import importlib.util
import json
import re
import time
@@ -25,6 +26,17 @@
from rank_bm25 import BM25Okapi


# ── Canonical tokenizer (PR #3 unification) ──────────────────────────────────
# The upstream BM25 tokenizer lives inside `src/hooks/bm25-memory.py` (a
# hyphenated filename rules out a normal `import`). Load it dynamically so eval
# pipelines share the exact production tokenization (`tokenize`).
_MONOLITH = Path(__file__).resolve().parents[2] / "src" / "hooks" / "bm25-memory.py"
_spec = importlib.util.spec_from_file_location("bm25_memory", _MONOLITH)
_bm25_memory = importlib.util.module_from_spec(_spec)
_spec.loader.exec_module(_bm25_memory)
tokenize = _bm25_memory.tokenize # noqa: E305 canonical entry point


# ──────────────────────────────────────────────────────────────────────────────
# 30 Paraphrase QA Pairs
# Each query uses different vocabulary from the answer_keywords.
@@ -317,23 +329,6 @@
# BM25 index construction
# ──────────────────────────────────────────────────────────────────────────────

_KO_PARTICLES = re.compile(
    r'(와|과|이|가|은|는|을|를|의|에서|으로|에게|부터|까지|처럼|같이|보다|이나|며|에|로|도|만|나|고)$'
)


def tokenize(text: str) -> List[str]:
    """Preserve decimal numbers and numeric ranges. Strip Korean particles from mixed tokens."""
    raw = re.findall(r'\d+[-\u2013]\d+|\d+\.\d+|\w+', text.lower())
    result = []
    for tok in raw:
        cleaned = _KO_PARTICLES.sub('', tok)
        if cleaned and cleaned != tok:
            result.append(cleaned)
        result.append(tok)
    return list(dict.fromkeys(result))


def chunk_document(filename: str, content: str) -> List[str]:
    """Split by ## section headers."""
    chunks = []
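The particle-stripping logic deleted above indexed both the particle-stripped and the surface form of each Korean token. A reference sketch reproducing that removed behavior on a made-up query; whether the canonical `tokenize` in `bm25-memory.py` performs equivalent particle handling is not visible in this diff.

import re
from typing import List

# Reproduces the logic deleted above, for reference only.
_KO_PARTICLES = re.compile(
    r'(와|과|이|가|은|는|을|를|의|에서|으로|에게|부터|까지|처럼|같이|보다|이나|며|에|로|도|만|나|고)$'
)

def removed_tokenize(text: str) -> List[str]:
    raw = re.findall(r'\d+[-\u2013]\d+|\d+\.\d+|\w+', text.lower())
    result = []
    for tok in raw:
        cleaned = _KO_PARTICLES.sub('', tok)
        if cleaned and cleaned != tok:
            result.append(cleaned)   # particle-stripped form, indexed alongside...
        result.append(tok)           # ...the original surface form
    return list(dict.fromkeys(result))

print(removed_tokenize("메모리를 7-30일 유지"))   # roughly: "keep memory for 7-30 days"
# ['메모리', '메모리를', '7-30', '일', '유지']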