Commit 6b4c211

feat(rag): step 6C QA module + eval harness + grounding test
- Add lab/rag/qa.py: QA module with retrieval + synthesis + grounding
- Add lab/rag/eval.py: evaluation harness with fixed seed for reproducibility
- Add tests/rag/test_qa_grounding.py: grounding validation tests
- Update docs/research/rag-baseline.md: document QA module features

Features:
- Passage ID citations for all answers
- Confidence scoring based on passage relevance
- Deterministic evaluation with fixed seed
- Batch processing support
- Comprehensive grounding validation tests

Ready for S6C evaluation phase.
1 parent 8db45c4 commit 6b4c211

4 files changed

Lines changed: 600 additions & 3 deletions


docs/research/rag-baseline.md

Lines changed: 13 additions & 3 deletions
@@ -14,10 +14,20 @@ Version: v0.6.2
 ## Baseline Metrics (seeded)
 | Date | Commit SHA | k | Chunk Size | Overlap | Eval Size | Accuracy | Grounding | Notes |
 |------------|------------|---|------------|---------|-----------|----------|-----------|-------|
-| 2025-09-06 | `<sha>` | 3 | 1000 | 15% | 15 | _TBD_ | _TBD_ | Initial run |
+| 2025-09-06 | `<sha>` | 3 | 1000 | 15% | 10 | _TBD_ | _TBD_ | Initial run with QA module |
 
-> **Method:** `lab/rag/eval.py` with fixed seed and small eval set (10–20 Q/A).
-> **Repro:** `python lab/rag/eval.py --config lab/rag/config.yaml --seed 42`
+> **Method:** `lab/rag/eval.py` with fixed seed and small eval set (10 Q/A).
+> **Repro:** `python lab/rag/eval.py --config lab/rag/config.yaml --seed 42`
+> **QA Module:** `lab/rag/qa.py` provides retrieval + synthesis with grounding
+> **Tests:** `tests/rag/test_qa_grounding.py` validates citation functionality
+
+## QA Module Features
+- **Retrieval + Synthesis**: Combines embedding retrieval with answer generation
+- **Grounding**: Ensures all answers cite specific passage IDs
+- **Confidence Scoring**: Provides confidence metrics based on passage relevance
+- **Batch Processing**: Supports multiple questions in single evaluation
+- **Deterministic**: Fixed seed ensures reproducible results
 
 ## Change Log
 - v0.6.2: Created baseline log; wired to eval harness.
+- v0.6.2: Added QA module with grounding support and eval harness.
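
For orientation, the QA surface that the new eval harness depends on can be summarized in a rough sketch (`lab/rag/qa.py` is part of this commit but is not shown in this excerpt). The names below are taken from how `lab/rag/eval.py` calls the module; the bodies are placeholders, not the actual implementation:

```python
# Rough sketch of the lab/rag/qa.py surface used by the eval harness.
# Field and function names mirror the calls in lab/rag/eval.py; bodies are stubs.
from dataclasses import dataclass
from typing import Any, Dict, List


@dataclass
class QAResult:
    answer: str                     # synthesized answer text
    confidence: float               # relevance-based confidence score
    passage_ids: List[str]          # cited passage IDs (grounding)
    passages: List[Dict[str, Any]]  # retrieved passages backing the answer


class QAModule:
    def query(self, question: str, top_k: int = 3, temperature: float = 0.1) -> QAResult:
        """Retrieve top_k passages and synthesize a grounded, cited answer."""
        raise NotImplementedError


def create_qa_module(config_path: str) -> QAModule:
    """Build a QAModule from the YAML config (e.g. lab/rag/config.yaml)."""
    raise NotImplementedError
```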

lab/rag/eval.py

Lines changed: 261 additions & 0 deletions
@@ -0,0 +1,261 @@
# Version: v0.6.2

"""
RAG Evaluation Harness - Run evaluations with fixed seed for reproducibility.

This module provides a simple evaluation framework for testing RAG performance
with a small, fixed dataset to ensure deterministic results.
"""

import logging
import json
import random
from typing import List, Dict, Any, Optional
from dataclasses import dataclass, asdict
from datetime import datetime

from .qa import QAModule, QAResult

logger = logging.getLogger(__name__)


@dataclass
class EvalQuestion:
    """Evaluation question with expected answer."""
    question: str
    expected_answer: str
    context: Optional[str] = None
    difficulty: str = "medium"
    category: str = "general"


@dataclass
class EvalResult:
    """Result of a single evaluation question."""
    question: str
    predicted_answer: str
    expected_answer: str
    confidence: float
    passage_ids: List[str]
    is_grounded: bool
    is_correct: bool
    metadata: Dict[str, Any]


@dataclass
class EvalMetrics:
    """Aggregated evaluation metrics."""
    total_questions: int
    correct_answers: int
    grounded_answers: int
    accuracy: float
    grounding_rate: float
    avg_confidence: float
    avg_passages_per_answer: float
    results: List[EvalResult]


class RAGEvaluator:
    """RAG evaluation harness with fixed seed for reproducibility."""

    def __init__(self, qa_module: QAModule, seed: int = 42):
        """Initialize evaluator with QA module and fixed seed."""
        self.qa_module = qa_module
        self.seed = seed
        random.seed(seed)

    def create_test_dataset(self) -> List[EvalQuestion]:
        """Create a small test dataset for evaluation."""
        return [
            EvalQuestion(
                question="What is the main purpose of this AI development lab?",
                expected_answer="AI development lab for building and testing AI tools",
                difficulty="easy",
                category="project_overview"
            ),
            EvalQuestion(
                question="How does the MCP server work?",
                expected_answer="MCP server provides tool endpoints for document search and summarization",
                difficulty="medium",
                category="architecture"
            ),
            EvalQuestion(
                question="What testing framework is used?",
                expected_answer="pytest with coverage gating",
                difficulty="easy",
                category="testing"
            ),
            EvalQuestion(
                question="What is the RAG baseline configuration?",
                expected_answer="1000 token chunks, 15% overlap, cosine similarity retrieval",
                difficulty="hard",
                category="rag_config"
            ),
            EvalQuestion(
                question="How are documents ingested for RAG?",
                expected_answer="Documents are chunked and embedded using sentence transformers",
                difficulty="medium",
                category="rag_ingestion"
            ),
            EvalQuestion(
                question="What tools are promoted to app scope?",
                expected_answer="search_docs and summarize tools",
                difficulty="easy",
                category="mcp_tools"
            ),
            EvalQuestion(
                question="What is the coverage threshold?",
                expected_answer="68% coverage threshold",
                difficulty="easy",
                category="testing"
            ),
            EvalQuestion(
                question="How is linting configured?",
                expected_answer="Ruff with comprehensive rules",
                difficulty="medium",
                category="development"
            ),
            EvalQuestion(
                question="What version is currently tagged?",
                expected_answer="v0.6.2",
                difficulty="easy",
                category="versioning"
            ),
            EvalQuestion(
                question="What are the main use cases documented?",
                expected_answer="Multi-tool support agent and AI-assisted development",
                difficulty="medium",
                category="documentation"
            )
        ]

    def evaluate_question(self, question: EvalQuestion) -> EvalResult:
        """Evaluate a single question."""
        logger.info(f"Evaluating: {question.question[:50]}...")

        # Get QA result
        qa_result = self.qa_module.query(
            question.question,
            top_k=3,
            temperature=0.1  # Low temperature for deterministic output
        )

        # Check grounding (has passage IDs)
        is_grounded = len(qa_result.passage_ids) > 0

        # Simple correctness check (in production, use more sophisticated matching)
        predicted_lower = qa_result.answer.lower()
        expected_lower = question.expected_answer.lower()

        # Check if key terms from expected answer are present
        expected_terms = set(expected_lower.split())
        predicted_terms = set(predicted_lower.split())
        overlap = len(expected_terms.intersection(predicted_terms))
        is_correct = overlap >= len(expected_terms) * 0.3  # 30% overlap threshold
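        # Worked example of the threshold: for the expected answer
        # "pytest with coverage gating" the expected set has 4 terms, so the
        # cutoff is 4 * 0.3 = 1.2 and a prediction sharing at least 2 of those
        # terms is scored as correct.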

        return EvalResult(
            question=question.question,
            predicted_answer=qa_result.answer,
            expected_answer=question.expected_answer,
            confidence=qa_result.confidence,
            passage_ids=qa_result.passage_ids,
            is_grounded=is_grounded,
            is_correct=is_correct,
            metadata={
                "difficulty": question.difficulty,
                "category": question.category,
                "num_passages": len(qa_result.passages)
            }
        )

    def run_evaluation(self, questions: Optional[List[EvalQuestion]] = None) -> EvalMetrics:
        """Run full evaluation on test dataset."""
        if questions is None:
            questions = self.create_test_dataset()

        logger.info(f"Running evaluation on {len(questions)} questions with seed {self.seed}")

        results = []
        for question in questions:
            result = self.evaluate_question(question)
            results.append(result)

        # Calculate metrics
        total_questions = len(results)
        correct_answers = sum(1 for r in results if r.is_correct)
        grounded_answers = sum(1 for r in results if r.is_grounded)
        avg_confidence = sum(r.confidence for r in results) / total_questions
        avg_passages = sum(len(r.passage_ids) for r in results) / total_questions

        metrics = EvalMetrics(
            total_questions=total_questions,
            correct_answers=correct_answers,
            grounded_answers=grounded_answers,
            accuracy=correct_answers / total_questions,
            grounding_rate=grounded_answers / total_questions,
            avg_confidence=avg_confidence,
            avg_passages_per_answer=avg_passages,
            results=results
        )

        logger.info(f"Evaluation complete: {metrics.accuracy:.2%} accuracy, {metrics.grounding_rate:.2%} grounding")
        return metrics

    def save_results(self, metrics: EvalMetrics, output_path: str):
        """Save evaluation results to JSON file."""
        output_data = {
            "seed": self.seed,
            "timestamp": datetime.now().isoformat(),
            "metrics": asdict(metrics)
        }

        with open(output_path, 'w') as f:
            json.dump(output_data, f, indent=2)

        logger.info(f"Results saved to {output_path}")


def run_eval(config_path: str = "lab/rag/config.yaml",
             output_path: str = "eval_results.json",
             seed: int = 42) -> EvalMetrics:
    """Run evaluation with given configuration."""
    from .qa import create_qa_module

    # Create QA module
    qa_module = create_qa_module(config_path)

    # Create evaluator
    evaluator = RAGEvaluator(qa_module, seed=seed)

    # Run evaluation
    metrics = evaluator.run_evaluation()

    # Save results
    evaluator.save_results(metrics, output_path)

    return metrics


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Run RAG evaluation")
    parser.add_argument("--config", default="lab/rag/config.yaml", help="Config file path")
    parser.add_argument("--output", default="eval_results.json", help="Output file path")
    parser.add_argument("--seed", type=int, default=42, help="Random seed")

    args = parser.parse_args()

    # Set up logging
    logging.basicConfig(level=logging.INFO)

    # Run evaluation
    metrics = run_eval(args.config, args.output, args.seed)

    # Print summary
    print("\n=== RAG Evaluation Results ===")
    print(f"Accuracy: {metrics.accuracy:.2%}")
    print(f"Grounding Rate: {metrics.grounding_rate:.2%}")
    print(f"Average Confidence: {metrics.avg_confidence:.2f}")
    print(f"Average Passages per Answer: {metrics.avg_passages_per_answer:.1f}")
    print(f"Results saved to: {args.output}")
