"""
RAG Evaluation Harness - Run evaluations with a fixed seed for reproducibility.

Version: v0.6.2

This module provides a simple evaluation framework for testing RAG performance
with a small, fixed dataset to ensure deterministic results.
"""
| 9 | + |
| 10 | +import logging |
| 11 | +import json |
| 12 | +import random |
| 13 | +from typing import List, Dict, Any, Optional |
| 14 | +from dataclasses import dataclass, asdict |
| 15 | +from pathlib import Path |
| 16 | + |
| 17 | +from .qa import QAModule, QAResult |
| 18 | + |
| 19 | +logger = logging.getLogger(__name__) |


@dataclass
class EvalQuestion:
    """Evaluation question with expected answer."""
    question: str
    expected_answer: str
    context: Optional[str] = None
    difficulty: str = "medium"
    category: str = "general"


@dataclass
class EvalResult:
    """Result of a single evaluation question."""
    question: str
    predicted_answer: str
    expected_answer: str
    confidence: float
    passage_ids: List[str]
    is_grounded: bool
    is_correct: bool
    metadata: Dict[str, Any]


@dataclass
class EvalMetrics:
    """Aggregated evaluation metrics."""
    total_questions: int
    correct_answers: int
    grounded_answers: int
    accuracy: float
    grounding_rate: float
    avg_confidence: float
    avg_passages_per_answer: float
    results: List[EvalResult]
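
# Note: save_results() serializes these via dataclasses.asdict(), which
# recurses into the nested EvalResult list, so the JSON output mirrors
# this structure field-for-field.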


class RAGEvaluator:
    """RAG evaluation harness with fixed seed for reproducibility."""

    def __init__(self, qa_module: QAModule, seed: int = 42):
        """Initialize evaluator with QA module and fixed seed."""
        self.qa_module = qa_module
        self.seed = seed
        random.seed(seed)
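        # Caveat: random.seed() seeds only Python's global RNG. If the QA
        # module draws randomness from numpy or torch, those generators must
        # be seeded separately for fully reproducible runs.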

    def create_test_dataset(self) -> List[EvalQuestion]:
        """Create a small test dataset for evaluation."""
        return [
            EvalQuestion(
                question="What is the main purpose of this AI development lab?",
                expected_answer="AI development lab for building and testing AI tools",
                difficulty="easy",
                category="project_overview"
            ),
            EvalQuestion(
                question="How does the MCP server work?",
                expected_answer="MCP server provides tool endpoints for document search and summarization",
                difficulty="medium",
                category="architecture"
            ),
            EvalQuestion(
                question="What testing framework is used?",
                expected_answer="pytest with coverage gating",
                difficulty="easy",
                category="testing"
            ),
            EvalQuestion(
                question="What is the RAG baseline configuration?",
                expected_answer="1000 token chunks, 15% overlap, cosine similarity retrieval",
                difficulty="hard",
                category="rag_config"
            ),
            EvalQuestion(
                question="How are documents ingested for RAG?",
                expected_answer="Documents are chunked and embedded using sentence transformers",
                difficulty="medium",
                category="rag_ingestion"
            ),
            EvalQuestion(
                question="What tools are promoted to app scope?",
                expected_answer="search_docs and summarize tools",
                difficulty="easy",
                category="mcp_tools"
            ),
            EvalQuestion(
                question="What is the coverage threshold?",
                expected_answer="68% coverage threshold",
                difficulty="easy",
                category="testing"
            ),
            EvalQuestion(
                question="How is linting configured?",
                expected_answer="Ruff with comprehensive rules",
                difficulty="medium",
                category="development"
            ),
            EvalQuestion(
                question="What version is currently tagged?",
                expected_answer="v0.6.2",
                difficulty="easy",
                category="versioning"
            ),
            EvalQuestion(
                question="What are the main use cases documented?",
                expected_answer="Multi-tool support agent and AI-assisted development",
                difficulty="medium",
                category="documentation"
            )
        ]
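
    # Note: the expected_answer strings above act as keyword references for
    # the lexical-overlap check in evaluate_question(), not exact-match targets.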

    def evaluate_question(self, question: EvalQuestion) -> EvalResult:
        """Evaluate a single question."""
        logger.info(f"Evaluating: {question.question[:50]}...")

        # Get QA result
        qa_result = self.qa_module.query(
            question.question,
            top_k=3,
            temperature=0.1  # Low temperature for near-deterministic output
        )

        # Check grounding (has passage IDs)
        is_grounded = len(qa_result.passage_ids) > 0

        # Simple correctness check (in production, use more sophisticated matching)
        predicted_lower = qa_result.answer.lower()
        expected_lower = question.expected_answer.lower()

        # Check if key terms from expected answer are present
        expected_terms = set(expected_lower.split())
        predicted_terms = set(predicted_lower.split())
        overlap = len(expected_terms.intersection(predicted_terms))
        is_correct = overlap >= len(expected_terms) * 0.3  # 30% overlap threshold
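        # Worked example: "pytest with coverage gating" has 4 unique terms, so
        # a prediction sharing any 2 of them (2 >= 4 * 0.3 = 1.2) is marked
        # correct; note that stopwords like "with" count toward the overlap.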

        return EvalResult(
            question=question.question,
            predicted_answer=qa_result.answer,
            expected_answer=question.expected_answer,
            confidence=qa_result.confidence,
            passage_ids=qa_result.passage_ids,
            is_grounded=is_grounded,
            is_correct=is_correct,
            metadata={
                "difficulty": question.difficulty,
                "category": question.category,
                "num_passages": len(qa_result.passages)
            }
        )

    def run_evaluation(self, questions: Optional[List[EvalQuestion]] = None) -> EvalMetrics:
        """Run full evaluation on test dataset."""
        if questions is None:
            questions = self.create_test_dataset()
        if not questions:
            raise ValueError("Cannot evaluate an empty question list")

        logger.info(f"Running evaluation on {len(questions)} questions with seed {self.seed}")

        results = []
        for question in questions:
            result = self.evaluate_question(question)
            results.append(result)

        # Calculate metrics
        total_questions = len(results)
        correct_answers = sum(1 for r in results if r.is_correct)
        grounded_answers = sum(1 for r in results if r.is_grounded)
        avg_confidence = sum(r.confidence for r in results) / total_questions
        avg_passages = sum(len(r.passage_ids) for r in results) / total_questions

        metrics = EvalMetrics(
            total_questions=total_questions,
            correct_answers=correct_answers,
            grounded_answers=grounded_answers,
            accuracy=correct_answers / total_questions,
            grounding_rate=grounded_answers / total_questions,
            avg_confidence=avg_confidence,
            avg_passages_per_answer=avg_passages,
            results=results
        )

        logger.info(f"Evaluation complete: {metrics.accuracy:.2%} accuracy, {metrics.grounding_rate:.2%} grounding")
        return metrics

    def save_results(self, metrics: EvalMetrics, output_path: str) -> None:
        """Save evaluation results to a JSON file."""
        output_data = {
            "seed": self.seed,
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "metrics": asdict(metrics)
        }

        with open(output_path, 'w') as f:
            json.dump(output_data, f, indent=2)

        logger.info(f"Results saved to {output_path}")


def run_eval(config_path: str = "lab/rag/config.yaml",
             output_path: str = "eval_results.json",
             seed: int = 42) -> EvalMetrics:
    """Run evaluation with given configuration."""
    from .qa import create_qa_module

    # Create QA module
    qa_module = create_qa_module(config_path)

    # Create evaluator
    evaluator = RAGEvaluator(qa_module, seed=seed)

    # Run evaluation
    metrics = evaluator.run_evaluation()

    # Save results
    evaluator.save_results(metrics, output_path)

    return metrics
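

# Example usage (hypothetical: assumes lab/rag/config.yaml exists and the
# document corpus has already been ingested):
#
#   from lab.rag.eval import run_eval
#   metrics = run_eval(seed=42)
#   print(f"{metrics.accuracy:.2%} accurate, {metrics.grounding_rate:.2%} grounded")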


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Run RAG evaluation")
    parser.add_argument("--config", default="lab/rag/config.yaml", help="Config file path")
    parser.add_argument("--output", default="eval_results.json", help="Output file path")
    parser.add_argument("--seed", type=int, default=42, help="Random seed")

    args = parser.parse_args()

    # Set up logging
    logging.basicConfig(level=logging.INFO)

    # Run evaluation
    metrics = run_eval(args.config, args.output, args.seed)

    # Print summary
    print("\n=== RAG Evaluation Results ===")
    print(f"Accuracy: {metrics.accuracy:.2%}")
    print(f"Grounding Rate: {metrics.grounding_rate:.2%}")
    print(f"Average Confidence: {metrics.avg_confidence:.2f}")
    print(f"Average Passages per Answer: {metrics.avg_passages_per_answer:.1f}")
    print(f"Results saved to: {args.output}")
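
# CLI usage (assuming this module lives at lab/rag/eval.py):
#   python -m lab.rag.eval --config lab/rag/config.yaml --seed 42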