Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion src/strands_evals/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from . import detectors, evaluators, extractors, generators, providers, simulation, telemetry, types
from . import aggregators, chaos, detectors, evaluators, extractors, generators, providers, simulation, telemetry, types
from .aggregators import CaseAggregator
from .case import Case
from .eval_task_handler import EvalTaskHandler, TracedHandler, eval_task
from .evaluation_data_store import EvaluationDataStore
Expand All @@ -17,6 +18,8 @@
"EvalTaskHandler",
"TracedHandler",
"eval_task",
"aggregators",
"chaos",
"detectors",
"evaluators",
"extractors",
Expand All @@ -29,4 +32,5 @@
"get_tracer",
"ActorSimulator",
"UserSimulator",
"CaseAggregator",
]
13 changes: 13 additions & 0 deletions src/strands_evals/aggregators/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
"""Batch evaluation aggregators for Strands Evals.

Aggregators analyze evaluation results across multiple cases, scenarios,
or trials to produce summary reports and cross-case insights.
"""

from .base import CaseAggregator
from .types import AggregationResult

__all__ = [
"CaseAggregator",
"AggregationResult",
]
164 changes: 164 additions & 0 deletions src/strands_evals/aggregators/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
"""Base CaseAggregator class.

Provides a default implementation that groups results by evaluator and
computes numeric statistics (mean/min/max score, pass rate). Derived
classes override `summarize_reasons()` to add LLM-based or domain-specific
narrative summaries.
"""

import logging
from collections import defaultdict
from typing import Any

from ..types.evaluation_report import EvaluationReport
from .types import AggregationResult

logger = logging.getLogger(__name__)


class CaseAggregator:
    """Base class for evaluation aggregators.

    An aggregator takes a flat list of EvaluationReports (produced by an
    Experiment) and re-groups/analyzes them along a specific dimension
    (e.g., chaos scenarios, trials, case categories).

    The default implementation emits one result per report, keyed by the
    report's evaluator name, with numeric stats computed across all of that
    report's cases. Subclasses can override:
    - `aggregate()` for custom grouping logic
    - `summarize_reasons()` for LLM-based or domain-specific narrative generation

    Example::

        from strands_evals.aggregators import CaseAggregator

        aggregator = CaseAggregator()
        reports = experiment.run_evaluations(task=my_task)
        results = aggregator.aggregate(reports)

        for r in results:
            print(f"{r.group_key}: mean={r.mean_score:.2f}, pass_rate={r.pass_rate:.0%}")
    """

    def __init__(self, name: str | None = None):
        """Initialize the aggregator.

        Args:
            name: Optional human-readable name for this aggregator. Falls back
                to the class name when omitted or empty.
        """
        self.name = name or self.__class__.__name__

    def aggregate(self, reports: list[EvaluationReport]) -> list[AggregationResult]:
        """Aggregate evaluation reports into summary results.

        Each report is summarized independently: its scores and pass flags
        feed the numeric statistics and its reasons feed `summarize_reasons()`,
        which populates the `summary` field. Results are returned in input
        order. Reports that share an evaluator name are NOT merged; each one
        yields its own result.

        Args:
            reports: Flat list of EvaluationReport objects from an Experiment run.

        Returns:
            List of AggregationResult objects, one per report. Both
            `group_key` and `evaluator_name` are the report's evaluator name,
            or "Unknown" when the report has none. Empty input yields an
            empty list.
        """
        if not reports:
            return []

        results = []
        for report in reports:
            # A missing/empty evaluator name is reported under a fixed label.
            evaluator_name = report.evaluator_name or "Unknown"
            stats = self._compute_stats(report.scores, report.test_passes)
            results.append(
                AggregationResult(
                    group_key=evaluator_name,
                    evaluator_name=evaluator_name,
                    summary=self.summarize_reasons(report.reasons),
                    **stats,
                )
            )

        return results

    def summarize_reasons(self, reasons: list[str]) -> str:
        """Produce a narrative summary from a list of per-case reason strings.

        The base implementation concatenates unique non-blank reasons.
        Override in subclasses to use LLM-as-a-Judge or domain-specific logic.

        Args:
            reasons: List of reason strings from individual evaluations.

        Returns:
            A summary string.
        """
        return self._concatenate_reasons(reasons)

    # ------------------------------------------------------------------
    # Shared utilities
    # ------------------------------------------------------------------

    @staticmethod
    def _compute_stats(scores: list[float], passes: list[bool]) -> dict[str, Any]:
        """Compute basic statistics from a list of scores and pass/fail flags.

        Args:
            scores: List of numeric scores.
            passes: List of boolean pass/fail indicators.

        Returns:
            Dict with mean_score, min_score, max_score, pass_rate,
            num_results, num_passed, num_failed. When `scores` is empty every
            value is zero. `num_results` counts `scores`, while the pass
            counts and `pass_rate` are derived from `passes`.
        """
        if not scores:
            return {
                "mean_score": 0.0,
                "min_score": 0.0,
                "max_score": 0.0,
                "pass_rate": 0.0,
                "num_results": 0,
                "num_passed": 0,
                "num_failed": 0,
            }

        num_passed = sum(1 for passed in passes if passed)
        num_failed = len(passes) - num_passed

        return {
            "mean_score": sum(scores) / len(scores),
            "min_score": min(scores),
            "max_score": max(scores),
            # Guard the division: `passes` may be empty even when scores exist.
            "pass_rate": num_passed / len(passes) if passes else 0.0,
            "num_results": len(scores),
            "num_passed": num_passed,
            "num_failed": num_failed,
        }

    @staticmethod
    def _concatenate_reasons(reasons: list[str], max_reasons: int = 10) -> str:
        """Combine multiple reason strings by deduplication and concatenation.

        Empty, whitespace-only and `None` entries are skipped. Duplicates are
        dropped, keeping the first occurrence and the original order.

        Args:
            reasons: List of reason strings from individual evaluations.
            max_reasons: Maximum number of unique reasons to include. A value
                of zero or less yields an empty summary.

        Returns:
            Combined summary string: "" when nothing qualifies, the reason
            itself when exactly one qualifies, otherwise the reasons numbered
            from 1 and joined with " | ".
        """
        # Check the cap up front so that a non-positive limit never lets a
        # first reason slip through before the limit is tested.
        if max_reasons <= 0:
            return ""

        unique_reasons: list[str] = []
        seen: set[str] = set()
        for reason in reasons:
            if not reason or not reason.strip() or reason in seen:
                continue
            seen.add(reason)
            unique_reasons.append(reason)
            if len(unique_reasons) >= max_reasons:
                break

        if not unique_reasons:
            return ""

        if len(unique_reasons) == 1:
            return unique_reasons[0]

        return " | ".join(f"({i}) {r}" for i, r in enumerate(unique_reasons, start=1))
26 changes: 26 additions & 0 deletions src/strands_evals/aggregators/types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Data models for evaluation aggregation results."""

from pydantic import BaseModel, Field


class AggregationResult(BaseModel):
    """Summary statistics and narrative for one group of evaluation results.

    The numeric fields do not depend on how the results were grouped, so any
    aggregator can populate them.
    """

    # --- Group identity ---
    group_key: str = Field(description="Identifier for this group (e.g., case name)")
    evaluator_name: str

    # --- Score statistics ---
    mean_score: float
    min_score: float
    max_score: float

    # --- Pass/fail statistics ---
    # pass_rate is the fraction of results that passed (0.0 to 1.0).
    pass_rate: float
    num_results: int
    num_passed: int
    num_failed: int

    # --- Narrative summary ---
    summary: str = Field("", description="Aggregated summary of all reason fields")
62 changes: 62 additions & 0 deletions src/strands_evals/chaos/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
"""Chaos testing module for Strands Evals.

Provides deterministic fault injection for evaluating agent resilience
under tool failures and response corruption scenarios.
"""

from .aggregation_display import ChaosAggregationDisplay, display_chaos_aggregation
from .aggregator import ChaosScenarioAggregator
from .aggregator_types import (
ChaosAggregationReport,
ChaosScenarioAggregation,
CoverageStatus,
ToolEffectResult,
)
from .effects import (
TOOL_CORRUPTION_EFFECTS,
TOOL_ERROR_EFFECTS,
ChaosEffect,
CorruptValues,
RemoveFields,
ToolCallFailure,
ToolEffect,
TruncateFields,
)
from .evaluators import (
FailureCommunicationEvaluator,
PartialCompletionEvaluator,
RecoveryStrategyEvaluator,
)
from .experiment import ChaosExperiment
from .plugin import ChaosPlugin
from .scenario import ChaosScenario

__all__ = [
# Core classes
"ChaosExperiment",
"ChaosPlugin",
"ChaosScenario",
# Effect hierarchy
"ChaosEffect",
"ToolEffect",
# Concrete effects
"ToolCallFailure",
"TruncateFields",
"RemoveFields",
"CorruptValues",
# Aggregation
"ChaosAggregationDisplay",
"ChaosAggregationReport",
"ChaosScenarioAggregator",
"ChaosScenarioAggregation",
"CoverageStatus",
"ToolEffectResult",
"display_chaos_aggregation",
# Evaluators
"FailureCommunicationEvaluator",
"PartialCompletionEvaluator",
"RecoveryStrategyEvaluator",
# Classification sets
"TOOL_ERROR_EFFECTS",
"TOOL_CORRUPTION_EFFECTS",
]
21 changes: 21 additions & 0 deletions src/strands_evals/chaos/_context.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
"""Internal context variable for tracking the active chaos scenario.

The ChaosPlugin reads from this ContextVar at hook time.
The ChaosExperiment sets and resets it around each case's task invocation.

Using a ContextVar ensures correct behavior under:
- Sequential execution (trivially correct)
- Async execution (each asyncio.Task inherits the var from its parent)
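- Threaded execution (each thread has its own context; note that a newly
  started thread does not inherit values set in its parent by default, so
  the scenario must be set inside the thread or propagated explicitly,
  e.g. with `contextvars.copy_context().run(...)`)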
"""

from contextvars import ContextVar
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from .scenario import ChaosScenario

# Holds the chaos scenario for the current execution context; reads resolve
# to None whenever no scenario has been set.
_current_scenario: ContextVar["ChaosScenario | None"] = ContextVar("chaos_current_scenario", default=None)
Loading