strands-agents · venkatkrish543re · May 5, 2026 · May 6, 2026 · May 8, 2026 · May 6, 2026
diff --git a/src/strands_evals/__init__.py b/src/strands_evals/__init__.py
@@ -1,4 +1,5 @@
-from . import detectors, evaluators, extractors, generators, providers, simulation, telemetry, types
+from . import aggregators, chaos, detectors, evaluators, extractors, generators, providers, simulation, telemetry, types
+from .aggregators import CaseAggregator
 from .case import Case
 from .eval_task_handler import EvalTaskHandler, TracedHandler, eval_task
 from .evaluation_data_store import EvaluationDataStore
@@ -17,6 +18,8 @@
     "EvalTaskHandler",
     "TracedHandler",
     "eval_task",
+    "aggregators",
+    "chaos",
     "detectors",
     "evaluators",
     "extractors",
@@ -29,4 +32,5 @@
     "get_tracer",
     "ActorSimulator",
     "UserSimulator",
+    "CaseAggregator",
 ]
diff --git a/src/strands_evals/aggregators/__init__.py b/src/strands_evals/aggregators/__init__.py
@@ -0,0 +1,13 @@
+"""Batch evaluation aggregators for Strands Evals.
+
+Aggregators analyze evaluation results across multiple cases, scenarios,
+or trials to produce summary reports and cross-case insights.
+"""
+
+from .base import CaseAggregator
+from .types import AggregationResult
+
+__all__ = [
+    "CaseAggregator",
+    "AggregationResult",
+]
diff --git a/src/strands_evals/aggregators/base.py b/src/strands_evals/aggregators/base.py
@@ -0,0 +1,164 @@
+"""Base CaseAggregator class.
+
+Provides a default implementation that groups results by evaluator and
+computes numeric statistics (mean/min/max score, pass rate). Derived
+classes override `summarize_reasons()` to add LLM-based or domain-specific
+narrative summaries.
+"""
+
+import logging
+from collections import defaultdict
+from typing import Any
+
+from ..types.evaluation_report import EvaluationReport
+from .types import AggregationResult
+
+logger = logging.getLogger(__name__)
+
+
+class CaseAggregator:
+    """Base class for evaluation aggregators.
+
+    An aggregator takes a flat list of EvaluationReports (produced by an
+    Experiment) and re-groups/analyzes them along a specific dimension
+    (e.g., chaos scenarios, trials, case categories).
+
+    The default implementation groups by evaluator name and computes numeric
+    stats across all cases. Subclasses can override:
+        - `aggregate()` for custom grouping logic
+        - `summarize_reasons()` for LLM-based or domain-specific narrative generation
+
+    Example::
+
+        from strands_evals.aggregators import CaseAggregator
+
+        aggregator = CaseAggregator()
+        reports = experiment.run_evaluations(task=my_task)
+        results = aggregator.aggregate(reports)
+
+        for r in results:
+            print(f"{r.group_key}: mean={r.mean_score:.2f}, pass_rate={r.pass_rate:.0%}")
+    """
+
+    def __init__(self, name: str | None = None):
+        """Initialize the aggregator.
+
+        Args:
+            name: Optional human-readable name for this aggregator. Falls back
+                to the class name when omitted or empty.
+        """
+        self.name = name or self.__class__.__name__
+
+    def aggregate(self, reports: list[EvaluationReport]) -> list[AggregationResult]:
+        """Aggregate evaluation reports into summary results.
+
+        Reports are pooled by evaluator name: the scores, pass flags and
+        reasons of every report sharing a name are merged before statistics
+        are computed, so exactly one result is produced per evaluator.
+        Reports with no evaluator name are pooled under ``"Unknown"``.
+        The `summary` field is populated by calling `summarize_reasons()`
+        once per evaluator with the merged reasons.
+
+        Args:
+            reports: Flat list of EvaluationReport objects from an Experiment run.
+
+        Returns:
+            List of AggregationResult objects, one per evaluator, ordered by
+            the first appearance of each evaluator in `reports`.
+        """
+        if not reports:
+            return []
+
+        # evaluator name -> (scores, passes, reasons). Dicts preserve insertion
+        # order, so the output follows first-seen evaluator order.
+        pooled: defaultdict[str, tuple[list[float], list[bool], list[str]]] = defaultdict(
+            lambda: ([], [], [])
+        )
+        for report in reports:
+            scores, passes, reasons = pooled[report.evaluator_name or "Unknown"]
+            scores.extend(report.scores)
+            passes.extend(report.test_passes)
+            reasons.extend(report.reasons)
+
+        return [
+            AggregationResult(
+                group_key=evaluator_name,
+                evaluator_name=evaluator_name,
+                summary=self.summarize_reasons(reasons),
+                **self._compute_stats(scores, passes),
+            )
+            for evaluator_name, (scores, passes, reasons) in pooled.items()
+        ]
+
+    def summarize_reasons(self, reasons: list[str]) -> str:
+        """Produce a narrative summary from a list of per-case reason strings.
+
+        The base implementation concatenates unique non-empty reasons (capped
+        by the default limit of `_concatenate_reasons`).
+        Override in subclasses to use LLM-as-a-Judge or domain-specific logic.
+
+        Args:
+            reasons: List of reason strings from individual evaluations.
+
+        Returns:
+            A summary string; empty when there is nothing to summarize.
+        """
+        return self._concatenate_reasons(reasons)
+
+    # ------------------------------------------------------------------
+    # Shared utilities
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def _compute_stats(scores: list[float], passes: list[bool]) -> dict[str, Any]:
+        """Compute basic statistics from a list of scores and pass/fail flags.
+
+        When `scores` is empty every statistic is zero, regardless of
+        `passes`. Otherwise score statistics come from `scores` and pass
+        counts come from `passes`; the two lists are not required to have
+        the same length, and `num_results` always reflects `len(scores)`.
+
+        Args:
+            scores: List of numeric scores.
+            passes: List of boolean pass/fail indicators.
+
+        Returns:
+            Dict with mean_score, min_score, max_score, pass_rate,
+            num_results, num_passed, num_failed.
+        """
+        if not scores:
+            return {
+                "mean_score": 0.0,
+                "min_score": 0.0,
+                "max_score": 0.0,
+                "pass_rate": 0.0,
+                "num_results": 0,
+                "num_passed": 0,
+                "num_failed": 0,
+            }
+
+        num_passed = sum(1 for passed in passes if passed)
+
+        return {
+            "mean_score": sum(scores) / len(scores),
+            "min_score": min(scores),
+            "max_score": max(scores),
+            # Guard the division: scores may be present while passes is empty.
+            "pass_rate": num_passed / len(passes) if passes else 0.0,
+            "num_results": len(scores),
+            "num_passed": num_passed,
+            "num_failed": len(passes) - num_passed,
+        }
+
+    @staticmethod
+    def _concatenate_reasons(reasons: list[str], max_reasons: int = 10) -> str:
+        """Combine multiple reason strings by deduplication and concatenation.
+
+        Empty reasons are skipped and duplicates are dropped, keeping the
+        first occurrence. A single surviving reason is returned verbatim;
+        several are numbered and joined as ``"(1) a | (2) b"``.
+
+        Args:
+            reasons: List of reason strings from individual evaluations.
+            max_reasons: Maximum number of unique reasons to include. A
+                non-positive value yields an empty summary.
+
+        Returns:
+            Combined summary string.
+        """
+        # Without this guard the cap is only checked after the first append,
+        # so a limit of zero would still let one reason through.
+        if max_reasons <= 0:
+            return ""
+
+        unique_reasons: list[str] = []
+        seen: set[str] = set()
+        for reason in reasons:
+            if not reason or reason in seen:
+                continue
+            seen.add(reason)
+            unique_reasons.append(reason)
+            if len(unique_reasons) >= max_reasons:
+                break
+
+        if not unique_reasons:
+            return ""
+
+        if len(unique_reasons) == 1:
+            return unique_reasons[0]
+
+        return " | ".join(f"({i}) {r}" for i, r in enumerate(unique_reasons, start=1))
diff --git a/src/strands_evals/aggregators/types.py b/src/strands_evals/aggregators/types.py
@@ -0,0 +1,26 @@
+"""Data models for evaluation aggregation results."""
+
+from pydantic import BaseModel, Field
+
+
+class AggregationResult(BaseModel):
+    """Base aggregation result for a group of evaluation results.
+
+    Provides quantitative statistics that any aggregator can produce
+    regardless of the grouping dimension, plus an optional narrative
+    summary built from the per-case reason strings.
+    """
+
+    # Label of the group these statistics describe. The meaning depends on the
+    # aggregator; the base CaseAggregator sets it to the evaluator name.
+    group_key: str = Field(..., description="Identifier for this group (e.g., case name)")
+    # Name of the evaluator whose results were aggregated.
+    evaluator_name: str
+
+    # --- Quantitative stats ---
+    # Score statistics over the group's results.
+    mean_score: float
+    min_score: float
+    max_score: float
+    pass_rate: float  # Fraction of results that passed (0.0 to 1.0)
+    # Result counts for the group.
+    num_results: int
+    num_passed: int
+    num_failed: int
+
+    # --- Narrative summary ---
+    # Optional; defaults to an empty string when no summary is provided.
+    summary: str = Field(default="", description="Aggregated summary of all reason fields")
diff --git a/src/strands_evals/chaos/__init__.py b/src/strands_evals/chaos/__init__.py
@@ -0,0 +1,62 @@
+"""Chaos testing module for Strands Evals.
+
+Provides deterministic fault injection for evaluating agent resilience
+under tool failures and response corruption scenarios.
+"""
+
+from .aggregation_display import ChaosAggregationDisplay, display_chaos_aggregation
+from .aggregator import ChaosScenarioAggregator
+from .aggregator_types import (
+    ChaosAggregationReport,
+    ChaosScenarioAggregation,
+    CoverageStatus,
+    ToolEffectResult,
+)
+from .effects import (
+    TOOL_CORRUPTION_EFFECTS,
+    TOOL_ERROR_EFFECTS,
+    ChaosEffect,
+    CorruptValues,
+    RemoveFields,
+    ToolCallFailure,
+    ToolEffect,
+    TruncateFields,
+)
+from .evaluators import (
+    FailureCommunicationEvaluator,
+    PartialCompletionEvaluator,
+    RecoveryStrategyEvaluator,
+)
+from .experiment import ChaosExperiment
+from .plugin import ChaosPlugin
+from .scenario import ChaosScenario
+
+__all__ = [
+    # Core classes
+    "ChaosExperiment",
+    "ChaosPlugin",
+    "ChaosScenario",
+    # Effect hierarchy
+    "ChaosEffect",
+    "ToolEffect",
+    # Concrete effects
+    "ToolCallFailure",
+    "TruncateFields",
+    "RemoveFields",
+    "CorruptValues",
+    # Aggregation
+    "ChaosAggregationDisplay",
+    "ChaosAggregationReport",
+    "ChaosScenarioAggregator",
+    "ChaosScenarioAggregation",
+    "CoverageStatus",
+    "ToolEffectResult",
+    "display_chaos_aggregation",
+    # Evaluators
+    "FailureCommunicationEvaluator",
+    "PartialCompletionEvaluator",
+    "RecoveryStrategyEvaluator",
+    # Classification sets
+    "TOOL_ERROR_EFFECTS",
+    "TOOL_CORRUPTION_EFFECTS",
+]
diff --git a/src/strands_evals/chaos/_context.py b/src/strands_evals/chaos/_context.py
@@ -0,0 +1,21 @@
+"""Internal context variable for tracking the active chaos scenario.
+
+The ChaosPlugin reads from this ContextVar at hook time.
+The ChaosExperiment sets and resets it around each case's task invocation.
+
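+Using a ContextVar gives predictable behavior under:
+- Sequential execution (trivially correct)
+- Async execution (each asyncio.Task runs in a copy of the context that was
+  current when the task was created, so it sees the scenario set by its parent)
+- Threaded execution (each thread has its own context; a value set in one
+  thread is not seen by other threads unless the context is propagated
+  explicitly, e.g. with ``contextvars.copy_context().run``)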
+"""
+
+from contextvars import ContextVar
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from .scenario import ChaosScenario
+
+# Scenario in effect for the currently running task. Holds ``None`` (the
+# default) whenever no scenario has been set in the current context.
+_current_scenario: ContextVar["ChaosScenario | None"] = ContextVar(
+    "chaos_current_scenario",
+    default=None,
+)