Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
166 changes: 166 additions & 0 deletions docs/examples/evals-sdk/chaos_testing_with_simulated_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
import logging
from typing import Any

from pydantic import BaseModel, Field

from strands import Agent
from strands_evals import Case
from strands_evals.chaos import (
ChaosCase,
ChaosExperiment,
ChaosPlugin,
CorruptValues,
RemoveFields,
ToolCallFailure,
TruncateFields,
)
from strands_evals.evaluators import GoalSuccessRateEvaluator
from strands_evals.mappers import StrandsInMemorySessionMapper
from strands_evals.simulation.tool_simulator import ToolSimulator
from strands_evals.telemetry import StrandsEvalsTelemetry

# Log bare messages only (no level/timestamp prefix) so the per-case
# transcript written by the task function stays readable.
logging.basicConfig(format="%(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)

# Setup telemetry
# Keep a handle on the in-memory exporter: the task function clears it before
# each agent call and reads the finished spans from it afterwards.
telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
memory_exporter = telemetry.in_memory_exporter

# 1. Set up ToolSimulator and register tools
# The simulated tools defined below are registered on this instance through
# its ``tool`` decorator.
tool_simulator = ToolSimulator()

class FlightSearchResponse(BaseModel):
    """Response from the flight search tool."""

    # Every field carries a default, so an instance can be built with no data.

    # One free-form mapping per flight; defaults to a fresh empty list.
    flights: list[dict[str, Any]] = Field(
        description="List of available flights",
        default_factory=list,
    )
    # Result count; defaults to 0.
    total_results: int = Field(
        description="Total number of results found",
        default=0,
    )
    # Outcome marker; defaults to "success".
    status: str = Field(
        description="Operation status",
        default="success",
    )

class BookFlightResponse(BaseModel):
    """Response from the flight booking tool."""

    # Every field carries a default, so an instance can be built with no data.

    # Confirmation identifier; empty string by default.
    booking_id: str = Field(
        description="Booking confirmation ID",
        default="",
    )
    # Identifier of the flight that was booked; empty string by default.
    flight_id: str = Field(
        description="The booked flight ID",
        default="",
    )
    # Outcome marker; defaults to "success".
    status: str = Field(
        description="Booking status",
        default="success",
    )
    # Free-text status message; empty string by default.
    message: str = Field(
        description="Status message",
        default="",
    )

class BookingConfirmationResponse(BaseModel):
    """Response from the booking confirmation tool."""

    # Every field carries a default, so an instance can be built with no data.

    # Delivery flag; False by default.
    confirmation_sent: bool = Field(
        description="Whether confirmation was sent",
        default=False,
    )
    # Delivery channel; defaults to "email".
    method: str = Field(
        description="Delivery method",
        default="email",
    )
    # Free-text confirmation details; empty string by default.
    message: str = Field(
        description="Confirmation details",
        default="",
    )

@tool_simulator.tool(output_schema=FlightSearchResponse)
def search_flights(
    origin: str,
    destination: str,
    date: str,
) -> dict[str, Any]:
    """Search for available flights between two cities on a given date."""
    # Deliberately no implementation: only the signature and docstring are
    # declared here.
    # NOTE(review): presumably the ToolSimulator decorator supplies the actual
    # response, shaped by ``output_schema`` -- confirm against strands_evals docs.

@tool_simulator.tool(output_schema=BookFlightResponse)
def book_flight(
    flight_id: str,
) -> dict[str, Any]:
    """Book a specific flight by its flight ID. Returns booking confirmation."""
    # Deliberately no implementation: only the signature and docstring are
    # declared here.
    # NOTE(review): presumably the ToolSimulator decorator supplies the actual
    # response, shaped by ``output_schema`` -- confirm against strands_evals docs.

@tool_simulator.tool(output_schema=BookingConfirmationResponse)
def send_booking_confirmation(
    booking_id: str = "",
    flight_id: str = "",
    method: str = "email",
) -> dict[str, Any]:
    """Send booking confirmation or fallback link to the user via email or SMS."""
    # Deliberately no implementation: only the signature and docstring are
    # declared here. All three parameters are optional.
    # NOTE(review): presumably the ToolSimulator decorator supplies the actual
    # response, shaped by ``output_schema`` -- confirm against strands_evals docs.

# 2. Create the ChaosPlugin
# A single plugin instance is shared by every Agent built in the task function.
chaos_plugin = ChaosPlugin()

# 3. Define named effect maps
# Layout: scenario name -> tool name -> list of chaos effects for that tool.
# The tool names match the simulated tools registered above.
effect_maps = {
    # Single-tool, pre-hook: tool call is cancelled before execution
    "search_timeout": {
        "search_flights": [ToolCallFailure(error_type="timeout")],
    },
    # Two-tool, post-hook: tools execute but responses are silently corrupted
    "book_corrupt_and_confirm_truncated": {
        "book_flight": [CorruptValues(corrupt_ratio=0.8)],
        "send_booking_confirmation": [TruncateFields(max_length=5)],
    },
    # All-tool, mixed pre+post: combines hard failures with silent corruption
    "total_chaos": {
        "search_flights": [ToolCallFailure(error_type="network_error")],
        "book_flight": [ToolCallFailure(error_type="execution_error")],
        "send_booking_confirmation": [RemoveFields(remove_ratio=0.7)],
    },
}

# 4. Define the task function
# Pre-create tool instances once (avoids registry issues across runs)
# The tools are looked up in the same order as before: search, book, confirm.
_search_tool, _book_tool, _confirm_tool = (
    tool_simulator.get_tool(tool_name)
    for tool_name in ("search_flights", "book_flight", "send_booking_confirmation")
)

def travel_agent_task(case: ChaosCase) -> dict:
    """Run the travel agent on one chaos case and collect its output and trace.

    A fresh ``Agent`` is built for every call, wired to the three pre-created
    simulated tools and the shared ``chaos_plugin``.

    Args:
        case: The case to execute. ``case.input`` is sent to the agent as the
            user message, ``case.name`` is only logged, and ``case.session_id``
            tags the trace attributes and the mapped session.

    Returns:
        A dict with two keys: ``"output"`` is the agent's reply converted to a
        string, or an ``"Agent failed with error: ..."`` message when the call
        raised; ``"trajectory"`` is the session mapped from the spans found in
        ``memory_exporter`` after the call.
    """
    divider = "─" * 60
    logger.info(f"\n{divider}")
    logger.info(f" Case: {case.name}")
    logger.info(f" User: {case.input}")

    system_prompt = (
        "You are a travel booking assistant. You help users search for flights, "
        "book them, and send confirmations. Use the available tools to complete "
        "the user's request. Today's date is May 18, 2025.\n\n"
        "Always use the tools directly — do not ask the user for clarification "
        "if you can infer reasonable values from context.\n\n"
        "If a tool fails or returns an error:\n"
        "- Acknowledge the failure honestly to the user\n"
        "- Try an alternative approach if possible\n"
        "- Do NOT hallucinate successful results\n"
        "- Do NOT retry more than once\n\n"
        "If tool results look suspicious (e.g., $0 fares, past dates):\n"
        "- Inform the user that results seem unreliable\n"
        "- Suggest alternatives"
    )
    travel_agent = Agent(
        system_prompt=system_prompt,
        tools=[_search_tool, _book_tool, _confirm_tool],
        plugins=[chaos_plugin],
        # NOTE(review): presumably None disables the default console output -- confirm.
        callback_handler=None,
        trace_attributes={
            "gen_ai.conversation.id": case.session_id,
            "session.id": case.session_id,
        },
    )

    # Drop spans left over from earlier runs so only this run's spans are mapped.
    memory_exporter.clear()
    try:
        output = str(travel_agent(case.input))
    except Exception as exc:
        # Broad on purpose: any failure becomes the output text (message capped
        # at 200 characters) instead of aborting the whole experiment.
        output = f"Agent failed with error: {type(exc).__name__}: {str(exc)[:200]}"

    # Log at most the first 300 characters, marking truncation with an ellipsis.
    preview = output[:300]
    ellipsis = "..." if len(output) > 300 else ""
    logger.info(f" Agent: {preview}{ellipsis}")
    logger.info(divider)

    spans = memory_exporter.get_finished_spans()
    trajectory = StrandsInMemorySessionMapper().map_to_session(spans, session_id=case.session_id)

    return {"output": output, "trajectory": trajectory}

# 5. Define test cases and expand with effect maps
# Each pair is (case name, user prompt); the list order is preserved.
_case_specs = (
    (
        "book_a_flight",
        "Find me a flight from SFO to JFK on May 20, book the cheapest one, and send me a confirmation.",
    ),
    (
        "search_and_confirm",
        "Search for flights from Seattle to Tokyo next Tuesday, book one, and email me the confirmation.",
    ),
)
test_cases = [Case(name=case_name, input=prompt) for case_name, prompt in _case_specs]

# Expand: 2 cases × (3 effect maps + 1 baseline) = 8 ChaosCase objects
chaos_cases = ChaosCase.expand(
    test_cases,
    effect_maps,
    include_no_effect_baseline=True,
)

# 6. Create and run the ChaosExperiment
# A single evaluator is configured for this example.
evaluators = [GoalSuccessRateEvaluator()]

experiment = ChaosExperiment(evaluators=evaluators, cases=chaos_cases)

# Run: 8 chaos cases = 8 agent invocations
reports = experiment.run_evaluations(task=travel_agent_task)

# Display only the first report.
# NOTE(review): presumably there is one report per evaluator -- confirm against
# the strands_evals docs before adding more evaluators.
first_report = reports[0]
first_report.run_display()