From 65778329696b31775c73a5325401488f993da620 Mon Sep 17 00:00:00 2001 From: "Brian L. Troutwine" Date: Sat, 30 May 2026 17:39:46 +0000 Subject: [PATCH] Vary datadog.yaml in test/antithesis, assert aliveness This PR introduces variation in the datadog.yaml we use under test in the antithesis rig. The goal here is to explore variation in buffer sizes etc. and also startup panics on truly weird configs. ADP aliveness on bootup is asserted via the way it is rigged into the compose cluster, and we add a 'sometimes' check on forwarding in datadog/io.rs. This latter 'sometimes' acts as a checkpoint for antithesis, allowing it to figure out that ADP has reached a nominally functional state and can be explored from that point. The antithesis setup checkpoint is done before datadog.yaml is sampled. Notable things: * first_sample_config runs after setup-checkpoint and before ADP boots, is responsible for creating datadog.yaml and other configs in the future * eventually_adp_alive is a weak check and we may drop it in the future as our coverage improves, but it doesn't hurt anything now * I introduced a harness::rand to encode antithesis-friendly sampling of large domains, this will expand over time * Skill `antithesis-research` has updated its 'scratchbook' but this is a mechanical domain for now, will later convert it to a human-hybrid material --- Cargo.lock | 5 + bin/agent-data-plane/Cargo.toml | 2 +- lib/saluki-components/Cargo.toml | 2 + .../src/common/datadog/io.rs | 13 + test/antithesis/deploy/Dockerfile | 31 ++- test/antithesis/deploy/adp/entrypoint.sh | 21 ++ test/antithesis/deploy/docker-compose.yaml | 16 +- test/antithesis/deploy/workload/entrypoint.sh | 48 +--- test/antithesis/harness/Cargo.toml | 4 + .../harness/src/bin/eventually_adp_alive.rs | 75 ++++++ .../src/bin/first_sample_config/config.rs | 246 ++++++++++++++++++ .../src/bin/first_sample_config/main.rs | 75 ++++++ .../src/bin/parallel_driver_send_dogstatsd.rs | 20 +- test/antithesis/harness/src/lib.rs | 4 + 
test/antithesis/harness/src/rand.rs | 65 +++++ .../scratchbook/existing-assertions.md | 45 ++-- .../properties/adp-keeps-delivering.md | 60 +++++ .../scratchbook/properties/adp-stays-alive.md | 104 ++++++++ .../properties/forwarder-eventual-delivery.md | 20 +- .../scratchbook/property-catalog.md | 99 ++++++- .../scratchbook/property-relationships.md | 29 ++- test/antithesis/scratchbook/sut-analysis.md | 35 ++- 22 files changed, 904 insertions(+), 115 deletions(-) create mode 100644 test/antithesis/deploy/adp/entrypoint.sh create mode 100644 test/antithesis/harness/src/bin/eventually_adp_alive.rs create mode 100644 test/antithesis/harness/src/bin/first_sample_config/config.rs create mode 100644 test/antithesis/harness/src/bin/first_sample_config/main.rs create mode 100644 test/antithesis/harness/src/lib.rs create mode 100644 test/antithesis/harness/src/rand.rs create mode 100644 test/antithesis/scratchbook/properties/adp-keeps-delivering.md create mode 100644 test/antithesis/scratchbook/properties/adp-stays-alive.md diff --git a/Cargo.lock b/Cargo.lock index 837fab9538..b8bfd4d21c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1678,8 +1678,12 @@ dependencies = [ "antithesis_sdk", "anyhow", "clap", + "num-traits", "rand 0.10.1", + "rand_distr", + "serde", "serde_json", + "serde_yaml", ] [[package]] @@ -4165,6 +4169,7 @@ dependencies = [ name = "saluki-components" version = "0.1.0" dependencies = [ + "antithesis_sdk", "arc-swap", "async-trait", "axum", diff --git a/bin/agent-data-plane/Cargo.toml b/bin/agent-data-plane/Cargo.toml index 7b479635d4..17103a09ee 100644 --- a/bin/agent-data-plane/Cargo.toml +++ b/bin/agent-data-plane/Cargo.toml @@ -11,7 +11,7 @@ workspace = true [features] default = [] fips = ["saluki-app/tls-fips", "saluki-components/fips"] -antithesis = ["dep:antithesis_sdk", "antithesis_sdk/full", "dep:antithesis-instrumentation"] +antithesis = ["dep:antithesis_sdk", "antithesis_sdk/full", "dep:antithesis-instrumentation", "saluki-components/antithesis"] 
[dependencies] antithesis-instrumentation = { workspace = true, optional = true } diff --git a/lib/saluki-components/Cargo.toml b/lib/saluki-components/Cargo.toml index 9056455e58..7110808bb5 100644 --- a/lib/saluki-components/Cargo.toml +++ b/lib/saluki-components/Cargo.toml @@ -12,8 +12,10 @@ workspace = true default = [] config-test-support = [] fips = ["saluki-io/fips"] +antithesis = ["dep:antithesis_sdk", "antithesis_sdk/full"] [dependencies] +antithesis_sdk = { workspace = true, optional = true } arc-swap = { workspace = true } async-trait = { workspace = true } axum = { workspace = true } diff --git a/lib/saluki-components/src/common/datadog/io.rs b/lib/saluki-components/src/common/datadog/io.rs index 5ecc3ca209..26289e52f9 100644 --- a/lib/saluki-components/src/common/datadog/io.rs +++ b/lib/saluki-components/src/common/datadog/io.rs @@ -546,6 +546,19 @@ async fn process_http_response( if status.is_success() { debug!(endpoint_url, %status, "Request completed."); + // Reaching a successful intake response means the whole pipeline + // ran. This is a useful signal for process health but also + // acts as a checkpoint anchor for Antithesis replay: at this point + // there is a nominally functional system. + // + // No-op outside the `antithesis` feature build. 
+ #[cfg(feature = "antithesis")] + antithesis_sdk::assert_sometimes!( + true, + "ADP forwarded a payload to the intake", + &serde_json::json!({ "domain": domain }) + ); + telemetry.track_successful_transaction(&metadata, domain); } else { telemetry.track_permanently_failed_transaction(&metadata, Some(status), domain); diff --git a/test/antithesis/deploy/Dockerfile b/test/antithesis/deploy/Dockerfile index d8fe97108d..d8de1bb5c0 100644 --- a/test/antithesis/deploy/Dockerfile +++ b/test/antithesis/deploy/Dockerfile @@ -32,11 +32,14 @@ RUN --mount=type=bind,source=rust-toolchain.toml,target=/tmp/rust-toolchain.toml # --------------------------------------------------------------------------- # Build the instrumented Agent Data Plane. # -# Coverage instrumentation uses the modern Antithesis Rust flow (post-2026-05-22): the -# `antithesis-instrumentation` crate (referenced once in main.rs behind the `antithesis` feature) -# provides the runtime shim, and these RUSTFLAGS enable LLVM sancov coverage. `--build-id` is -# required for symbolization; the release profile sets `debug = true`, so the binary keeps DWARF -# for /symbols. LTO is disabled to keep sancov instrumentation predictable. +# Coverage instrumentation uses the modern Antithesis Rust flow +# (post-2026-05-22): the `antithesis-instrumentation` crate (referenced once in +# main.rs behind the `antithesis` feature) provides the runtime shim, and these +# RUSTFLAGS enable LLVM sancov coverage. `--build-id` is required for +# symbolization; the release profile sets `debug = true`, so the binary keeps +# DWARF for /symbols. LTO is disabled to keep sancov instrumentation +# predictable. `panic = "abort"` (antithesis build only) turns any ADP panic +# into SIGABRT, caught as a hard crash. 
# --------------------------------------------------------------------------- FROM build-base AS adp-builder ENV APP_FULL_NAME="Agent Data Plane" \ @@ -55,6 +58,7 @@ RUN --mount=type=cache,target=/adp/target,id=antithesis-adp-target \ --mount=type=cache,target=/root/.cargo/git,id=cargo-git \ cargo build --release --package agent-data-plane --features antithesis \ --target x86_64-unknown-linux-gnu \ + --config 'profile.release.panic="abort"' \ --config 'target.x86_64-unknown-linux-gnu.rustflags=["--cfg","tokio_unstable","-Ccodegen-units=1","-Cpasses=sancov-module","-Cllvm-args=-sanitizer-coverage-level=3","-Cllvm-args=-sanitizer-coverage-trace-pc-guard","-Clink-args=-Wl,--build-id"]' && \ cp /adp/target/x86_64-unknown-linux-gnu/release/agent-data-plane /usr/local/bin/agent-data-plane && \ echo "Validating Antithesis instrumentation symbols..." && \ @@ -74,11 +78,14 @@ RUN --mount=type=cache,target=/tools/target,id=antithesis-tools-target \ --mount=type=cache,target=/root/.cargo/git,id=cargo-git \ cargo build --release \ --bin datadog-intake --bin millstone \ - --bin parallel_driver_send_dogstatsd --bin finally_verify_delivery && \ + --bin parallel_driver_send_dogstatsd --bin finally_verify_delivery --bin eventually_adp_alive \ + --bin first_sample_config && \ cp /tools/target/release/datadog-intake /usr/local/bin/datadog-intake && \ cp /tools/target/release/millstone /usr/local/bin/millstone && \ cp /tools/target/release/parallel_driver_send_dogstatsd /usr/local/bin/parallel_driver_send_dogstatsd && \ - cp /tools/target/release/finally_verify_delivery /usr/local/bin/finally_verify_delivery + cp /tools/target/release/finally_verify_delivery /usr/local/bin/finally_verify_delivery && \ + cp /tools/target/release/eventually_adp_alive /usr/local/bin/eventually_adp_alive && \ + cp /tools/target/release/first_sample_config /usr/local/bin/first_sample_config # --------------------------------------------------------------------------- # Runtime: Agent Data Plane (SUT). 
@@ -92,8 +99,12 @@ RUN apt-get update && \ COPY --from=adp-builder /usr/local/bin/agent-data-plane /usr/local/bin/agent-data-plane # Expose DWARF/build-id symbols to Antithesis for symbolization (one-hop symlink to the unstripped binary). RUN mkdir -p /symbols && ln -s /usr/local/bin/agent-data-plane /symbols/agent-data-plane -# main.rs requires the bootstrap config file to exist at the default path; ship a minimal standalone config. +# main.rs requires the bootstrap config file to exist at the default path; ship a minimal standalone +# config as a fallback. The boot wrapper overwrites it with the per-replay config written by the +# `first_sample_config` workload command onto the shared `agent-config` volume. COPY test/antithesis/deploy/adp/datadog.yaml /etc/datadog-agent/datadog.yaml +# Boot wrapper: waits for the drawn config sentinel, copies the config into place, then execs ADP. +COPY --chmod=755 test/antithesis/deploy/adp/entrypoint.sh /entrypoint.sh # ADP's control-plane secure API requires an IPC TLS cert (a single PEM holding both certificate and # private key) that the Core Agent normally generates. In standalone mode there is no Core Agent, so # generate a self-signed cert+key. An empty auth_token satisfies the IPC auth config at startup. 
@@ -103,7 +114,7 @@ RUN openssl req -x509 -newkey rsa:2048 -nodes -days 3650 \ cat /tmp/ipc_cert.pem /tmp/ipc_key.pem > /etc/datadog-agent/ipc_cert.pem && \ rm -f /tmp/ipc_cert.pem /tmp/ipc_key.pem && \ touch /etc/datadog-agent/auth_token -ENTRYPOINT ["/usr/local/bin/agent-data-plane"] +ENTRYPOINT ["/entrypoint.sh"] CMD ["run"] # --------------------------------------------------------------------------- @@ -128,7 +139,9 @@ COPY --from=tools-builder /usr/local/bin/millstone /usr/local/bin/millstone COPY --chmod=755 test/antithesis/deploy/workload/setup-complete.sh /opt/antithesis/setup-complete.sh COPY test/antithesis/deploy/workload/test/ /opt/antithesis/test/ # Inject the compiled test-command binaries into the "main" test template. +COPY --from=tools-builder --chmod=755 /usr/local/bin/first_sample_config /opt/antithesis/test/v1/main/first_sample_config COPY --from=tools-builder --chmod=755 /usr/local/bin/parallel_driver_send_dogstatsd /opt/antithesis/test/v1/main/parallel_driver_send_dogstatsd COPY --from=tools-builder --chmod=755 /usr/local/bin/finally_verify_delivery /opt/antithesis/test/v1/main/finally_verify_delivery +COPY --from=tools-builder --chmod=755 /usr/local/bin/eventually_adp_alive /opt/antithesis/test/v1/main/eventually_adp_alive COPY --chmod=755 test/antithesis/deploy/workload/entrypoint.sh /entrypoint.sh ENTRYPOINT ["/entrypoint.sh"] diff --git a/test/antithesis/deploy/adp/entrypoint.sh b/test/antithesis/deploy/adp/entrypoint.sh new file mode 100644 index 0000000000..8593748ba4 --- /dev/null +++ b/test/antithesis/deploy/adp/entrypoint.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Agent Data Plane boot wrapper. +# +# first_sample_config writes this timeline's datadog.yaml + a `ready` sentinel to +# the shared volume; we block on it, copy the config, then `exec` one stable ADP. +# We block indefinitely rather than timing out and exiting non-zero, which would +# be read as an ADP crash. 
The startup log below makes the wait visible in triage, +# so a missing release shows as "waiting…" with no boot rather than a silent hang. + +CONFIG_DIR="${AGENT_CONFIG_DIR:-/agent-config}" + +echo "adp: waiting for ${CONFIG_DIR}/ready (released by first_sample_config)" >&2 +while [ ! -f "${CONFIG_DIR}/ready" ]; do + sleep 1 +done + +cp "${CONFIG_DIR}/datadog.yaml" /etc/datadog-agent/datadog.yaml + +exec /usr/local/bin/agent-data-plane "$@" diff --git a/test/antithesis/deploy/docker-compose.yaml b/test/antithesis/deploy/docker-compose.yaml index 335c5b9b1e..6e3b3bf9a7 100644 --- a/test/antithesis/deploy/docker-compose.yaml +++ b/test/antithesis/deploy/docker-compose.yaml @@ -40,15 +40,11 @@ services: DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" volumes: - dogstatsd-socket:/var/run/datadog + # first_sample_config (workload) writes this timeline's datadog.yaml + ready sentinel here. + - agent-config:/agent-config:ro depends_on: intake: condition: service_healthy - healthcheck: - # ADP's unprivileged API listens on TCP :5100 once the internal supervisor is up. - test: ["CMD-SHELL", "bash -c 'exec 3<>/dev/tcp/localhost/5100'"] - interval: 2s - timeout: 2s - retries: 60 workload: container_name: workload @@ -62,19 +58,15 @@ services: image: workload:latest environment: NO_COLOR: "1" - ADP_HOST: "adp" - ADP_API_PORT: "5100" DSD_SOCKET: "/var/run/datadog/dsd.socket" INTAKE_ADDR: "intake:2049" - INTAKE_HOST: "intake" - INTAKE_PORT: "2049" volumes: - dogstatsd-socket:/var/run/datadog + - agent-config:/agent-config depends_on: - adp: - condition: service_healthy intake: condition: service_healthy volumes: dogstatsd-socket: + agent-config: diff --git a/test/antithesis/deploy/workload/entrypoint.sh b/test/antithesis/deploy/workload/entrypoint.sh index 3ff5e46908..fc51192f30 100644 --- a/test/antithesis/deploy/workload/entrypoint.sh +++ b/test/antithesis/deploy/workload/entrypoint.sh @@ -3,51 +3,9 @@ set -euo pipefail # Workload client entrypoint. 
# -# By the time this runs, docker-compose has gated startup on the `adp` and `intake` services being -# healthy (depends_on: condition: service_healthy). We re-confirm reachability defensively, emit the -# Antithesis `setup_complete` signal, then idle so Antithesis can run test commands from the test -# template at /opt/antithesis/test/v1/. +# Gated on intake-healthy (compose `depends_on`). Emit `setup_complete`, then +# idle so Antithesis runs the test commands. -ADP_HOST="${ADP_HOST:-adp}" -ADP_API_PORT="${ADP_API_PORT:-5100}" -DSD_SOCKET="${DSD_SOCKET:-/var/run/datadog/dsd.socket}" -INTAKE_HOST="${INTAKE_HOST:-intake}" -INTAKE_PORT="${INTAKE_PORT:-2049}" - -wait_for_tcp() { - local host="$1" port="$2" name="$3" tries=60 - echo "Waiting for ${name} (${host}:${port})..." - while (( tries-- > 0 )); do - if (exec 3<>"/dev/tcp/${host}/${port}") 2>/dev/null; then - echo "${name} is reachable." - return 0 - fi - sleep 1 - done - echo "Timed out waiting for ${name} (${host}:${port})." >&2 - return 1 -} - -wait_for_socket() { - local path="$1" name="$2" tries=60 - echo "Waiting for ${name} (${path})..." - while (( tries-- > 0 )); do - if [[ -S "${path}" ]]; then - echo "${name} is reachable." - return 0 - fi - sleep 1 - done - echo "Timed out waiting for ${name} (${path})." >&2 - return 1 -} - -wait_for_tcp "${ADP_HOST}" "${ADP_API_PORT}" "agent-data-plane API" -wait_for_socket "${DSD_SOCKET}" "agent-data-plane DogStatsD socket" -wait_for_tcp "${INTAKE_HOST}" "${INTAKE_PORT}" "datadog-intake" - -echo "System is ready. Emitting setup_complete." /opt/antithesis/setup-complete.sh - -echo "Workload client idle; awaiting Antithesis test commands." +echo "setup_complete emitted; workload idle, awaiting Antithesis test commands." 
exec tail -f /dev/null diff --git a/test/antithesis/harness/Cargo.toml b/test/antithesis/harness/Cargo.toml index 0190ffc517..9d35aa056c 100644 --- a/test/antithesis/harness/Cargo.toml +++ b/test/antithesis/harness/Cargo.toml @@ -16,8 +16,12 @@ clap = { workspace = true, features = [ "std", "usage", ] } +num-traits = { workspace = true } rand = { workspace = true } +rand_distr = { workspace = true } +serde = { workspace = true } serde_json = { workspace = true } +serde_yaml = { workspace = true } [lints.clippy] all = "deny" diff --git a/test/antithesis/harness/src/bin/eventually_adp_alive.rs b/test/antithesis/harness/src/bin/eventually_adp_alive.rs new file mode 100644 index 0000000000..507fa41f27 --- /dev/null +++ b/test/antithesis/harness/src/bin/eventually_adp_alive.rs @@ -0,0 +1,75 @@ +//! Antithesis `eventually_` liveness check: ADP booted and became reachable +//! within a bounded window. +//! +//! `eventually_` commands run in a fault-quiet period, so a node-fault induced +//! kill of ADP does not trip this check but a self-inflicted process exit +//! does. This triggers on ADP's own bugs, rather than antithesis fault +//! injection. +//! +//! We check two signals. First that ADP is reachable on :5100 and second that +//! it created a `DogStatsD` listener socket. 
+ +use std::net::{TcpStream, ToSocketAddrs}; +use std::os::unix::fs::FileTypeExt; +use std::path::PathBuf; +use std::thread::sleep; +use std::time::Duration; + +use antithesis_sdk::prelude::*; +use clap::{builder::NonEmptyStringValueParser, Parser}; +use serde_json::json; + +#[derive(Debug, Parser)] +#[command(name = "eventually_adp_alive")] +struct Config { + #[arg( + long = "adp-api-addr", + env = "ADP_API_ADDR", + default_value = "adp:5100", + value_parser = NonEmptyStringValueParser::new() + )] + adp_api_addr: String, + #[arg( + long = "dsd-socket", + env = "DSD_SOCKET", + default_value = "/var/run/datadog/dsd.socket" + )] + dsd_socket: PathBuf, +} + +fn main() -> anyhow::Result<()> { + antithesis_init(); + let config = Config::try_parse()?; + + let mut api_reachable = false; + let mut socket_present = false; + // Poll up to 60 times, pausing 1s between attempts, until the ADP API is + // reachable and the DogStatsD socket exists. A 1s connect timeout keeps the + // poll cadence bounded even when the API host is unresponsive. 
+ for _ in 0..60 { + api_reachable = config + .adp_api_addr + .to_socket_addrs() + .ok() + .and_then(|mut addrs| addrs.next()) + .is_some_and(|addr| TcpStream::connect_timeout(&addr, Duration::from_secs(1)).is_ok()); + socket_present = config.dsd_socket.metadata().is_ok_and(|m| m.file_type().is_socket()); + if api_reachable && socket_present { + break; + } + sleep(Duration::from_secs(1)); + } + + assert_always!( + api_reachable && socket_present, + "ADP booted: API reachable and DogStatsD socket present", + &json!({ + "adp_api_addr": config.adp_api_addr, + "dsd_socket": config.dsd_socket.display().to_string(), + "api_reachable": api_reachable, + "socket_present": socket_present, + }) + ); + + Ok(()) +} diff --git a/test/antithesis/harness/src/bin/first_sample_config/config.rs b/test/antithesis/harness/src/bin/first_sample_config/config.rs new file mode 100644 index 0000000000..d84f0756ea --- /dev/null +++ b/test/antithesis/harness/src/bin/first_sample_config/config.rs @@ -0,0 +1,246 @@ +//! Configuration model and rendering for Datadog Agent configuration. +//! +//! Primary focus is currently `DogStatsD` but this is, hopefully, easy to expand +//! in the future. + +use std::path::{Path, PathBuf}; +use std::time::Duration; + +use anyhow::Context as _; +use harness::rand::Probe; +use rand::distr::{Distribution, StandardUniform}; +use rand::{Rng, RngExt}; +use serde::{Serialize, Serializer}; + +/// Yaml flags the Agent reads at boot that never vary. +const STATIC_YAML_TAIL: &str = "use_dogstatsd: true +use_v2_api_series: true +inventories_enabled: false +enable_metadata_collection: false +cloud_provider_metadata: [] +"; + +/// A Go `time.Duration`, rendered as a Go duration string (for example `100ms`) +/// — the form the Agent's duration config keys parse. 
+#[derive(Debug, Clone, Copy)] +struct GoDuration(Duration); + +impl Serialize for GoDuration { + fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> { + serializer.collect_str(&format_args!("{}ms", self.0.as_millis())) + } +} + +impl Distribution<GoDuration> for Probe { + fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> GoDuration { + let millis: u64 = self.sample(rng); + GoDuration(Duration::from_millis(millis)) + } +} + +/// A duration the Agent reads as a plain integer number of seconds (`GetInt`), +/// rendered as that integer. +#[derive(Debug, Clone, Copy)] +struct DurationSeconds(Duration); + +impl Serialize for DurationSeconds { + fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> { + serializer.serialize_u64(self.0.as_secs()) + } +} + +impl Distribution<DurationSeconds> for Probe { + fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> DurationSeconds { + let secs: u64 = self.sample(rng); + DurationSeconds(Duration::from_secs(secs)) + } +} + +/// Agent log level. +/// +/// Restricted to quiet levels on purpose. Antithesis enforces a per-hour +/// log-output budget per run and `info`/`debug`/`trace` is a whole awful lot of +/// logs. +#[derive(Debug, Clone, Copy, Serialize)] +#[serde(rename_all = "lowercase")] +pub(crate) enum LogLevel { + /// Warnings and above. + Warn, + /// Errors only. + Error, +} + +impl Distribution<LogLevel> for StandardUniform { + fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> LogLevel { + match rng.random_range(0..2u8) { + 0 => LogLevel::Warn, + _ => LogLevel::Error, + } + } +} + +/// Tag granularity for origin-detected `DogStatsD` tags. +#[derive(Debug, Clone, Copy, Serialize)] +#[serde(rename_all = "lowercase")] +pub(crate) enum TagCardinality { + /// Low-cardinality objects: clusters, hosts, deployments, images. Agent + /// default. + Low, + /// Orchestrator-level: pod (Kubernetes) or task (ECS/Mesos) cardinality. + Orchestrator, + /// High-cardinality objects: individual containers, request user IDs, etc. 
+ High, +} + +impl Distribution<TagCardinality> for StandardUniform { + fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> TagCardinality { + match rng.random_range(0..3u8) { + 0 => TagCardinality::Low, + 1 => TagCardinality::Orchestrator, + _ => TagCardinality::High, + } + } +} + +/// The Agent's `DogStatsD` configuration surface. `dogstatsd_socket` is +/// supplied by the environment; the rest are sampled. +/// +/// Numeric fields are sampled with [`Probe`]: usually a typical value (so ADP +/// boots and runs), occasionally a boundary value to probe overflow and +/// wraparound. +#[allow(clippy::struct_field_names, clippy::struct_excessive_bools)] +#[derive(Debug, Serialize)] +pub(crate) struct DogStatsdConfig { + /// Unix socket the server listens on. Supplied by the environment. + dogstatsd_socket: PathBuf, + /// Buffer used to receive statsd packets, in bytes. + dogstatsd_buffer_size: u64, + /// Bytes for the socket receive buffer (`POSIX`); `0` keeps the OS default. + dogstatsd_so_rcvbuf: u64, + /// Packets buffered before flushing to the processing queue. + dogstatsd_packet_buffer_size: u64, + /// Maximum time packets sit in the packet buffer before a flush. + dogstatsd_packet_buffer_flush_timeout: GoDuration, + /// Internal queue size of the server; smaller caps memory but risks packet + /// drops. + dogstatsd_queue_size: u64, + /// Number of processing pipelines. + dogstatsd_pipeline_count: u64, + /// Worker count processing packets; `0` lets the Agent choose. + dogstatsd_workers_count: u64, + /// Seconds a counter is sampled to `0` after its last value before expiring. + dogstatsd_expiry_seconds: DurationSeconds, + /// Seconds a metric context is kept in memory after its last sample. + dogstatsd_context_expiry_seconds: DurationSeconds, + /// Maximum entries in the string interner cache. + dogstatsd_string_interner_size: u64, + /// Max number of metric-mapping results cached by the mapper. + dogstatsd_mapper_cache_size: u64, + /// Max metrics per payload from the no-aggregation pipeline. 
+ dogstatsd_no_aggregation_pipeline_batch_size: u64, + /// Tag granularity for origin-detected tags. + dogstatsd_tag_cardinality: TagCardinality, + /// Listen for non-local UDP traffic (binds `0.0.0.0`). + dogstatsd_non_local_traffic: bool, + /// Tag metrics with container metadata from the Unix socket peer. + dogstatsd_origin_detection: bool, + /// Use a client-provided container ID to enrich metrics. + dogstatsd_origin_detection_client: bool, + /// Let clients opt out of origin detection via cardinality `none`. + dogstatsd_origin_optout_enabled: bool, + /// Collect basic per-metric statistics (count / last seen). + dogstatsd_metrics_stats_enable: bool, + /// When an `Entity-ID` is set, skip origin-detection tag enrichment. + dogstatsd_entity_id_precedence: bool, + /// Enable the no-aggregation pipeline (forward timestamped metrics with + /// tagging only). + dogstatsd_no_aggregation_pipeline: bool, + /// Flush incomplete metric time buckets on shutdown. + dogstatsd_flush_incomplete_buckets: bool, + /// Automatically adjust the number of processing pipelines. + dogstatsd_pipeline_autoadjust: bool, + /// Publish `DogStatsD` internal stats as Go expvars. + dogstatsd_stats_enable: bool, +} + +impl DogStatsdConfig { + /// Sample the `DogStatsD` options from `rng`, taking the socket from the + /// environment. 
+ fn sample<R: Rng + ?Sized>(rng: &mut R, dogstatsd_socket: &Path) -> Self { + Self { + dogstatsd_socket: dogstatsd_socket.to_path_buf(), + dogstatsd_buffer_size: Probe.sample(rng), + dogstatsd_so_rcvbuf: Probe.sample(rng), + dogstatsd_packet_buffer_size: Probe.sample(rng), + dogstatsd_packet_buffer_flush_timeout: Probe.sample(rng), + dogstatsd_queue_size: Probe.sample(rng), + dogstatsd_pipeline_count: Probe.sample(rng), + dogstatsd_workers_count: Probe.sample(rng), + dogstatsd_expiry_seconds: Probe.sample(rng), + dogstatsd_context_expiry_seconds: Probe.sample(rng), + dogstatsd_string_interner_size: Probe.sample(rng), + dogstatsd_mapper_cache_size: Probe.sample(rng), + dogstatsd_no_aggregation_pipeline_batch_size: Probe.sample(rng), + dogstatsd_tag_cardinality: rng.random(), + dogstatsd_non_local_traffic: rng.random(), + dogstatsd_origin_detection: rng.random(), + dogstatsd_origin_detection_client: rng.random(), + dogstatsd_origin_optout_enabled: rng.random(), + dogstatsd_metrics_stats_enable: rng.random(), + dogstatsd_entity_id_precedence: rng.random(), + dogstatsd_no_aggregation_pipeline: rng.random(), + dogstatsd_flush_incomplete_buckets: rng.random(), + dogstatsd_pipeline_autoadjust: rng.random(), + dogstatsd_stats_enable: rng.random(), + } + } +} + +/// Agent-facing config. `hostname`, `api_key`, `dd_url`, and the socket are +/// supplied by the environment; `log_level` and the `DogStatsD` options are +/// sampled per branch. The static flags are appended by [`Self::to_yaml`], not +/// fields here. +#[derive(Debug, Serialize)] +pub(crate) struct DatadogConfig { + /// Agent hostname. Supplied by the environment. ADP requires it + /// (`FixedHostProvider`); absent it refuses to boot. + hostname: String, + /// Agent API key. Supplied by the environment. + api_key: String, + /// Metrics intake base URL. Supplied by the environment. + dd_url: String, + /// Agent log verbosity. Sampled; restricted to quiet levels (see [`LogLevel`]). 
+ log_level: LogLevel, + /// `DogStatsD` options, flattened to top-level `dogstatsd_*` keys. + #[serde(flatten)] + dogstatsd: DogStatsdConfig, +} + +impl DatadogConfig { + /// Generate a config: the environmental fields come from the caller, the + /// rest are sampled from `rng`. With an Antithesis-backed rng, each call after + /// the snapshot yields an independent draw per replay branch. + pub(crate) fn sample<R: Rng + ?Sized>( + rng: &mut R, hostname: &str, api_key: &str, dd_url: &str, dogstatsd_socket: &Path, + ) -> Self { + Self { + hostname: hostname.to_owned(), + api_key: api_key.to_owned(), + dd_url: dd_url.to_owned(), + log_level: rng.random(), + dogstatsd: DogStatsdConfig::sample(rng, dogstatsd_socket), + } + } + + /// Render `self` as a `datadog.yaml` string, followed by the static-tail + /// flags. + /// + /// # Errors + /// + /// Returns an error if serialization fails. + pub(crate) fn to_yaml(&self) -> anyhow::Result<String> { + let mut yaml = serde_yaml::to_string(self).context("serialize datadog.yaml")?; + yaml.push_str(STATIC_YAML_TAIL); + Ok(yaml) + } +} diff --git a/test/antithesis/harness/src/bin/first_sample_config/main.rs b/test/antithesis/harness/src/bin/first_sample_config/main.rs new file mode 100644 index 0000000000..7ea3cfba7b --- /dev/null +++ b/test/antithesis/harness/src/bin/first_sample_config/main.rs @@ -0,0 +1,75 @@ +//! Antithesis `first_` command: sample this timeline's `datadog.yaml` and release ADP. +//! +//! Runs once per execution path after `setup_complete`, so the sample (see +//! [`config`], Antithesis SDK randomness) is a post-snapshot, per-timeline +//! decision Antithesis branches. Writes the config to the shared `agent-config` +//! volume then a `ready` sentinel the blocked ADP entrypoint waits on; running +//! upstream of ADP's boot is what makes each timeline boot under its own config. +//! Deployment fields come from the environment (see [`Cli`]). 
+ +mod config; + +use std::fs; +use std::path::PathBuf; + +use antithesis_sdk::prelude::*; +use antithesis_sdk::random::AntithesisRng; +use anyhow::Context as _; +use clap::Parser; +use config::DatadogConfig; +use rand::rand_core::UnwrapErr; +use serde_json::json; + +/// Deployment inputs, sourced from the environment (or flags). +#[derive(Debug, Parser)] +#[command(name = "first_sample_config")] +struct Cli { + /// Directory to write `datadog.yaml` and the `ready` sentinel into (shared + /// `agent-config` volume; the ADP container reads it). + #[arg(long, env = "CONFIG_DIR", default_value = "/agent-config")] + config_dir: PathBuf, + /// Agent hostname written into the config. (`DD_HOSTNAME`, not the ambient + /// `HOSTNAME`, so a container's own hostname does not leak in.) + #[arg(long, env = "DD_HOSTNAME", default_value = "antithesis-adp")] + hostname: String, + /// Agent API key written into the config. + #[arg(long, env = "API_KEY", default_value = "antithesis-test-api-key")] + api_key: String, + /// Metrics intake base URL. + #[arg(long, env = "DD_URL", default_value = "http://intake:2049")] + dd_url: String, + /// `DogStatsD` unix datagram socket path. + #[arg(long, env = "DOGSTATSD_SOCKET", default_value = "/var/run/datadog/dsd.socket")] + dogstatsd_socket: PathBuf, +} + +fn main() -> anyhow::Result<()> { + antithesis_init(); + let cli = Cli::parse(); + + fs::create_dir_all(&cli.config_dir) + .with_context(|| format!("create agent config dir {}", cli.config_dir.display()))?; + + let mut rng = UnwrapErr(AntithesisRng); + let config = DatadogConfig::sample( + &mut rng, + &cli.hostname, + &cli.api_key, + &cli.dd_url, + &cli.dogstatsd_socket, + ); + + let yaml_path = cli.config_dir.join("datadog.yaml"); + fs::write(&yaml_path, config.to_yaml()?.as_bytes()) + .with_context(|| format!("write agent config {}", yaml_path.display()))?; + + // Per-timeline anchor: counting these in triage tells us how many distinct + // configs the run sampled. 
+ let details = serde_json::to_value(&config).unwrap_or_else(|e| json!({ "serialize_error": e.to_string() })); + assert_reachable!("first_sample_config.config_sampled", &details); + + // Release ADP: it blocks on this sentinel, then boots under the config above. + let ready_path = cli.config_dir.join("ready"); + fs::write(&ready_path, b"ready\n").with_context(|| format!("write sentinel {}", ready_path.display()))?; + Ok(()) +} diff --git a/test/antithesis/harness/src/bin/parallel_driver_send_dogstatsd.rs b/test/antithesis/harness/src/bin/parallel_driver_send_dogstatsd.rs index ac9f887a3e..9f19dd7c59 100644 --- a/test/antithesis/harness/src/bin/parallel_driver_send_dogstatsd.rs +++ b/test/antithesis/harness/src/bin/parallel_driver_send_dogstatsd.rs @@ -6,7 +6,9 @@ //! grow without bound under sustained high cardinality). use std::os::unix::net::UnixDatagram; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; +use std::thread::sleep; +use std::time::{Duration, Instant}; use antithesis_sdk::prelude::*; use antithesis_sdk::random::AntithesisRng; @@ -49,8 +51,7 @@ fn main() -> anyhow::Result<()> { }; let count: u64 = rng.random_range(50..=2000); - let socket = UnixDatagram::unbound()?; - socket.connect(&config.dogstatsd_socket)?; + let socket = connect_with_retry(&config.dogstatsd_socket)?; let names = ["adp.test.foo", "adp.test.bar", "adp.test.balkajsldfkjasdlfkjasdfz"]; let metric_types = ["c", "g"]; @@ -92,3 +93,16 @@ fn main() -> anyhow::Result<()> { Ok(()) } + +// Wait for ADP to bind the socket, intentionally naive. 
+fn connect_with_retry(path: &Path) -> anyhow::Result { + let deadline = Instant::now() + Duration::from_secs(30); + loop { + let socket = UnixDatagram::unbound()?; + match socket.connect(path) { + Ok(()) => return Ok(socket), + Err(_) if Instant::now() < deadline => sleep(Duration::from_millis(250)), + Err(e) => return Err(e).with_context(|| format!("ADP did not bind {} within 30s", path.display())), + } + } +} diff --git a/test/antithesis/harness/src/lib.rs b/test/antithesis/harness/src/lib.rs new file mode 100644 index 0000000000..8c05e117db --- /dev/null +++ b/test/antithesis/harness/src/lib.rs @@ -0,0 +1,4 @@ +//! Shared helpers for the Antithesis harness, used by the `src/bin/*` test +//! commands. + +pub mod rand; diff --git a/test/antithesis/harness/src/rand.rs b/test/antithesis/harness/src/rand.rs new file mode 100644 index 0000000000..125babecfa --- /dev/null +++ b/test/antithesis/harness/src/rand.rs @@ -0,0 +1,65 @@ +//! Randomness utilities. + +use rand::distr::Distribution; +use rand::{Rng, RngExt}; +use rand_distr::LogNormal; + +/// Boundary values for the u64 field. +const BOUNDARIES: &[u64] = &[ + 0, + 1, + i8::MAX as u64 - 1, + i8::MAX as u64, + i8::MAX as u64 + 1, + u8::MAX as u64 - 1, + u8::MAX as u64, + u8::MAX as u64 + 1, + i16::MAX as u64 - 1, + i16::MAX as u64, + i16::MAX as u64 + 1, + u16::MAX as u64 - 1, + u16::MAX as u64, + u16::MAX as u64 + 1, + i32::MAX as u64 - 1, + i32::MAX as u64, + i32::MAX as u64 + 1, + u32::MAX as u64 - 1, + u32::MAX as u64, + u32::MAX as u64 + 1, + i64::MAX as u64 - 1, + i64::MAX as u64, + i64::MAX as u64 + 1, + u64::MAX - 1, + u64::MAX, +]; + +/// Produces `u64` values that are generally 'normal' and with some being +/// boundary values. 
+#[derive(Debug, Clone, Copy)] +pub struct Probe; + +impl Distribution for Probe { + fn sample(&self, rng: &mut R) -> u64 { + if rng.random_ratio(1, 8) { + BOUNDARIES[rng.random_range(0..BOUNDARIES.len())] + } else { + typical(rng) + } + } +} + +/// Approximate probability of a typical draw landing in each range: +/// +/// | Value range | Probability | +/// |------------------------|-------------| +/// | `<= 16` | ~15% | +/// | `16 ..= 256` | ~21% | +/// | `256 ..= 1_024` | ~14% | +/// | `1_024 ..= 4_096` | ~14% | +/// | `4_096 ..= 65_536` | ~22% | +/// | `65_536 ..= 1_048_576` | ~11% | +/// | `> 1_048_576` | ~4% | +fn typical(rng: &mut R) -> u64 { + let dist = LogNormal::new(1024.0_f64.ln(), 4.0).expect("median > 0 and sigma >= 0"); + num_traits::cast::(dist.sample(rng).round()).unwrap_or(u64::MAX) +} diff --git a/test/antithesis/scratchbook/existing-assertions.md b/test/antithesis/scratchbook/existing-assertions.md index 8a80c280b4..479c17494f 100644 --- a/test/antithesis/scratchbook/existing-assertions.md +++ b/test/antithesis/scratchbook/existing-assertions.md @@ -1,7 +1,7 @@ --- sut_path: /home/ssm-user/src/saluki -commit: fc4bb29728814ddf9321572b954ec28f58faeb53 -updated: 2026-05-30 +commit: 2e4ae1b8be45143882f0dbeb5e74998021c5faf9 +updated: 2026-05-31 external_references: - path: https://datadoghq.atlassian.net/wiki/spaces/DADP/ why: Datadog ADP Confluence space (design notes, weekly summaries, gap analyses) consulted for grounding. @@ -15,17 +15,20 @@ external_references: ## Summary -**A small bootstrap-and-workload assertion set exists**, added by the Antithesis harness commit -(`chore(agent-data-plane): Antithesis test harness and workload`, the parent of this scratchbook -commit). 
It comprises **6 SDK call sites** across three binaries: one lifecycle init and one -bootstrap reachability probe in ADP (both gated behind the `antithesis` cargo feature, no-op in -production), plus two workload-side `assert_reachable!`/`assert_sometimes!` pairs in the harness test -commands. These are **integration probes and anti-vacuity anchors**, not the property-catalog -invariants — none of the 35 cataloged property assertions is implemented yet. +**A bootstrap-and-workload assertion set exists, now with the first liveness instrumentation.** It +comprises **8 SDK call sites**: one lifecycle init and one bootstrap reachability probe in ADP, two +workload-side `assert_reachable!`/`assert_sometimes!` pairs in the harness drivers, and — added +2026-05-31 — the external `eventually_adp_alive` liveness `assert_always!` plus the **first in-SUT +property assertion**, an `assert_sometimes!` at the forwarder 2xx site in `saluki-components`. All +ADP/`saluki-components` sites are gated behind an `antithesis` cargo feature (no-op in production). +The bootstrap probe and the two driver anchors remain **integration probes / anti-vacuity anchors**; +the two new sites are real liveness instrumentation (Category H `adp-stays-alive` and the +good-function half of `adp-keeps-delivering` / in-SUT seed of `forwarder-eventual-delivery`). > [!NOTE] -> A prior version of this file stated no SDK assertions existed. That was true before the harness -> commit landed; it is now stale. Re-research on 2026-05-30 corrected it. +> History: an early version of this file claimed no SDK assertions existed (true before the harness +> commit; corrected 2026-05-30). Updated again 2026-05-31 when the liveness pieces landed (6 → 8 +> sites). ## Assertions present @@ -37,12 +40,17 @@ invariants — none of the 35 cataloged property assertions is implemented yet. 
| `test/antithesis/harness/src/bin/finally_verify_delivery.rs:59` | `assert_sometimes!` | "metrics delivered end-to-end to the intake" (`delivered > 0`) | harness binary | Workload-side liveness anchor — partially seeds `forwarder-eventual-delivery`. | | `test/antithesis/harness/src/bin/parallel_driver_send_dogstatsd.rs:77` | `assert_reachable!` | "workload sent a dogstatsd batch" | harness binary | Confirms the DSD driver actually emitted load. | | `test/antithesis/harness/src/bin/parallel_driver_send_dogstatsd.rs:87` | `assert_sometimes!` | "workload drove a high-cardinality dogstatsd flood" (`regime == High`) | harness binary | Anti-vacuity anchor that timelines reach the high-cardinality regime — seeds `rss-bounded-under-cardinality`. | +| `test/antithesis/harness/src/bin/eventually_adp_alive.rs:62` | `assert_always!` | "ADP booted: API reachable and DogStatsD socket present" | harness binary (`eventually_`, faults-paused) | Death-liveness for `adp-stays-alive` — fails the branch when ADP self-crashed (config panic / load) but stayed down through the quiet period. | +| `lib/saluki-components/src/common/datadog/io.rs:553` | `assert_sometimes!` | "ADP forwarded a payload to the intake" (`{ domain }`) | `#[cfg(feature = "antithesis")]` | First in-SUT property assertion — good-function liveness (the full pipeline ran to a 2xx) + replay checkpoint; good-function half of `adp-keeps-delivering`, in-SUT seed of `forwarder-eventual-delivery`. | Dependency wiring: ADP gains the SDK only under the `antithesis` feature (`bin/agent-data-plane/Cargo.toml:14` → `dep:antithesis_sdk`, `antithesis_sdk/full`, -`dep:antithesis-instrumentation`); the harness crate depends on `antithesis_sdk` unconditionally -(`test/antithesis/harness/Cargo.toml`). `antithesis-instrumentation` is an external build-time -instrumentation crate, not a source of in-tree assertions. +`dep:antithesis-instrumentation`, and now `saluki-components/antithesis`). 
As of 2026-05-31 +`saluki-components` has its own optional `antithesis` feature (`dep:antithesis_sdk`, +`antithesis_sdk/full`), enabled transitively by the ADP feature — this is what lets in-SUT property +assertions live in the component crate, not just in the ADP binary. The harness crate depends on +`antithesis_sdk` unconditionally (`test/antithesis/harness/Cargo.toml`). `antithesis-instrumentation` +is an external build-time instrumentation crate, not a source of in-tree assertions. ## How this was determined @@ -55,11 +63,12 @@ Searched the repository with ripgrep over `*.rs` and `*.toml`: ## Implication for property work -The catalog's invariants are still **net-new instrumentation**. The two `assert_sometimes!` anchors -above are workload-side only and serve anti-vacuity, not the safety/liveness invariants themselves: +Most catalog invariants are still **net-new instrumentation**, but the pattern is now proven in-SUT: -- `forwarder-eventual-delivery` has a workload-side `Sometimes(delivered > 0)` but no SUT-side - no-loss `Always`/accounting assertion — that remains to be added. +- `forwarder-eventual-delivery` now has an **in-SUT** `Sometimes(forwarded a payload)` at the 2xx + site (io.rs:553) in addition to the workload-side `Sometimes(delivered > 0)`. The full no-loss + `Always`/accounting reconciliation (delivered == accepted-and-retryable after a transient outage) + is still net-new. - `rss-bounded-under-cardinality` has its high-cardinality `Sometimes` anchor but no SUT-side RSS or interner-bound `Always` — also net-new. 
- The ~17 properties requiring in-process SUT-side assertions (per evaluation R2) still need ADP to diff --git a/test/antithesis/scratchbook/properties/adp-keeps-delivering.md b/test/antithesis/scratchbook/properties/adp-keeps-delivering.md new file mode 100644 index 0000000000..67b571b22f --- /dev/null +++ b/test/antithesis/scratchbook/properties/adp-keeps-delivering.md @@ -0,0 +1,60 @@ +--- +slug: adp-keeps-delivering +title: ADP still processes and delivers after load (functional liveness) +type: Liveness +priority: Medium +sut_path: /home/ssm-user/src/saluki +commit: 2e4ae1b8be45143882f0dbeb5e74998021c5faf9 +updated: 2026-05-31 +status: partial — in-SUT good-function `Sometimes` LANDED; per-branch wedge `assert_always` MISSING +--- + +# adp-keeps-delivering — ADP still processes and delivers after load + +## Property (one sentence) +After the load drivers run, in a faults-paused window the mock intake has received metrics *and* ADP +still serves `:5100` — i.e. ADP is not merely up but not wedged. + +## Origin +Strengthens [`adp-stays-alive`](adp-stays-alive.md): a process can be reachable on `:5100` yet have +stopped processing/forwarding (deadlock, stalled pipeline, dropped-everything). The existing +`finally_verify_delivery` harness command already polls the mock intake and fires a +`Reachable`/`Sometimes` anchor for end-to-end delivery — this property upgrades that to a per-branch +liveness assertion so a wedged-but-alive ADP becomes a counterexample. + +## Relationship to existing instrumentation +- `existing-assertions.md`: `finally_verify_delivery` carries a `Sometimes(delivered > 0)` anchor — + good for "delivery happens at least once across the run," but it does **not** fail on a branch + where ADP wedged after accepting load. This property is the missing per-branch `assert`. 
+ +## The fault-gating mechanism +Same as `adp-stays-alive`: evaluate in a quiet period (`finally_`, or `ANTITHESIS_STOP_FAULTS`), so a +fault that merely delayed delivery recovers and passes, while a self-inflicted wedge persists and +fails. Note: no-loss/delivery reconciliation must use UDS or TCP ingress, not UDP (catalog R3). + +## Implementation status +- **Landed (good-function half):** an in-SUT `assert_sometimes!("ADP forwarded a payload to the + intake", { domain })` at the forwarder's 2xx site + (`lib/saluki-components/src/common/datadog/io.rs`, in the `status.is_success()` branch of + `process_http_response`), behind the new `saluki-components/antithesis` feature (enabled + transitively by `agent-data-plane/antithesis`). A 2xx means the whole ingest→aggregate→encode→ + forward pipeline ran, so this proves a *booted ADP actually works*, and as a `Sometimes` it also + gives Antithesis a replay checkpoint anchored on a healthy-forwarding state. This is the in-SUT + counterpart to the workload-side `Sometimes(delivered > 0)` already in `finally_verify_delivery`, + and it doubles as the in-SUT seed for [`forwarder-eventual-delivery`](forwarder-eventual-delivery.md). +- **Still net-new (the per-branch wedge detector):** extend the `finally_verify_delivery` command so + that, in the faults-paused window, it polls the mock intake's dump endpoint and `:5100` and asserts + `assert_always!(delivered_recently && reachable, …)`. This is what catches "ADP accepted load, then + wedged" on a *specific* branch — neither the run-wide `Sometimes` above nor a bare `:5100` + reachability check fails on that branch. + +## Assertion-type rationale +**Liveness** (a good thing — delivery — eventually happens after load), realized as an +`assert_always!` inside the faults-paused `finally_` after a bounded poll, for the same reason as +`adp-stays-alive`. 
+ +## Open Questions +- "Delivered recently" needs a freshness window relative to the last driver batch — define it so a + stale earlier delivery doesn't mask a current wedge. +- Whether to count only metrics or also events/service-checks delivered (ties to + `events-sc-no-silent-loss`). diff --git a/test/antithesis/scratchbook/properties/adp-stays-alive.md b/test/antithesis/scratchbook/properties/adp-stays-alive.md new file mode 100644 index 0000000000..df3bcd2bcc --- /dev/null +++ b/test/antithesis/scratchbook/properties/adp-stays-alive.md @@ -0,0 +1,104 @@ +--- +slug: adp-stays-alive +title: ADP boots and stays serving (self-inflicted-crash liveness) +type: Liveness +priority: High +sut_path: /home/ssm-user/src/saluki +commit: 2e4ae1b8be45143882f0dbeb5e74998021c5faf9 +updated: 2026-05-31 +status: LANDED as the `eventually_adp_alive` test command +--- + +# adp-stays-alive — ADP boots and stays serving + +## Property (one sentence) +After the per-replay `datadog.yaml` is applied and the workload runs, ADP's unprivileged API +(`:5100`) is reachable within a faults-paused window; if it never comes up, ADP died of its own +config or load — not of an injected node fault. + +## Origin +The harness now generates per-replay configs (`datadog-yaml-config-gen`) and adversarial load whose +boundary values *should* sometimes crash ADP — e.g. an oversized `dogstatsd_string_interner_size` +panics at boot (`capacity would overflow isize::MAX`). But nothing demonstrated ADP is alive: the +only ADP-side assertion was a `reachable` bootstrap probe, which fires on success and is satisfied as +long as *some* branch boots — it cannot flag the branch where ADP died. A process that panics also +cannot self-assert its own liveness after the fact. So the catch must be an external observer. 
+ +> **Provenance clarification (the misunderstanding worth recording).** That interner boot-panic was +> first seen only via local **`snouty validate`** — the single-config smoke run at launch time, +> *outside* any Antithesis timeline. That is **not** the same as an Antithesis shot finding the crash: +> `validate` exercises one static `datadog.yaml`, whereas a run draws the whole config space across +> timelines. Pre-`eventually_adp_alive`, **no in-run mechanism** would have turned such a boot/load +> crash into a counterexample. This property is exactly that missing in-run catch; do not conflate +> "validate rejected a config locally" with "a shot found the bug." + +## The fault-gating mechanism (the crux) +The requirement is: trigger on self-inflicted death (panic on startup, crash from load) but **not** +on death caused by injected node faults (kill/pause/stop/throttle/clock). A quiet period separates +the two: +- `eventually_` and `finally_` test commands run with **faults already paused**; `ANTITHESIS_STOP_FAULTS` + gives the same mid-run. +- During a quiet period a **fault-killed** container is restored by the platform, so fault-induced + down recovers → the liveness check passes. +- A **self-inflicted** crash is config/load-driven and deterministic: it crash-loops or stays dead + even with faults paused → `:5100` never binds → the check fails. + +## Observation points considered +- **`:5100` API reachability (chosen):** external TCP check; survives the crash; the existing + `deploy/workload/entrypoint.sh` already polls it. Clean and direct. +- **End-to-end intake delivery:** stronger (alive *and* functional) — split into + [`adp-keeps-delivering`](adp-keeps-delivering.md). +- **Container-exit / built-in crash detection:** opaque; runs show it not firing for our boot panic + (open question); and it would also trip on fault-kills (not gated). Rejected as the primary. 
+- **In-SUT assertion:** rejected *for the death case* — a panicking process can't report its own + death, so liveness-on-crash must be observed externally. (Note: `saluki-components` *has* since + gained an `antithesis` feature + SDK dep for the **good-function** anchor in + [`adp-keeps-delivering`](adp-keeps-delivering.md); that proves a *booted* ADP works, which is a + different question from detecting a *dead* one.) + +## Why this image makes API liveness valid (vs. catalog note R1) +Note R1 says container/API liveness is vacuously green because the **production** ADP image runs an +s6 supervisor that auto-restarts ADP. The **harness** adp image is different: `deploy/Dockerfile` +adp stage is a bare binary + boot wrapper (`ENTRYPOINT ["/entrypoint.sh"]` → `agent-data-plane +run`), no supervisor. So a crash is not silently restarted, and a deterministic config/load crash +leaves `:5100` permanently unbound — API liveness is a real signal here. If the harness ever adopts +an auto-restart image, this property must move to a restart-count assertion (per R1). + +## Implementation (landed) +Realized as the `eventually_adp_alive` test command +(`test/antithesis/harness/src/bin/eventually_adp_alive.rs`). In the faults-paused `eventually_` +window it polls **both** ADP's `:5100` API (`TcpStream::connect`) and the DogStatsD listener socket +(`/var/run/datadog/dsd.socket` exists) for up to ~60×1s, then +`assert_always!(api_reachable && socket_present, …)` with the addresses in the details. Checking the +socket as well as `:5100` is slightly stronger than the original sketch: it confirms ADP got far +enough through bootstrap to *accept metrics*, not just to bind its control API. The assertion fires +once per branch; a branch where ADP self-crashed never satisfies both → counterexample. The workload +container intentionally gates on adp `service_started` (not `service_healthy`) so this command still +runs — and can observe a dead ADP — when ADP never becomes healthy. 
+ +## Assertion-type rationale +**Liveness**, realized as an `assert_always!` *inside a faults-paused command after a bounded +recovery poll* — within that command ADP must be up, so a single always-evaluation is the right fit; +the quiet-period prefix supplies the fault discrimination rather than the assertion type. + +## Open Questions +- Does Antithesis's built-in container-exit detection already observe ADP boot-panics in this + topology? Runs show it reporting nothing — confirm via an `antithesis-query-logs` search for the + adp exit / the `isize::MAX` panic. `(needs human input)` +- Does a deterministic boot crash actually crash-loop, or exit once and stay down, under the harness + compose (no `restart:` policy on the adp service)? Either way `:5100` stays down; confirm. +- Mid-run crash coverage (workload masks a transient crash) needs an `ANTITHESIS_STOP_FAULTS` + liveness loop — deferred to a follow-up. + +## Investigation Log +- 2026-05-31: **Landed** as `eventually_adp_alive` (poll `:5100` + DSD socket, faults-paused + `eventually_`, `assert_always!`). Decoupled the workload from adp health (`service_started`) and + made the workload entrypoint non-gating so the check runs even when ADP is down. `snouty validate` + registers it ("1 eventually script"). Clarified detection provenance in Origin: the interner + boot-panic was a `snouty validate` finding, not an in-run one — this command is the in-run catch. +- 2026-05-31: Confirmed harness adp image is bare (no s6) from `deploy/Dockerfile` adp stage + (`ENTRYPOINT ["/entrypoint.sh"]`, `CMD ["run"]`) — so API liveness is non-vacuous here, reconciling + with catalog R1 which describes the production s6 image. +- 2026-05-31: Reviewed Antithesis fault model: `eventually_`/`finally_` run faults-paused; + `ANTITHESIS_STOP_FAULTS` for mid-run quiet periods; killed containers are restored during the + quiet period — basis for the self-inflicted-vs-fault discrimination above. 
diff --git a/test/antithesis/scratchbook/properties/forwarder-eventual-delivery.md b/test/antithesis/scratchbook/properties/forwarder-eventual-delivery.md index 98864f58b7..d8c8c940ef 100644 --- a/test/antithesis/scratchbook/properties/forwarder-eventual-delivery.md +++ b/test/antithesis/scratchbook/properties/forwarder-eventual-delivery.md @@ -5,7 +5,7 @@ commit: 042f41db3bd97118c38981765fd49696fce9d318 updated: 2026-05-28 type: Liveness priority: High -assertion_status: MISSING (net-new instrumentation) +assertion_status: PARTIAL — in-SUT `Sometimes(forwarded a payload)` landed 2026-05-31; recovery-reconciliation `Sometimes` still net-new --- # Property: After a transient intake outage clears, accepted-and-retryable transactions are eventually delivered @@ -79,12 +79,18 @@ expectation: every transaction that was (a) accepted and (b) retryable is eventu - Circuit breaker backoff schedule (exponential + jitter) — sets recovery latency, hence the size of the "eventually" window the assertion must allow. -## Suggested assertion (MISSING — net-new) -- **Sometimes(all-accepted-retryable-delivered-after-recovery)**: at least once, after a transient - outage clears and within a bounded window, the count of delivered transactions equals the count - of accepted-and-retryable transactions submitted before/during the outage (queue did not overflow). - This proves recovery actually happens. Best evaluated workload-side by reconciling the controlled - input set against the mock-intake received set. +## Suggested assertion +- **Landed 2026-05-31 — in-SUT delivery anchor:** `assert_sometimes!("ADP forwarded a payload to the + intake", { domain })` at the success branch of `process_http_response` (io.rs:553, behind + `saluki-components/antithesis`). 
This is the in-SUT proof that delivery *happens* (a 2xx from the + intake) and a replay checkpoint on a healthy-forwarding state — but it is **not** the recovery + property: a run-wide `Sometimes(forwarded)` is satisfied by any single delivery and says nothing + about *post-outage* completeness. +- **Still net-new — Sometimes(all-accepted-retryable-delivered-after-recovery):** at least once, after + a transient outage clears and within a bounded window, the count of delivered transactions equals + the count of accepted-and-retryable transactions submitted before/during the outage (queue did not + overflow). This proves recovery actually happens. Best evaluated workload-side by reconciling the + controlled input set against the mock-intake received set. - Supporting **Reachable**: the `Error::Open` re-enqueue path (`io.rs:468-474`) is hit at least once (proves the circuit breaker engaged and re-enqueued, not silently dropped). diff --git a/test/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md index bfcf0f7cba..73160b962a 100644 --- a/test/antithesis/scratchbook/property-catalog.md +++ b/test/antithesis/scratchbook/property-catalog.md @@ -1,7 +1,7 @@ --- sut_path: /home/ssm-user/src/saluki -commit: fc4bb29728814ddf9321572b954ec28f58faeb53 -updated: 2026-05-30 +commit: 2e4ae1b8be45143882f0dbeb5e74998021c5faf9 +updated: 2026-05-31 external_references: - path: https://datadoghq.atlassian.net/wiki/spaces/DADP/ why: ADP Confluence space — headline guarantees and gap analyses that seed properties. @@ -15,7 +15,7 @@ external_references: # Property Catalog: Agent Data Plane (ADP) -35 properties across 7 categories. The system makes one headline guarantee — **"ADP will not +37 properties across 8 categories. The system makes one headline guarantee — **"ADP will not crash under load, losing customer data"** — which decomposes into the *Memory & Resource Bounds* and *Data Integrity & No Silent Loss* families. 
The remaining categories cover aggregation correctness, lifecycle/config, untrusted-input parsing, concurrency, and **transform & enrichment @@ -25,11 +25,12 @@ correctness** (Category G, added after evaluation — ADP as a *transformer*, no > service-checks; G2 transform-chain + runtime filter config-reload), applied 9 refinements, and > escalated one scope bias (traces/APM/logs/OTLP coverage). See `evaluation/synthesis.md`. -**Only bootstrap/workload SDK probes exist so far** (`existing-assertions.md`: 6 call sites — an -ADP `antithesis_init()` + bootstrap `assert_reachable!` behind the `antithesis` feature, and two -workload-side `assert_reachable!`/`assert_sometimes!` pairs in the harness). Every `Invariant` -below is still **net-new** SUT-side instrumentation. Several properties are **expected to fail by -design** under default config (memory limiter disabled, interner heap-fallback enabled, disk +**Instrumentation is mostly net-new, with the first pieces landed** (`existing-assertions.md`: 8 call +sites). Beyond the ADP bootstrap probe and workload-side anchors, two liveness pieces landed +2026-05-31: the external `eventually_adp_alive` command (Category H) and the **first in-SUT property +assertion** — an `assert_sometimes!` at the forwarder 2xx site in `saluki-components`. Every other +`Invariant` below is still **net-new** SUT-side instrumentation. Several properties are **expected to +fail by design** under default config (memory limiter disabled, interner heap-fallback enabled, disk persistence off) — these are flagged; they are the highest-value findings, not catalog errors. Provenance tags `[Fn]` after each slug name the discovery focus that surfaced it: @@ -648,6 +649,60 @@ partner's documented focus (the "Tag Filter RC Relay Stress Test"). These proper - Is the prefix-filter-after-mapper ordering load-bearing for equivalence, with any guard besides this property? 
- A reload updating one filter but lagging the other could filter at one stage but not the other for the same rule — confirm reachability. +## Category H — Liveness & Availability + +Properties that demonstrate ADP **boots and stays alive**. They exist because the +generated per-replay `datadog.yaml` configs and adversarial load *should* sometimes crash ADP +(`interner-full-bounded`, `rss-bounded-under-cardinality`, `config-incompatible-refuses-start`), yet +the bootstrap `assert_reachable!` only fires on success and cannot report a branch where ADP died. +The death-liveness catch watches from *outside* the SUT and is **fault-gated**: it evaluates in a +quiet period (faults paused — the `eventually_`/`finally_` prefixes do this, or +`ANTITHESIS_STOP_FAULTS`), so a node-fault-induced outage recovers and passes while a self-inflicted +crash (panic on startup, crash from load) persists and fails. That gating is exactly the requirement: +*trigger on self-inflicted death, not on injected node faults.* A complementary **in-SUT +good-function** `Sometimes` (the forwarder shipped a payload) shows a booted ADP actually works. + +> **Detection provenance (clarification, 2026-05-31).** Before this category landed, the *only* place +> a config-driven boot panic was ever observed was local **`snouty validate`** — a single-config +> smoke run done at launch time, outside any Antithesis timeline (e.g. an oversized +> `dogstatsd_string_interner_size` → `capacity would overflow isize::MAX`). No **in-run** mechanism +> caught it: a panicking ADP cannot self-assert, and the bootstrap `assert_reachable!` is silent on +> the dead branch. `eventually_adp_alive` (below) is what makes such boot/load crashes +> **in-run-detectable** — `snouty validate` catching one static config is *not* the same as an +> Antithesis shot finding it across the drawn-config space. 
+ +### adp-stays-alive — ADP boots and stays serving (self-inflicted-crash liveness) +> **Status (2026-05-31): LANDED** as the `eventually_adp_alive` test command +> (`test/antithesis/harness/src/bin/eventually_adp_alive.rs`). Valid in *this* harness because the adp +> image is a bare binary + boot wrapper (no s6 supervisor) — unlike the production image, where API +> liveness is vacuously green (note R1; use restart-count there). +| | | +|---|---| +| **Type** | Liveness (ADP eventually serves), evaluated in a faults-paused window | +| **Property** | After the per-replay config is applied and the workload runs, ADP's unprivileged API (`:5100`) is reachable **and** the DogStatsD listener socket exists; if either fails to come up in a quiet period, ADP died of its own config/load, not an injected fault. | +| **Invariant** | The `eventually_` command (faults already paused) polls `:5100` and the DSD socket for ~60×1s, then `assert_always!(api_reachable && socket_present, …)`. Fault-induced down recovers in the quiet period → passes; a deterministic config/load crash crash-loops or stays dead → never binds → fails. | +| **Antithesis Angle** | The crash is config-driven (drawn `dogstatsd_*` boundary values); a deterministic boot panic stays down across the whole quiet period regardless of restart policy, so the quiet period cleanly separates a real bug from a transient node fault. | + +### adp-keeps-delivering — ADP still processes and delivers after load (functional liveness) +> **Status (2026-05-31): PARTIAL.** The in-SUT good-function half **landed**: an `assert_sometimes!` +> at the forwarder's 2xx site (`lib/saluki-components/src/common/datadog/io.rs`, behind the new +> `saluki-components/antithesis` feature) fires when ADP ships a payload — proving a booted ADP runs +> the whole pipeline, and giving Antithesis a replay checkpoint anchored on a healthy state.
The +> stronger **per-branch wedge detector** (an `assert_always!(delivered_recently && reachable)` in a +> faults-paused `finally_`) is **still net-new** — the landed `Sometimes` does not fail a branch where +> ADP accepted load then wedged. +| | | +|---|---| +| **Type** | Liveness (accepted load is eventually delivered), faults-paused window | +| **Property** | After the load drivers, in a quiet period the mock intake has received metrics *and* ADP still serves `:5100` — ADP is not merely up but not wedged. | +| **Invariant** | _Landed:_ in-SUT `Sometimes(forwarder shipped a payload)`. _Pending:_ `finally_` command that, in the faults-paused window, polls the mock intake's dump endpoint and `:5100` and asserts `Always(delivered_recently && reachable)` — catches "alive but stuck" that a bare reachability check (and a run-wide `Sometimes`) both miss. | + +**Open Questions (Category H)** +- Does Antithesis's built-in container-exit detection already see ADP boot-panics here? Runs show it + not firing — confirm via log search before assuming Category H is the only catch. `(needs human input)` +- Terminal `finally_`/`eventually_` only checks end-of-branch; a mid-run crash the workload papers + over needs an `ANTITHESIS_STOP_FAULTS` liveness loop (deferred). + ## Catalog-wide notes - **Default config is hostile to the bounded-memory family:** memory limiter disabled @@ -677,12 +732,14 @@ partner's documented focus (the "Tag Filter RC Relay Stress Test"). These proper (`malformed-dsd-no-crash`, `malformed-event-sc-no-crash`, `replay-no-panic-on-malformed-capture`, the aggregate-crash pair) must assert SUT-side `Unreachable` at panic sites — or assert on restart-count — **never** container liveness.
-- **(R2, updated 2026-05-30) The Antithesis Rust SDK is now wired into ADP** behind the `antithesis` - cargo feature (`antithesis_init()` + a bootstrap `assert_reachable!`), and the harness binaries - carry workload-side anchors — so the "fork ADP + add the SDK + build an instrumented image" - prerequisite is largely satisfied (the wiring is proven). ~17 properties still need their net-new - in-process SUT-side **invariant** assertions landed on top of that scaffold; the ~10 workload-only - properties (forwarder delivery, retry-queue bounds, shutdown, config-gate, RSS) can run first. +- **(R2, updated 2026-05-31) The Antithesis Rust SDK is wired into ADP** behind the `antithesis` + cargo feature (`antithesis_init()` + a bootstrap `assert_reachable!`), and as of 2026-05-31 the + **first in-SUT property assertion** is landed: an `assert_sometimes!` at the forwarder 2xx site in + `saluki-components` (its own new `antithesis` feature, enabled transitively by + `agent-data-plane/antithesis`). So in-process instrumentation is no longer just the bootstrap probe + — the path from a catalog property to a real SUT-side assertion is proven end-to-end. ~16 properties + still need their net-new in-process SUT-side **invariant** assertions; the remaining workload-only + properties (retry-queue bounds, shutdown, config-gate, RSS) can run first. - **(R3) No-loss properties must use TCP or UDS ingress, not UDP** — UDP's inherent packet loss confounds any "accepted == delivered" reconciliation (`no-silent-interconnect-drop`, `forwarder-eventual-delivery`, `disk-persisted-retry-survives-restart`, `shutdown-drains-no-loss`, @@ -696,6 +753,20 @@ partner's documented focus (the "Tag Filter RC Relay Stress Test"). These proper properties and `config-runtime-update-not-revalidated`) require the **config-stream add-on topology** (Core Agent or stub) — they pass vacuously in standalone mode because the config watcher never fires. 
+- **(R5, updated 2026-05-31) Liveness observability is valid in this harness, and the catch is now + landed (cf. R1's production case).** The harness `adp` image is a bare binary + boot wrapper + (`deploy/Dockerfile` adp stage: `ENTRYPOINT ["/entrypoint.sh"]` → `agent-data-plane run`) with **no + s6 supervisor**, so external `:5100` / DSD-socket liveness is a real signal for Category H — a + deterministic config/load crash leaves them permanently unbound. As of 2026-05-31 the + `eventually_adp_alive` command realizes this, and the workload no longer gates on adp health + (`docker-compose.yaml`: `service_started`, not `service_healthy`) so the check runs even when ADP is + down. R1's "never use container liveness" applies to the production s6 image; if the harness ever + adopts an auto-restart image, switch Category H to a restart-count assertion. +- **(R6) Observed fault availability contradicts the Scope note.** Despite faults being recorded as + tenant-enabled below, runs launched with the `basic_test` webhook have injected **zero** fault + events — the `node - kill/pause/stop/throttle` and `clock - skip` total-fault-event properties + report `0/0`. So Category H's fault-gating is currently moot (no faults to mistake for a crash) but + is the right design for when faults fire; the webhook's fault configuration needs confirming. 
## Scope (confirmed with user, 2026-05-28) diff --git a/test/antithesis/scratchbook/property-relationships.md b/test/antithesis/scratchbook/property-relationships.md index 866de00617..4cca3536b0 100644 --- a/test/antithesis/scratchbook/property-relationships.md +++ b/test/antithesis/scratchbook/property-relationships.md @@ -1,7 +1,7 @@ --- sut_path: /home/ssm-user/src/saluki -commit: fc4bb29728814ddf9321572b954ec28f58faeb53 -updated: 2026-05-30 +commit: 2e4ae1b8be45143882f0dbeb5e74998021c5faf9 +updated: 2026-05-31 external_references: - path: https://datadoghq.atlassian.net/wiki/spaces/DADP/ why: ADP Confluence space — headline guarantees and gap analyses that seed properties. @@ -15,7 +15,7 @@ external_references: # Property Relationships -Lightweight clustering of the 35 catalog properties by shared code paths, failure mechanisms, and +Lightweight clustering of the 37 catalog properties by shared code paths, failure mechanisms, and suspected dominance. Slugs match `property-catalog.md`. ## Cluster 1 — Bounded memory (the determinism story) @@ -160,7 +160,28 @@ Properties: `events-sc-no-silent-loss`, `malformed-event-sc-no-crash`, `events-s - **Anti-vacuity dependency:** `events-sc-pipeline-reachable` is the R4 anchor that keeps the other two from passing trivially under a metrics-dominated workload — a hard dependency, not just a relation. -## Shared-scenario pairs (R10 — count is not 35 independent test efforts) +## Cluster 10 — Liveness & availability observers (added 2026-05-31) + +Properties: `adp-stays-alive`, `adp-keeps-delivering`. + +- **Status (2026-05-31): landed.** `adp-stays-alive` is realized as the external `eventually_adp_alive` + command; the good-function half of `adp-keeps-delivering` is realized as an in-SUT + `assert_sometimes!` at the forwarder 2xx site. The per-branch "alive but wedged" `assert_always` for + `adp-keeps-delivering` remains net-new. 
+- **The external catch for every crash property.** Clusters 1, 4, 5, and 6 predict crashes + (interner/RSS overflow, aggregate panics, config-driven startup refusal, malformed-input panics); + Cluster 10 is what turns those into counterexamples, with the *death* check from *outside* the SUT + (a panicking ADP cannot self-assert) and the *good-function* check from inside. `adp-stays-alive` + dominates — `adp-keeps-delivering` adds the "alive but wedged" facet; its landed half lives at the + forwarder 2xx site, and its pending per-branch wedge detector would share the `finally_verify_delivery` + site. +- **Fault-gated, not fault-blind.** Both evaluate in a faults-paused window (`eventually_`/`finally_` + or `ANTITHESIS_STOP_FAULTS`), which is the line between a self-inflicted crash (persists → fail) + and an injected node fault (recovers → pass). This is a hard design dependency, not a relation. +- **Image dependency (R5).** Valid only because the harness adp image has no auto-restart supervisor; + on a production s6 image these become restart-count assertions (R1). + +## Shared-scenario pairs (R10 — count is not 37 independent test efforts) These pairs share a fault scenario / assertion site and should be implemented together; treat them as one test effort each for portfolio-sizing: diff --git a/test/antithesis/scratchbook/sut-analysis.md b/test/antithesis/scratchbook/sut-analysis.md index d3d6e76198..33f5cf86e5 100644 --- a/test/antithesis/scratchbook/sut-analysis.md +++ b/test/antithesis/scratchbook/sut-analysis.md @@ -1,7 +1,7 @@ --- sut_path: /home/ssm-user/src/saluki -commit: fc4bb29728814ddf9321572b954ec28f58faeb53 -updated: 2026-05-30 +commit: 2e4ae1b8be45143882f0dbeb5e74998021c5faf9 +updated: 2026-05-31 external_references: - path: https://datadoghq.atlassian.net/wiki/spaces/DADP/ why: ADP Confluence space — headline guarantees, Phase 1 bug bash, gap analyses, weekly summaries. 
@@ -305,3 +305,34 @@ into aggregation**, and **timing/interleaving** in the forwarder and interner. T mirrors the correctness harness — ADP + a controllable mock intake + a deterministic load generator — but adds Antithesis fault injection (network, process, clock) that the harness lacks. See `property-catalog.md` and `deployment-topology.md`. + +### 10a. Liveness observability & fault-gating (added 2026-05-31) + +The harness generates configs/load that *should* crash ADP, but nothing observed whether ADP is +alive, so those crashes were invisible **in-run** (Category H closes this; **landed 2026-05-31**). +Two facts shape the design: + +- **Where liveness is observable, and which check goes where:** the process/container, the + unprivileged API on `:5100`, the DogStatsD socket, and end-to-end delivery at the mock intake. + - *Death-liveness must be external:* a panicking process cannot self-assert, so "did ADP stay up?" + is observed from a workload-side command (`eventually_adp_alive`). + - *Good-function can be in-SUT:* a *booted* ADP can report that it works. `saluki-components` gained + an `antithesis` feature + SDK dep for an `assert_sometimes!` at the forwarder 2xx site — proving + the pipeline ran. (The death case and the good-function case are different questions; only the + former is forced external.) +- **Detection provenance (clarification):** the config-driven boot panic (oversized interner → + `isize::MAX`) was first seen via local `snouty validate` on a *single* config — not by an in-run + timeline. `validate`-rejects-a-config is not the same as a shot finding the bug across the drawn + config space; `eventually_adp_alive` is the in-run catch that was missing. +- **Image matters:** the harness adp image is a bare binary + boot wrapper (`deploy/Dockerfile` adp + stage, no s6 supervisor), so external `:5100` liveness is a *valid, non-vacuous* signal here — a + deterministic config/load crash leaves `:5100` permanently unbound. 
The production s6 image would + auto-restart and make this vacuous (catalog R1); there, use a restart-count assertion. +- **Distinguishing self-inflicted death from injected faults:** evaluate liveness in a **quiet + period** (`eventually_`/`finally_` run faults-paused; `ANTITHESIS_STOP_FAULTS` for mid-run). A + fault-killed container is restored during the quiet period → passes; a self-inflicted crash + persists → fails. This is exactly "trigger on panic-on-startup / crash-from-load, not on node + faults." +- **Caveat:** runs to date inject **zero** faults (the `node - *` / `clock - skip` total-event + properties report `0/0`), so the gating is currently moot but is the right design for when faults + are enabled; the launch webhook's fault configuration needs confirming.