diff --git a/Cargo.lock b/Cargo.lock index 837fab9538..b8bfd4d21c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1678,8 +1678,12 @@ dependencies = [ "antithesis_sdk", "anyhow", "clap", + "num-traits", "rand 0.10.1", + "rand_distr", + "serde", "serde_json", + "serde_yaml", ] [[package]] @@ -4165,6 +4169,7 @@ dependencies = [ name = "saluki-components" version = "0.1.0" dependencies = [ + "antithesis_sdk", "arc-swap", "async-trait", "axum", diff --git a/bin/agent-data-plane/Cargo.toml b/bin/agent-data-plane/Cargo.toml index 7b479635d4..17103a09ee 100644 --- a/bin/agent-data-plane/Cargo.toml +++ b/bin/agent-data-plane/Cargo.toml @@ -11,7 +11,7 @@ workspace = true [features] default = [] fips = ["saluki-app/tls-fips", "saluki-components/fips"] -antithesis = ["dep:antithesis_sdk", "antithesis_sdk/full", "dep:antithesis-instrumentation"] +antithesis = ["dep:antithesis_sdk", "antithesis_sdk/full", "dep:antithesis-instrumentation", "saluki-components/antithesis"] [dependencies] antithesis-instrumentation = { workspace = true, optional = true } diff --git a/lib/saluki-components/Cargo.toml b/lib/saluki-components/Cargo.toml index 9056455e58..7110808bb5 100644 --- a/lib/saluki-components/Cargo.toml +++ b/lib/saluki-components/Cargo.toml @@ -12,8 +12,10 @@ workspace = true default = [] config-test-support = [] fips = ["saluki-io/fips"] +antithesis = ["dep:antithesis_sdk", "antithesis_sdk/full"] [dependencies] +antithesis_sdk = { workspace = true, optional = true } arc-swap = { workspace = true } async-trait = { workspace = true } axum = { workspace = true } diff --git a/lib/saluki-components/src/common/datadog/io.rs b/lib/saluki-components/src/common/datadog/io.rs index 5ecc3ca209..26289e52f9 100644 --- a/lib/saluki-components/src/common/datadog/io.rs +++ b/lib/saluki-components/src/common/datadog/io.rs @@ -546,6 +546,19 @@ async fn process_http_response( if status.is_success() { debug!(endpoint_url, %status, "Request completed."); + // Reaching a successful intake response 
means the whole pipeline + // ran. This is a useful signal for process health but also + // acts as a checkpoint anchor for Antithesis replay: at this point + // there is a nominally functional system. + // + // No-op outside the `antithesis` feature build. + #[cfg(feature = "antithesis")] + antithesis_sdk::assert_sometimes!( + true, + "ADP forwarded a payload to the intake", + &serde_json::json!({ "domain": domain }) + ); + telemetry.track_successful_transaction(&metadata, domain); } else { telemetry.track_permanently_failed_transaction(&metadata, Some(status), domain); diff --git a/test/antithesis/deploy/Dockerfile b/test/antithesis/deploy/Dockerfile index d8fe97108d..d8de1bb5c0 100644 --- a/test/antithesis/deploy/Dockerfile +++ b/test/antithesis/deploy/Dockerfile @@ -32,11 +32,14 @@ RUN --mount=type=bind,source=rust-toolchain.toml,target=/tmp/rust-toolchain.toml # --------------------------------------------------------------------------- # Build the instrumented Agent Data Plane. # -# Coverage instrumentation uses the modern Antithesis Rust flow (post-2026-05-22): the -# `antithesis-instrumentation` crate (referenced once in main.rs behind the `antithesis` feature) -# provides the runtime shim, and these RUSTFLAGS enable LLVM sancov coverage. `--build-id` is -# required for symbolization; the release profile sets `debug = true`, so the binary keeps DWARF -# for /symbols. LTO is disabled to keep sancov instrumentation predictable. +# Coverage instrumentation uses the modern Antithesis Rust flow +# (post-2026-05-22): the `antithesis-instrumentation` crate (referenced once in +# main.rs behind the `antithesis` feature) provides the runtime shim, and these +# RUSTFLAGS enable LLVM sancov coverage. `--build-id` is required for +# symbolization; the release profile sets `debug = true`, so the binary keeps +# DWARF for /symbols. LTO is disabled to keep sancov instrumentation +# predictable. 
`panic = "abort"` (antithesis build only) turns any ADP panic +# into SIGABRT, caught as a hard crash. # --------------------------------------------------------------------------- FROM build-base AS adp-builder ENV APP_FULL_NAME="Agent Data Plane" \ @@ -55,6 +58,7 @@ RUN --mount=type=cache,target=/adp/target,id=antithesis-adp-target \ --mount=type=cache,target=/root/.cargo/git,id=cargo-git \ cargo build --release --package agent-data-plane --features antithesis \ --target x86_64-unknown-linux-gnu \ + --config 'profile.release.panic="abort"' \ --config 'target.x86_64-unknown-linux-gnu.rustflags=["--cfg","tokio_unstable","-Ccodegen-units=1","-Cpasses=sancov-module","-Cllvm-args=-sanitizer-coverage-level=3","-Cllvm-args=-sanitizer-coverage-trace-pc-guard","-Clink-args=-Wl,--build-id"]' && \ cp /adp/target/x86_64-unknown-linux-gnu/release/agent-data-plane /usr/local/bin/agent-data-plane && \ echo "Validating Antithesis instrumentation symbols..." && \ @@ -74,11 +78,14 @@ RUN --mount=type=cache,target=/tools/target,id=antithesis-tools-target \ --mount=type=cache,target=/root/.cargo/git,id=cargo-git \ cargo build --release \ --bin datadog-intake --bin millstone \ - --bin parallel_driver_send_dogstatsd --bin finally_verify_delivery && \ + --bin parallel_driver_send_dogstatsd --bin finally_verify_delivery --bin eventually_adp_alive \ + --bin first_sample_config && \ cp /tools/target/release/datadog-intake /usr/local/bin/datadog-intake && \ cp /tools/target/release/millstone /usr/local/bin/millstone && \ cp /tools/target/release/parallel_driver_send_dogstatsd /usr/local/bin/parallel_driver_send_dogstatsd && \ - cp /tools/target/release/finally_verify_delivery /usr/local/bin/finally_verify_delivery + cp /tools/target/release/finally_verify_delivery /usr/local/bin/finally_verify_delivery && \ + cp /tools/target/release/eventually_adp_alive /usr/local/bin/eventually_adp_alive && \ + cp /tools/target/release/first_sample_config /usr/local/bin/first_sample_config # 
--------------------------------------------------------------------------- # Runtime: Agent Data Plane (SUT). @@ -92,8 +99,12 @@ RUN apt-get update && \ COPY --from=adp-builder /usr/local/bin/agent-data-plane /usr/local/bin/agent-data-plane # Expose DWARF/build-id symbols to Antithesis for symbolization (one-hop symlink to the unstripped binary). RUN mkdir -p /symbols && ln -s /usr/local/bin/agent-data-plane /symbols/agent-data-plane -# main.rs requires the bootstrap config file to exist at the default path; ship a minimal standalone config. +# main.rs requires the bootstrap config file to exist at the default path; ship a minimal standalone +# config as a fallback. The boot wrapper overwrites it with the per-replay config written by the +# `first_sample_config` workload command onto the shared `agent-config` volume. COPY test/antithesis/deploy/adp/datadog.yaml /etc/datadog-agent/datadog.yaml +# Boot wrapper: waits for the drawn config sentinel, copies the config into place, then execs ADP. +COPY --chmod=755 test/antithesis/deploy/adp/entrypoint.sh /entrypoint.sh # ADP's control-plane secure API requires an IPC TLS cert (a single PEM holding both certificate and # private key) that the Core Agent normally generates. In standalone mode there is no Core Agent, so # generate a self-signed cert+key. An empty auth_token satisfies the IPC auth config at startup. 
@@ -103,7 +114,7 @@ RUN openssl req -x509 -newkey rsa:2048 -nodes -days 3650 \ cat /tmp/ipc_cert.pem /tmp/ipc_key.pem > /etc/datadog-agent/ipc_cert.pem && \ rm -f /tmp/ipc_cert.pem /tmp/ipc_key.pem && \ touch /etc/datadog-agent/auth_token -ENTRYPOINT ["/usr/local/bin/agent-data-plane"] +ENTRYPOINT ["/entrypoint.sh"] CMD ["run"] # --------------------------------------------------------------------------- @@ -128,7 +139,9 @@ COPY --from=tools-builder /usr/local/bin/millstone /usr/local/bin/millstone COPY --chmod=755 test/antithesis/deploy/workload/setup-complete.sh /opt/antithesis/setup-complete.sh COPY test/antithesis/deploy/workload/test/ /opt/antithesis/test/ # Inject the compiled test-command binaries into the "main" test template. +COPY --from=tools-builder --chmod=755 /usr/local/bin/first_sample_config /opt/antithesis/test/v1/main/first_sample_config COPY --from=tools-builder --chmod=755 /usr/local/bin/parallel_driver_send_dogstatsd /opt/antithesis/test/v1/main/parallel_driver_send_dogstatsd COPY --from=tools-builder --chmod=755 /usr/local/bin/finally_verify_delivery /opt/antithesis/test/v1/main/finally_verify_delivery +COPY --from=tools-builder --chmod=755 /usr/local/bin/eventually_adp_alive /opt/antithesis/test/v1/main/eventually_adp_alive COPY --chmod=755 test/antithesis/deploy/workload/entrypoint.sh /entrypoint.sh ENTRYPOINT ["/entrypoint.sh"] diff --git a/test/antithesis/deploy/adp/entrypoint.sh b/test/antithesis/deploy/adp/entrypoint.sh new file mode 100644 index 0000000000..8593748ba4 --- /dev/null +++ b/test/antithesis/deploy/adp/entrypoint.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Agent Data Plane boot wrapper. +# +# first_sample_config writes this timeline's datadog.yaml + a `ready` sentinel to +# the shared volume; we block on it, copy the config, then `exec` one stable ADP. +# We block indefinitely rather than timing out and exiting non-zero, which would +# be read as an ADP crash. 
The startup log below makes the wait visible in triage, +# so a missing release shows as "waiting…" with no boot rather than a silent hang. + +CONFIG_DIR="${AGENT_CONFIG_DIR:-/agent-config}" + +echo "adp: waiting for ${CONFIG_DIR}/ready (released by first_sample_config)" >&2 +while [ ! -f "${CONFIG_DIR}/ready" ]; do + sleep 1 +done + +cp "${CONFIG_DIR}/datadog.yaml" /etc/datadog-agent/datadog.yaml + +exec /usr/local/bin/agent-data-plane "$@" diff --git a/test/antithesis/deploy/docker-compose.yaml b/test/antithesis/deploy/docker-compose.yaml index 335c5b9b1e..6e3b3bf9a7 100644 --- a/test/antithesis/deploy/docker-compose.yaml +++ b/test/antithesis/deploy/docker-compose.yaml @@ -40,15 +40,11 @@ services: DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" volumes: - dogstatsd-socket:/var/run/datadog + # first_sample_config (workload) writes this timeline's datadog.yaml + ready sentinel here. + - agent-config:/agent-config:ro depends_on: intake: condition: service_healthy - healthcheck: - # ADP's unprivileged API listens on TCP :5100 once the internal supervisor is up. - test: ["CMD-SHELL", "bash -c 'exec 3<>/dev/tcp/localhost/5100'"] - interval: 2s - timeout: 2s - retries: 60 workload: container_name: workload @@ -62,19 +58,15 @@ services: image: workload:latest environment: NO_COLOR: "1" - ADP_HOST: "adp" - ADP_API_PORT: "5100" DSD_SOCKET: "/var/run/datadog/dsd.socket" INTAKE_ADDR: "intake:2049" - INTAKE_HOST: "intake" - INTAKE_PORT: "2049" volumes: - dogstatsd-socket:/var/run/datadog + - agent-config:/agent-config depends_on: - adp: - condition: service_healthy intake: condition: service_healthy volumes: dogstatsd-socket: + agent-config: diff --git a/test/antithesis/deploy/workload/entrypoint.sh b/test/antithesis/deploy/workload/entrypoint.sh index 3ff5e46908..fc51192f30 100644 --- a/test/antithesis/deploy/workload/entrypoint.sh +++ b/test/antithesis/deploy/workload/entrypoint.sh @@ -3,51 +3,9 @@ set -euo pipefail # Workload client entrypoint. 
# -# By the time this runs, docker-compose has gated startup on the `adp` and `intake` services being -# healthy (depends_on: condition: service_healthy). We re-confirm reachability defensively, emit the -# Antithesis `setup_complete` signal, then idle so Antithesis can run test commands from the test -# template at /opt/antithesis/test/v1/. +# Gated on intake-healthy (compose `depends_on`). Emit `setup_complete`, then +# idle so Antithesis runs the test commands. -ADP_HOST="${ADP_HOST:-adp}" -ADP_API_PORT="${ADP_API_PORT:-5100}" -DSD_SOCKET="${DSD_SOCKET:-/var/run/datadog/dsd.socket}" -INTAKE_HOST="${INTAKE_HOST:-intake}" -INTAKE_PORT="${INTAKE_PORT:-2049}" - -wait_for_tcp() { - local host="$1" port="$2" name="$3" tries=60 - echo "Waiting for ${name} (${host}:${port})..." - while (( tries-- > 0 )); do - if (exec 3<>"/dev/tcp/${host}/${port}") 2>/dev/null; then - echo "${name} is reachable." - return 0 - fi - sleep 1 - done - echo "Timed out waiting for ${name} (${host}:${port})." >&2 - return 1 -} - -wait_for_socket() { - local path="$1" name="$2" tries=60 - echo "Waiting for ${name} (${path})..." - while (( tries-- > 0 )); do - if [[ -S "${path}" ]]; then - echo "${name} is reachable." - return 0 - fi - sleep 1 - done - echo "Timed out waiting for ${name} (${path})." >&2 - return 1 -} - -wait_for_tcp "${ADP_HOST}" "${ADP_API_PORT}" "agent-data-plane API" -wait_for_socket "${DSD_SOCKET}" "agent-data-plane DogStatsD socket" -wait_for_tcp "${INTAKE_HOST}" "${INTAKE_PORT}" "datadog-intake" - -echo "System is ready. Emitting setup_complete." /opt/antithesis/setup-complete.sh - -echo "Workload client idle; awaiting Antithesis test commands." +echo "setup_complete emitted; workload idle, awaiting Antithesis test commands." 
exec tail -f /dev/null diff --git a/test/antithesis/harness/Cargo.toml b/test/antithesis/harness/Cargo.toml index 0190ffc517..9d35aa056c 100644 --- a/test/antithesis/harness/Cargo.toml +++ b/test/antithesis/harness/Cargo.toml @@ -16,8 +16,12 @@ clap = { workspace = true, features = [ "std", "usage", ] } +num-traits = { workspace = true } rand = { workspace = true } +rand_distr = { workspace = true } +serde = { workspace = true } serde_json = { workspace = true } +serde_yaml = { workspace = true } [lints.clippy] all = "deny" diff --git a/test/antithesis/harness/src/bin/eventually_adp_alive.rs b/test/antithesis/harness/src/bin/eventually_adp_alive.rs new file mode 100644 index 0000000000..507fa41f27 --- /dev/null +++ b/test/antithesis/harness/src/bin/eventually_adp_alive.rs @@ -0,0 +1,75 @@ +//! Antithesis `eventually_` liveness check: ADP booted and became reachable +//! within a bounded window. +//! +//! `eventually_` commands run in a fault-quiet period, so a node-fault induced +//! kill of ADP does not trip this check but a self-inflicted process exit +//! does. This triggers on ADP's own bugs, rather than antithesis fault +//! injection. +//! +//! We check two signals. First that ADP is reachable on :5100 and second that +//! it created a `DogStatsD` listener socket. 
+ +use std::net::{TcpStream, ToSocketAddrs}; +use std::os::unix::fs::FileTypeExt; +use std::path::PathBuf; +use std::thread::sleep; +use std::time::Duration; + +use antithesis_sdk::prelude::*; +use clap::{builder::NonEmptyStringValueParser, Parser}; +use serde_json::json; + +#[derive(Debug, Parser)] +#[command(name = "eventually_adp_alive")] +struct Config { + #[arg( + long = "adp-api-addr", + env = "ADP_API_ADDR", + default_value = "adp:5100", + value_parser = NonEmptyStringValueParser::new() + )] + adp_api_addr: String, + #[arg( + long = "dsd-socket", + env = "DSD_SOCKET", + default_value = "/var/run/datadog/dsd.socket" + )] + dsd_socket: PathBuf, +} + +fn main() -> anyhow::Result<()> { + antithesis_init(); + let config = Config::try_parse()?; + + let mut api_reachable = false; + let mut socket_present = false; + // Poll up to 60 times, sleeping 1s between attempts, until the ADP API is + // reachable and the DogStatsD socket exists. A 1s connect timeout keeps the + // poll cadence bounded even when the API host is unresponsive. 
+ for _ in 0..60 { + api_reachable = config + .adp_api_addr + .to_socket_addrs() + .ok() + .and_then(|mut addrs| addrs.next()) + .is_some_and(|addr| TcpStream::connect_timeout(&addr, Duration::from_secs(1)).is_ok()); + socket_present = config.dsd_socket.metadata().is_ok_and(|m| m.file_type().is_socket()); + if api_reachable && socket_present { + break; + } + sleep(Duration::from_secs(1)); + } + + assert_always!( + api_reachable && socket_present, + "ADP booted: API reachable and DogStatsD socket present", + &json!({ + "adp_api_addr": config.adp_api_addr, + "dsd_socket": config.dsd_socket.display().to_string(), + "api_reachable": api_reachable, + "socket_present": socket_present, + }) + ); + + Ok(()) +} diff --git a/test/antithesis/harness/src/bin/first_sample_config/config.rs b/test/antithesis/harness/src/bin/first_sample_config/config.rs new file mode 100644 index 0000000000..d84f0756ea --- /dev/null +++ b/test/antithesis/harness/src/bin/first_sample_config/config.rs @@ -0,0 +1,246 @@ +//! Configuration model and rendering for Datadog Agent configuration. +//! +//! Primary focus is currently `DogStatsD` but this is, hopefully, easy to expand +//! in the future. + +use std::path::{Path, PathBuf}; +use std::time::Duration; + +use anyhow::Context as _; +use harness::rand::Probe; +use rand::distr::{Distribution, StandardUniform}; +use rand::{Rng, RngExt}; +use serde::{Serialize, Serializer}; + +/// Yaml flags the Agent reads at boot that never vary. +const STATIC_YAML_TAIL: &str = "use_dogstatsd: true +use_v2_api_series: true +inventories_enabled: false +enable_metadata_collection: false +cloud_provider_metadata: [] +"; + +/// A Go `time.Duration`, rendered as a Go duration string (for example `100ms`) +/// — the form the Agent's duration config keys parse. 
+#[derive(Debug, Clone, Copy)] +struct GoDuration(Duration); + +impl Serialize for GoDuration { + fn serialize(&self, serializer: S) -> Result { + serializer.collect_str(&format_args!("{}ms", self.0.as_millis())) + } +} + +impl Distribution for Probe { + fn sample(&self, rng: &mut R) -> GoDuration { + let millis: u64 = self.sample(rng); + GoDuration(Duration::from_millis(millis)) + } +} + +/// A duration the Agent reads as a plain integer number of seconds (`GetInt`), +/// rendered as that integer. +#[derive(Debug, Clone, Copy)] +struct DurationSeconds(Duration); + +impl Serialize for DurationSeconds { + fn serialize(&self, serializer: S) -> Result { + serializer.serialize_u64(self.0.as_secs()) + } +} + +impl Distribution for Probe { + fn sample(&self, rng: &mut R) -> DurationSeconds { + let secs: u64 = self.sample(rng); + DurationSeconds(Duration::from_secs(secs)) + } +} + +/// Agent log level. +/// +/// Restricted to quiet levels on purpose. Antithesis enforces a per-hour +/// log-output budget per run, and `info`/`debug`/`trace` produce far too many +/// logs to fit within it. +#[derive(Debug, Clone, Copy, Serialize)] +#[serde(rename_all = "lowercase")] +pub(crate) enum LogLevel { + /// Warnings and above. + Warn, + /// Errors only. + Error, +} + +impl Distribution for StandardUniform { + fn sample(&self, rng: &mut R) -> LogLevel { + match rng.random_range(0..2u8) { + 0 => LogLevel::Warn, + _ => LogLevel::Error, + } + } +} + +/// Tag granularity for origin-detected `DogStatsD` tags. +#[derive(Debug, Clone, Copy, Serialize)] +#[serde(rename_all = "lowercase")] +pub(crate) enum TagCardinality { + /// Low-cardinality objects: clusters, hosts, deployments, images. Agent + /// default. + Low, + /// Orchestrator-level: pod (Kubernetes) or task (ECS/Mesos) cardinality. + Orchestrator, + /// High-cardinality objects: individual containers, request user IDs, etc. 
+ High, +} + +impl Distribution for StandardUniform { + fn sample(&self, rng: &mut R) -> TagCardinality { + match rng.random_range(0..3u8) { + 0 => TagCardinality::Low, + 1 => TagCardinality::Orchestrator, + _ => TagCardinality::High, + } + } +} + +/// The Agent's `DogStatsD` configuration surface. `dogstatsd_socket` is +/// supplied by the environment; the rest are sampled. +/// +/// Numeric fields are sampled with [`Probe`]: usually a typical value (so ADP +/// boots and runs), occasionally a boundary value to probe overflow and +/// wraparound. +#[allow(clippy::struct_field_names, clippy::struct_excessive_bools)] +#[derive(Debug, Serialize)] +pub(crate) struct DogStatsdConfig { + /// Unix socket the server listens on. Supplied by the environment. + dogstatsd_socket: PathBuf, + /// Buffer used to receive statsd packets, in bytes. + dogstatsd_buffer_size: u64, + /// Bytes for the socket receive buffer (`POSIX`); `0` keeps the OS default. + dogstatsd_so_rcvbuf: u64, + /// Packets buffered before flushing to the processing queue. + dogstatsd_packet_buffer_size: u64, + /// Maximum time packets sit in the packet buffer before a flush. + dogstatsd_packet_buffer_flush_timeout: GoDuration, + /// Internal queue size of the server; smaller caps memory but risks packet + /// drops. + dogstatsd_queue_size: u64, + /// Number of processing pipelines. + dogstatsd_pipeline_count: u64, + /// Worker count processing packets; `0` lets the Agent choose. + dogstatsd_workers_count: u64, + /// Seconds a counter is sampled to `0` after its last value before expiring. + dogstatsd_expiry_seconds: DurationSeconds, + /// Seconds a metric context is kept in memory after its last sample. + dogstatsd_context_expiry_seconds: DurationSeconds, + /// Maximum entries in the string interner cache. + dogstatsd_string_interner_size: u64, + /// Max number of metric-mapping results cached by the mapper. + dogstatsd_mapper_cache_size: u64, + /// Max metrics per payload from the no-aggregation pipeline. 
+ dogstatsd_no_aggregation_pipeline_batch_size: u64, + /// Tag granularity for origin-detected tags. + dogstatsd_tag_cardinality: TagCardinality, + /// Listen for non-local UDP traffic (binds `0.0.0.0`). + dogstatsd_non_local_traffic: bool, + /// Tag metrics with container metadata from the Unix socket peer. + dogstatsd_origin_detection: bool, + /// Use a client-provided container ID to enrich metrics. + dogstatsd_origin_detection_client: bool, + /// Let clients opt out of origin detection via cardinality `none`. + dogstatsd_origin_optout_enabled: bool, + /// Collect basic per-metric statistics (count / last seen). + dogstatsd_metrics_stats_enable: bool, + /// When an `Entity-ID` is set, skip origin-detection tag enrichment. + dogstatsd_entity_id_precedence: bool, + /// Enable the no-aggregation pipeline (forward timestamped metrics with + /// tagging only). + dogstatsd_no_aggregation_pipeline: bool, + /// Flush incomplete metric time buckets on shutdown. + dogstatsd_flush_incomplete_buckets: bool, + /// Automatically adjust the number of processing pipelines. + dogstatsd_pipeline_autoadjust: bool, + /// Publish `DogStatsD` internal stats as Go expvars. + dogstatsd_stats_enable: bool, +} + +impl DogStatsdConfig { + /// Sample the `DogStatsD` options from `rng`, taking the socket from the + /// environment. 
+ fn sample(rng: &mut R, dogstatsd_socket: &Path) -> Self { + Self { + dogstatsd_socket: dogstatsd_socket.to_path_buf(), + dogstatsd_buffer_size: Probe.sample(rng), + dogstatsd_so_rcvbuf: Probe.sample(rng), + dogstatsd_packet_buffer_size: Probe.sample(rng), + dogstatsd_packet_buffer_flush_timeout: Probe.sample(rng), + dogstatsd_queue_size: Probe.sample(rng), + dogstatsd_pipeline_count: Probe.sample(rng), + dogstatsd_workers_count: Probe.sample(rng), + dogstatsd_expiry_seconds: Probe.sample(rng), + dogstatsd_context_expiry_seconds: Probe.sample(rng), + dogstatsd_string_interner_size: Probe.sample(rng), + dogstatsd_mapper_cache_size: Probe.sample(rng), + dogstatsd_no_aggregation_pipeline_batch_size: Probe.sample(rng), + dogstatsd_tag_cardinality: rng.random(), + dogstatsd_non_local_traffic: rng.random(), + dogstatsd_origin_detection: rng.random(), + dogstatsd_origin_detection_client: rng.random(), + dogstatsd_origin_optout_enabled: rng.random(), + dogstatsd_metrics_stats_enable: rng.random(), + dogstatsd_entity_id_precedence: rng.random(), + dogstatsd_no_aggregation_pipeline: rng.random(), + dogstatsd_flush_incomplete_buckets: rng.random(), + dogstatsd_pipeline_autoadjust: rng.random(), + dogstatsd_stats_enable: rng.random(), + } + } +} + +/// Agent-facing config. `hostname`, `api_key`, `dd_url`, and the socket are +/// supplied by the environment; `log_level` and the `DogStatsD` options are +/// sampled per branch. The static flags are appended by [`Self::to_yaml`], not +/// fields here. +#[derive(Debug, Serialize)] +pub(crate) struct DatadogConfig { + /// Agent hostname. Supplied by the environment. ADP requires it + /// (`FixedHostProvider`); absent it refuses to boot. + hostname: String, + /// Agent API key. Supplied by the environment. + api_key: String, + /// Metrics intake base URL. Supplied by the environment. + dd_url: String, + /// Agent log verbosity. Sampled; restricted to quiet levels (see [`LogLevel`]). 
+ log_level: LogLevel, + /// `DogStatsD` options, flattened to top-level `dogstatsd_*` keys. + #[serde(flatten)] + dogstatsd: DogStatsdConfig, +} + +impl DatadogConfig { + /// Generate a config: the environmental fields come from the caller, the + /// rest are sampled from `rng`. With an Antithesis-backed rng, each call after + /// the snapshot yields an independent draw per replay branch. + pub(crate) fn sample( + rng: &mut R, hostname: &str, api_key: &str, dd_url: &str, dogstatsd_socket: &Path, + ) -> Self { + Self { + hostname: hostname.to_owned(), + api_key: api_key.to_owned(), + dd_url: dd_url.to_owned(), + log_level: rng.random(), + dogstatsd: DogStatsdConfig::sample(rng, dogstatsd_socket), + } + } + + /// Render `self` as a `datadog.yaml` string, followed by the static-tail + /// flags. + /// + /// # Errors + /// + /// Returns an error if serialization fails. + pub(crate) fn to_yaml(&self) -> anyhow::Result { + let mut yaml = serde_yaml::to_string(self).context("serialize datadog.yaml")?; + yaml.push_str(STATIC_YAML_TAIL); + Ok(yaml) + } +} diff --git a/test/antithesis/harness/src/bin/first_sample_config/main.rs b/test/antithesis/harness/src/bin/first_sample_config/main.rs new file mode 100644 index 0000000000..7ea3cfba7b --- /dev/null +++ b/test/antithesis/harness/src/bin/first_sample_config/main.rs @@ -0,0 +1,75 @@ +//! Antithesis `first_` command: sample this timeline's `datadog.yaml` and release ADP. +//! +//! Runs once per execution path after `setup_complete`, so the sample (see +//! [`config`], Antithesis SDK randomness) is a post-snapshot, per-timeline +//! decision Antithesis branches. Writes the config to the shared `agent-config` +//! volume then a `ready` sentinel the blocked ADP entrypoint waits on; running +//! upstream of ADP's boot is what makes each timeline boot under its own config. +//! Deployment fields come from the environment (see [`Cli`]). 
+ +mod config; + +use std::fs; +use std::path::PathBuf; + +use antithesis_sdk::prelude::*; +use antithesis_sdk::random::AntithesisRng; +use anyhow::Context as _; +use clap::Parser; +use config::DatadogConfig; +use rand::rand_core::UnwrapErr; +use serde_json::json; + +/// Deployment inputs, sourced from the environment (or flags). +#[derive(Debug, Parser)] +#[command(name = "first_sample_config")] +struct Cli { + /// Directory to write `datadog.yaml` and the `ready` sentinel into (shared + /// `agent-config` volume; the ADP container reads it). + #[arg(long, env = "CONFIG_DIR", default_value = "/agent-config")] + config_dir: PathBuf, + /// Agent hostname written into the config. (`DD_HOSTNAME`, not the ambient + /// `HOSTNAME`, so a container's own hostname does not leak in.) + #[arg(long, env = "DD_HOSTNAME", default_value = "antithesis-adp")] + hostname: String, + /// Agent API key written into the config. + #[arg(long, env = "API_KEY", default_value = "antithesis-test-api-key")] + api_key: String, + /// Metrics intake base URL. + #[arg(long, env = "DD_URL", default_value = "http://intake:2049")] + dd_url: String, + /// `DogStatsD` unix datagram socket path. + #[arg(long, env = "DOGSTATSD_SOCKET", default_value = "/var/run/datadog/dsd.socket")] + dogstatsd_socket: PathBuf, +} + +fn main() -> anyhow::Result<()> { + antithesis_init(); + let cli = Cli::parse(); + + fs::create_dir_all(&cli.config_dir) + .with_context(|| format!("create agent config dir {}", cli.config_dir.display()))?; + + let mut rng = UnwrapErr(AntithesisRng); + let config = DatadogConfig::sample( + &mut rng, + &cli.hostname, + &cli.api_key, + &cli.dd_url, + &cli.dogstatsd_socket, + ); + + let yaml_path = cli.config_dir.join("datadog.yaml"); + fs::write(&yaml_path, config.to_yaml()?.as_bytes()) + .with_context(|| format!("write agent config {}", yaml_path.display()))?; + + // Per-timeline anchor: counting these in triage tells us how many distinct + // configs the run sampled. 
+ let details = serde_json::to_value(&config).unwrap_or_else(|e| json!({ "serialize_error": e.to_string() })); + assert_reachable!("first_sample_config.config_sampled", &details); + + // Release ADP: it blocks on this sentinel, then boots under the config above. + let ready_path = cli.config_dir.join("ready"); + fs::write(&ready_path, b"ready\n").with_context(|| format!("write sentinel {}", ready_path.display()))?; + Ok(()) +} diff --git a/test/antithesis/harness/src/bin/parallel_driver_send_dogstatsd.rs b/test/antithesis/harness/src/bin/parallel_driver_send_dogstatsd.rs index ac9f887a3e..9f19dd7c59 100644 --- a/test/antithesis/harness/src/bin/parallel_driver_send_dogstatsd.rs +++ b/test/antithesis/harness/src/bin/parallel_driver_send_dogstatsd.rs @@ -6,7 +6,9 @@ //! grow without bound under sustained high cardinality). use std::os::unix::net::UnixDatagram; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; +use std::thread::sleep; +use std::time::{Duration, Instant}; use antithesis_sdk::prelude::*; use antithesis_sdk::random::AntithesisRng; @@ -49,8 +51,7 @@ fn main() -> anyhow::Result<()> { }; let count: u64 = rng.random_range(50..=2000); - let socket = UnixDatagram::unbound()?; - socket.connect(&config.dogstatsd_socket)?; + let socket = connect_with_retry(&config.dogstatsd_socket)?; let names = ["adp.test.foo", "adp.test.bar", "adp.test.balkajsldfkjasdlfkjasdfz"]; let metric_types = ["c", "g"]; @@ -92,3 +93,16 @@ fn main() -> anyhow::Result<()> { Ok(()) } + +// Wait for ADP to bind the socket, intentionally naive. 
+fn connect_with_retry(path: &Path) -> anyhow::Result { + let deadline = Instant::now() + Duration::from_secs(30); + loop { + let socket = UnixDatagram::unbound()?; + match socket.connect(path) { + Ok(()) => return Ok(socket), + Err(_) if Instant::now() < deadline => sleep(Duration::from_millis(250)), + Err(e) => return Err(e).with_context(|| format!("ADP did not bind {} within 30s", path.display())), + } + } +} diff --git a/test/antithesis/harness/src/lib.rs b/test/antithesis/harness/src/lib.rs new file mode 100644 index 0000000000..8c05e117db --- /dev/null +++ b/test/antithesis/harness/src/lib.rs @@ -0,0 +1,4 @@ +//! Shared helpers for the Antithesis harness, used by the `src/bin/*` test +//! commands. + +pub mod rand; diff --git a/test/antithesis/harness/src/rand.rs b/test/antithesis/harness/src/rand.rs new file mode 100644 index 0000000000..125babecfa --- /dev/null +++ b/test/antithesis/harness/src/rand.rs @@ -0,0 +1,65 @@ +//! Randomness utilities. + +use rand::distr::Distribution; +use rand::{Rng, RngExt}; +use rand_distr::LogNormal; + +/// Boundary values for the u64 field. +const BOUNDARIES: &[u64] = &[ + 0, + 1, + i8::MAX as u64 - 1, + i8::MAX as u64, + i8::MAX as u64 + 1, + u8::MAX as u64 - 1, + u8::MAX as u64, + u8::MAX as u64 + 1, + i16::MAX as u64 - 1, + i16::MAX as u64, + i16::MAX as u64 + 1, + u16::MAX as u64 - 1, + u16::MAX as u64, + u16::MAX as u64 + 1, + i32::MAX as u64 - 1, + i32::MAX as u64, + i32::MAX as u64 + 1, + u32::MAX as u64 - 1, + u32::MAX as u64, + u32::MAX as u64 + 1, + i64::MAX as u64 - 1, + i64::MAX as u64, + i64::MAX as u64 + 1, + u64::MAX - 1, + u64::MAX, +]; + +/// Produces `u64` values that are generally 'normal' and with some being +/// boundary values. 
+#[derive(Debug, Clone, Copy)] +pub struct Probe; + +impl Distribution for Probe { + fn sample(&self, rng: &mut R) -> u64 { + if rng.random_ratio(1, 8) { + BOUNDARIES[rng.random_range(0..BOUNDARIES.len())] + } else { + typical(rng) + } + } +} + +/// Approximate probability of a typical draw landing in each range: +/// +/// | Value range | Probability | +/// |------------------------|-------------| +/// | `<= 16` | ~15% | +/// | `16 ..= 256` | ~21% | +/// | `256 ..= 1_024` | ~14% | +/// | `1_024 ..= 4_096` | ~14% | +/// | `4_096 ..= 65_536` | ~22% | +/// | `65_536 ..= 1_048_576` | ~11% | +/// | `> 1_048_576` | ~4% | +fn typical(rng: &mut R) -> u64 { + let dist = LogNormal::new(1024.0_f64.ln(), 4.0).expect("median > 0 and sigma >= 0"); + num_traits::cast::(dist.sample(rng).round()).unwrap_or(u64::MAX) +} diff --git a/test/antithesis/scratchbook/existing-assertions.md b/test/antithesis/scratchbook/existing-assertions.md index 8a80c280b4..479c17494f 100644 --- a/test/antithesis/scratchbook/existing-assertions.md +++ b/test/antithesis/scratchbook/existing-assertions.md @@ -1,7 +1,7 @@ --- sut_path: /home/ssm-user/src/saluki -commit: fc4bb29728814ddf9321572b954ec28f58faeb53 -updated: 2026-05-30 +commit: 2e4ae1b8be45143882f0dbeb5e74998021c5faf9 +updated: 2026-05-31 external_references: - path: https://datadoghq.atlassian.net/wiki/spaces/DADP/ why: Datadog ADP Confluence space (design notes, weekly summaries, gap analyses) consulted for grounding. @@ -15,17 +15,20 @@ external_references: ## Summary -**A small bootstrap-and-workload assertion set exists**, added by the Antithesis harness commit -(`chore(agent-data-plane): Antithesis test harness and workload`, the parent of this scratchbook -commit). 
It comprises **6 SDK call sites** across three binaries: one lifecycle init and one -bootstrap reachability probe in ADP (both gated behind the `antithesis` cargo feature, no-op in -production), plus two workload-side `assert_reachable!`/`assert_sometimes!` pairs in the harness test -commands. These are **integration probes and anti-vacuity anchors**, not the property-catalog -invariants — none of the 35 cataloged property assertions is implemented yet. +**A bootstrap-and-workload assertion set exists, now with the first liveness instrumentation.** It +comprises **8 SDK call sites**: one lifecycle init and one bootstrap reachability probe in ADP, two +workload-side `assert_reachable!`/`assert_sometimes!` pairs in the harness drivers, and — added +2026-05-31 — the external `eventually_adp_alive` liveness `assert_always!` plus the **first in-SUT +property assertion**, an `assert_sometimes!` at the forwarder 2xx site in `saluki-components`. All +ADP/`saluki-components` sites are gated behind an `antithesis` cargo feature (no-op in production). +The bootstrap probe and the two driver anchors remain **integration probes / anti-vacuity anchors**; +the two new sites are real liveness instrumentation (Category H `adp-stays-alive` and the +good-function half of `adp-keeps-delivering` / in-SUT seed of `forwarder-eventual-delivery`). > [!NOTE] -> A prior version of this file stated no SDK assertions existed. That was true before the harness -> commit landed; it is now stale. Re-research on 2026-05-30 corrected it. +> History: an early version of this file claimed no SDK assertions existed (true before the harness +> commit; corrected 2026-05-30). Updated again 2026-05-31 when the liveness pieces landed (6 → 8 +> sites). ## Assertions present @@ -37,12 +40,17 @@ invariants — none of the 35 cataloged property assertions is implemented yet. 
| `test/antithesis/harness/src/bin/finally_verify_delivery.rs:59` | `assert_sometimes!` | "metrics delivered end-to-end to the intake" (`delivered > 0`) | harness binary | Workload-side liveness anchor — partially seeds `forwarder-eventual-delivery`. | | `test/antithesis/harness/src/bin/parallel_driver_send_dogstatsd.rs:77` | `assert_reachable!` | "workload sent a dogstatsd batch" | harness binary | Confirms the DSD driver actually emitted load. | | `test/antithesis/harness/src/bin/parallel_driver_send_dogstatsd.rs:87` | `assert_sometimes!` | "workload drove a high-cardinality dogstatsd flood" (`regime == High`) | harness binary | Anti-vacuity anchor that timelines reach the high-cardinality regime — seeds `rss-bounded-under-cardinality`. | +| `test/antithesis/harness/src/bin/eventually_adp_alive.rs:62` | `assert_always!` | "ADP booted: API reachable and DogStatsD socket present" | harness binary (`eventually_`, faults-paused) | Death-liveness for `adp-stays-alive` — fails the branch when ADP self-crashed (config panic / load) but stayed down through the quiet period. | +| `lib/saluki-components/src/common/datadog/io.rs:553` | `assert_sometimes!` | "ADP forwarded a payload to the intake" (`{ domain }`) | `#[cfg(feature = "antithesis")]` | First in-SUT property assertion — good-function liveness (the full pipeline ran to a 2xx) + replay checkpoint; good-function half of `adp-keeps-delivering`, in-SUT seed of `forwarder-eventual-delivery`. | Dependency wiring: ADP gains the SDK only under the `antithesis` feature (`bin/agent-data-plane/Cargo.toml:14` → `dep:antithesis_sdk`, `antithesis_sdk/full`, -`dep:antithesis-instrumentation`); the harness crate depends on `antithesis_sdk` unconditionally -(`test/antithesis/harness/Cargo.toml`). `antithesis-instrumentation` is an external build-time -instrumentation crate, not a source of in-tree assertions. +`dep:antithesis-instrumentation`, and now `saluki-components/antithesis`). 
As of 2026-05-31 +`saluki-components` has its own optional `antithesis` feature (`dep:antithesis_sdk`, +`antithesis_sdk/full`), enabled transitively by the ADP feature — this is what lets in-SUT property +assertions live in the component crate, not just in the ADP binary. The harness crate depends on +`antithesis_sdk` unconditionally (`test/antithesis/harness/Cargo.toml`). `antithesis-instrumentation` +is an external build-time instrumentation crate, not a source of in-tree assertions. ## How this was determined @@ -55,11 +63,12 @@ Searched the repository with ripgrep over `*.rs` and `*.toml`: ## Implication for property work -The catalog's invariants are still **net-new instrumentation**. The two `assert_sometimes!` anchors -above are workload-side only and serve anti-vacuity, not the safety/liveness invariants themselves: +Most catalog invariants are still **net-new instrumentation**, but the pattern is now proven in-SUT: -- `forwarder-eventual-delivery` has a workload-side `Sometimes(delivered > 0)` but no SUT-side - no-loss `Always`/accounting assertion — that remains to be added. +- `forwarder-eventual-delivery` now has an **in-SUT** `Sometimes(forwarded a payload)` at the 2xx + site (io.rs:553) in addition to the workload-side `Sometimes(delivered > 0)`. The full no-loss + `Always`/accounting reconciliation (delivered == accepted-and-retryable after a transient outage) + is still net-new. - `rss-bounded-under-cardinality` has its high-cardinality `Sometimes` anchor but no SUT-side RSS or interner-bound `Always` — also net-new. 
- The ~17 properties requiring in-process SUT-side assertions (per evaluation R2) still need ADP to diff --git a/test/antithesis/scratchbook/properties/adp-keeps-delivering.md b/test/antithesis/scratchbook/properties/adp-keeps-delivering.md new file mode 100644 index 0000000000..67b571b22f --- /dev/null +++ b/test/antithesis/scratchbook/properties/adp-keeps-delivering.md @@ -0,0 +1,60 @@ +--- +slug: adp-keeps-delivering +title: ADP still processes and delivers after load (functional liveness) +type: Liveness +priority: Medium +sut_path: /home/ssm-user/src/saluki +commit: 2e4ae1b8be45143882f0dbeb5e74998021c5faf9 +updated: 2026-05-31 +status: partial — in-SUT good-function `Sometimes` LANDED; per-branch wedge `assert_always` MISSING +--- + +# adp-keeps-delivering — ADP still processes and delivers after load + +## Property (one sentence) +After the load drivers run, in a faults-paused window the mock intake has received metrics *and* ADP +still serves `:5100` — i.e. ADP is not merely up but not wedged. + +## Origin +Strengthens [`adp-stays-alive`](adp-stays-alive.md): a process can be reachable on `:5100` yet have +stopped processing/forwarding (deadlock, stalled pipeline, dropped-everything). The existing +`finally_verify_delivery` harness command already polls the mock intake and fires a +`Reachable`/`Sometimes` anchor for end-to-end delivery — this property upgrades that to a per-branch +liveness assertion so a wedged-but-alive ADP becomes a counterexample. + +## Relationship to existing instrumentation +- `existing-assertions.md`: `finally_verify_delivery` carries a `Sometimes(delivered > 0)` anchor — + good for "delivery happens at least once across the run," but it does **not** fail on a branch + where ADP wedged after accepting load. This property is the missing per-branch `assert`. 
+ +## The fault-gating mechanism +Same as `adp-stays-alive`: evaluate in a quiet period (`finally_`, or `ANTITHESIS_STOP_FAULTS`), so a +fault that merely delayed delivery recovers and passes, while a self-inflicted wedge persists and +fails. Note: no-loss/delivery reconciliation must use UDS or TCP ingress, not UDP (catalog R3). + +## Implementation status +- **Landed (good-function half):** an in-SUT `assert_sometimes!("ADP forwarded a payload to the + intake", { domain })` at the forwarder's 2xx site + (`lib/saluki-components/src/common/datadog/io.rs`, in the `status.is_success()` branch of + `process_http_response`), behind the new `saluki-components/antithesis` feature (enabled + transitively by `agent-data-plane/antithesis`). A 2xx means the whole ingest→aggregate→encode→ + forward pipeline ran, so this proves a *booted ADP actually works*, and as a `Sometimes` it also + gives Antithesis a replay checkpoint anchored on a healthy-forwarding state. This is the in-SUT + counterpart to the workload-side `Sometimes(delivered > 0)` already in `finally_verify_delivery`, + and it doubles as the in-SUT seed for [`forwarder-eventual-delivery`](forwarder-eventual-delivery.md). +- **Still net-new (the per-branch wedge detector):** extend the `finally_verify_delivery` command so + that, in the faults-paused window, it polls the mock intake's dump endpoint and `:5100` and asserts + `assert_always!(delivered_recently && reachable, …)`. This is what catches "ADP accepted load, then + wedged" on a *specific* branch — neither the run-wide `Sometimes` above nor a bare `:5100` + reachability check fails on that branch. + +## Assertion-type rationale +**Liveness** (a good thing — delivery — eventually happens after load), realized as an +`assert_always!` inside the faults-paused `finally_` after a bounded poll, for the same reason as +`adp-stays-alive`. 
+ +## Open Questions +- "Delivered recently" needs a freshness window relative to the last driver batch — define it so a + stale earlier delivery doesn't mask a current wedge. +- Whether to count only metrics or also events/service-checks delivered (ties to + `events-sc-no-silent-loss`). diff --git a/test/antithesis/scratchbook/properties/adp-stays-alive.md b/test/antithesis/scratchbook/properties/adp-stays-alive.md new file mode 100644 index 0000000000..df3bcd2bcc --- /dev/null +++ b/test/antithesis/scratchbook/properties/adp-stays-alive.md @@ -0,0 +1,104 @@ +--- +slug: adp-stays-alive +title: ADP boots and stays serving (self-inflicted-crash liveness) +type: Liveness +priority: High +sut_path: /home/ssm-user/src/saluki +commit: 2e4ae1b8be45143882f0dbeb5e74998021c5faf9 +updated: 2026-05-31 +status: LANDED as the `eventually_adp_alive` test command +--- + +# adp-stays-alive — ADP boots and stays serving + +## Property (one sentence) +After the per-replay `datadog.yaml` is applied and the workload runs, ADP's unprivileged API +(`:5100`) is reachable within a faults-paused window; if it never comes up, ADP died of its own +config or load — not of an injected node fault. + +## Origin +The harness now generates per-replay configs (`datadog-yaml-config-gen`) and adversarial load whose +boundary values *should* sometimes crash ADP — e.g. an oversized `dogstatsd_string_interner_size` +panics at boot (`capacity would overflow isize::MAX`). But nothing demonstrated ADP is alive: the +only ADP-side assertion was a `reachable` bootstrap probe, which fires on success and is satisfied as +long as *some* branch boots — it cannot flag the branch where ADP died. A process that panics also +cannot self-assert its own liveness after the fact. So the catch must be an external observer. 
+ +> **Provenance clarification (the misunderstanding worth recording).** That interner boot-panic was +> first seen only via local **`snouty validate`** — the single-config smoke run at launch time, +> *outside* any Antithesis timeline. That is **not** the same as an Antithesis shot finding the crash: +> `validate` exercises one static `datadog.yaml`, whereas a run draws the whole config space across +> timelines. Pre-`eventually_adp_alive`, **no in-run mechanism** would have turned such a boot/load +> crash into a counterexample. This property is exactly that missing in-run catch; do not conflate +> "validate rejected a config locally" with "a shot found the bug." + +## The fault-gating mechanism (the crux) +The requirement is: trigger on self-inflicted death (panic on startup, crash from load) but **not** +on death caused by injected node faults (kill/pause/stop/throttle/clock). A quiet period separates +the two: +- `eventually_` and `finally_` test commands run with **faults already paused**; `ANTITHESIS_STOP_FAULTS` + gives the same mid-run. +- During a quiet period a **fault-killed** container is restored by the platform, so fault-induced + down recovers → the liveness check passes. +- A **self-inflicted** crash is config/load-driven and deterministic: it crash-loops or stays dead + even with faults paused → `:5100` never binds → the check fails. + +## Observation points considered +- **`:5100` API reachability (chosen):** external TCP check; survives the crash; the existing + `deploy/workload/entrypoint.sh` already polls it. Clean and direct. +- **End-to-end intake delivery:** stronger (alive *and* functional) — split into + [`adp-keeps-delivering`](adp-keeps-delivering.md). +- **Container-exit / built-in crash detection:** opaque; runs show it not firing for our boot panic + (open question); and it would also trip on fault-kills (not gated). Rejected as the primary. 
+- **In-SUT assertion:** rejected *for the death case* — a panicking process can't report its own + death, so liveness-on-crash must be observed externally. (Note: `saluki-components` *has* since + gained an `antithesis` feature + SDK dep for the **good-function** anchor in + [`adp-keeps-delivering`](adp-keeps-delivering.md); that proves a *booted* ADP works, which is a + different question from detecting a *dead* one.) + +## Why this image makes API liveness valid (vs. catalog note R1) +Note R1 says container/API liveness is vacuously green because the **production** ADP image runs an +s6 supervisor that auto-restarts ADP. The **harness** adp image is different: `deploy/Dockerfile` +adp stage is a bare binary + boot wrapper (`ENTRYPOINT ["/entrypoint.sh"]` → `agent-data-plane +run`), no supervisor. So a crash is not silently restarted, and a deterministic config/load crash +leaves `:5100` permanently unbound — API liveness is a real signal here. If the harness ever adopts +an auto-restart image, this property must move to a restart-count assertion (per R1). + +## Implementation (landed) +Realized as the `eventually_adp_alive` test command +(`test/antithesis/harness/src/bin/eventually_adp_alive.rs`). In the faults-paused `eventually_` +window it polls **both** ADP's `:5100` API (`TcpStream::connect`) and the DogStatsD listener socket +(`/var/run/datadog/dsd.socket` exists) for up to ~60×1s, then +`assert_always!(api_reachable && socket_present, …)` with the addresses in the details. Checking the +socket as well as `:5100` is slightly stronger than the original sketch: it confirms ADP got far +enough through bootstrap to *accept metrics*, not just to bind its control API. The assertion fires +once per branch; a branch where ADP self-crashed never satisfies both → counterexample. The workload +container intentionally gates on adp `service_started` (not `service_healthy`) so this command still +runs — and can observe a dead ADP — when ADP never becomes healthy. 
+ +## Assertion-type rationale +**Liveness**, realized as an `assert_always!` *inside a faults-paused command after a bounded +recovery poll* — within that command ADP must be up, so a single always-evaluation is the right fit; +the quiet-period prefix supplies the fault discrimination rather than the assertion type. + +## Open Questions +- Does Antithesis's built-in container-exit detection already observe ADP boot-panics in this + topology? Runs show it reporting nothing — confirm via an `antithesis-query-logs` search for the + adp exit / the `isize::MAX` panic. `(needs human input)` +- Does a deterministic boot crash actually crash-loop, or exit once and stay down, under the harness + compose (no `restart:` policy on the adp service)? Either way `:5100` stays down; confirm. +- Mid-run crash coverage (workload masks a transient crash) needs an `ANTITHESIS_STOP_FAULTS` + liveness loop — deferred to a follow-up. + +## Investigation Log +- 2026-05-31: **Landed** as `eventually_adp_alive` (poll `:5100` + DSD socket, faults-paused + `eventually_`, `assert_always!`). Decoupled the workload from adp health (`service_started`) and + made the workload entrypoint non-gating so the check runs even when ADP is down. `snouty validate` + registers it ("1 eventually script"). Clarified detection provenance in Origin: the interner + boot-panic was a `snouty validate` finding, not an in-run one — this command is the in-run catch. +- 2026-05-31: Confirmed harness adp image is bare (no s6) from `deploy/Dockerfile` adp stage + (`ENTRYPOINT ["/entrypoint.sh"]`, `CMD ["run"]`) — so API liveness is non-vacuous here, reconciling + with catalog R1 which describes the production s6 image. +- 2026-05-31: Reviewed Antithesis fault model: `eventually_`/`finally_` run faults-paused; + `ANTITHESIS_STOP_FAULTS` for mid-run quiet periods; killed containers are restored during the + quiet period — basis for the self-inflicted-vs-fault discrimination above. 
diff --git a/test/antithesis/scratchbook/properties/forwarder-eventual-delivery.md b/test/antithesis/scratchbook/properties/forwarder-eventual-delivery.md index 98864f58b7..d8c8c940ef 100644 --- a/test/antithesis/scratchbook/properties/forwarder-eventual-delivery.md +++ b/test/antithesis/scratchbook/properties/forwarder-eventual-delivery.md @@ -5,7 +5,7 @@ commit: 042f41db3bd97118c38981765fd49696fce9d318 updated: 2026-05-28 type: Liveness priority: High -assertion_status: MISSING (net-new instrumentation) +assertion_status: PARTIAL — in-SUT `Sometimes(forwarded a payload)` landed 2026-05-31; recovery-reconciliation `Sometimes` still net-new --- # Property: After a transient intake outage clears, accepted-and-retryable transactions are eventually delivered @@ -79,12 +79,18 @@ expectation: every transaction that was (a) accepted and (b) retryable is eventu - Circuit breaker backoff schedule (exponential + jitter) — sets recovery latency, hence the size of the "eventually" window the assertion must allow. -## Suggested assertion (MISSING — net-new) -- **Sometimes(all-accepted-retryable-delivered-after-recovery)**: at least once, after a transient - outage clears and within a bounded window, the count of delivered transactions equals the count - of accepted-and-retryable transactions submitted before/during the outage (queue did not overflow). - This proves recovery actually happens. Best evaluated workload-side by reconciling the controlled - input set against the mock-intake received set. +## Suggested assertion +- **Landed 2026-05-31 — in-SUT delivery anchor:** `assert_sometimes!("ADP forwarded a payload to the + intake", { domain })` at the success branch of `process_http_response` (io.rs:553, behind + `saluki-components/antithesis`). 
This is the in-SUT proof that delivery *happens* (a 2xx from the + intake) and a replay checkpoint on a healthy-forwarding state — but it is **not** the recovery + property: a run-wide `Sometimes(forwarded)` is satisfied by any single delivery and says nothing + about *post-outage* completeness. +- **Still net-new — Sometimes(all-accepted-retryable-delivered-after-recovery):** at least once, after + a transient outage clears and within a bounded window, the count of delivered transactions equals + the count of accepted-and-retryable transactions submitted before/during the outage (queue did not + overflow). This proves recovery actually happens. Best evaluated workload-side by reconciling the + controlled input set against the mock-intake received set. - Supporting **Reachable**: the `Error::Open` re-enqueue path (`io.rs:468-474`) is hit at least once (proves the circuit breaker engaged and re-enqueued, not silently dropped). diff --git a/test/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md index bfcf0f7cba..73160b962a 100644 --- a/test/antithesis/scratchbook/property-catalog.md +++ b/test/antithesis/scratchbook/property-catalog.md @@ -1,7 +1,7 @@ --- sut_path: /home/ssm-user/src/saluki -commit: fc4bb29728814ddf9321572b954ec28f58faeb53 -updated: 2026-05-30 +commit: 2e4ae1b8be45143882f0dbeb5e74998021c5faf9 +updated: 2026-05-31 external_references: - path: https://datadoghq.atlassian.net/wiki/spaces/DADP/ why: ADP Confluence space — headline guarantees and gap analyses that seed properties. @@ -15,7 +15,7 @@ external_references: # Property Catalog: Agent Data Plane (ADP) -35 properties across 7 categories. The system makes one headline guarantee — **"ADP will not +37 properties across 8 categories. The system makes one headline guarantee — **"ADP will not crash under load, losing customer data"** — which decomposes into the *Memory & Resource Bounds* and *Data Integrity & No Silent Loss* families. 
The remaining categories cover aggregation correctness, lifecycle/config, untrusted-input parsing, concurrency, and **transform & enrichment @@ -25,11 +25,12 @@ correctness** (Category G, added after evaluation — ADP as a *transformer*, no > service-checks; G2 transform-chain + runtime filter config-reload), applied 9 refinements, and > escalated one scope bias (traces/APM/logs/OTLP coverage). See `evaluation/synthesis.md`. -**Only bootstrap/workload SDK probes exist so far** (`existing-assertions.md`: 6 call sites — an -ADP `antithesis_init()` + bootstrap `assert_reachable!` behind the `antithesis` feature, and two -workload-side `assert_reachable!`/`assert_sometimes!` pairs in the harness). Every `Invariant` -below is still **net-new** SUT-side instrumentation. Several properties are **expected to fail by -design** under default config (memory limiter disabled, interner heap-fallback enabled, disk +**Instrumentation is mostly net-new, with the first pieces landed** (`existing-assertions.md`: 8 call +sites). Beyond the ADP bootstrap probe and workload-side anchors, two liveness pieces landed +2026-05-31: the external `eventually_adp_alive` command (Category H) and the **first in-SUT property +assertion** — an `assert_sometimes!` at the forwarder 2xx site in `saluki-components`. Every other +`Invariant` below is still **net-new** SUT-side instrumentation. Several properties are **expected to +fail by design** under default config (memory limiter disabled, interner heap-fallback enabled, disk persistence off) — these are flagged; they are the highest-value findings, not catalog errors. Provenance tags `[Fn]` after each slug name the discovery focus that surfaced it: @@ -648,6 +649,60 @@ partner's documented focus (the "Tag Filter RC Relay Stress Test"). These proper - Is the prefix-filter-after-mapper ordering load-bearing for equivalence, with any guard besides this property? 
- A reload updating one filter but lagging the other could filter at one stage but not the other for the same rule — confirm reachability. +## Category H — Liveness & Availability + +Properties that demonstrate ADP **boots and stays alive**. They exist because the +generated per-replay `datadog.yaml` configs and adversarial load *should* sometimes crash ADP +(`interner-full-bounded`, `rss-bounded-under-cardinality`, `config-incompatible-refuses-start`), yet +the bootstrap `assert_reachable!` only fires on success and cannot report a branch where ADP died. +The death-liveness catch watches from *outside* the SUT and is **fault-gated**: it evaluates in a +quiet period (faults paused — the `eventually_`/`finally_` prefixes do this, or +`ANTITHESIS_STOP_FAULTS`), so a node-fault-induced outage recovers and passes while a self-inflicted +crash (panic on startup, crash from load) persists and fails. That gating is exactly the requirement: +*trigger on self-inflicted death, not on injected node faults.* A complementary **in-SUT +good-function** `Sometimes` (the forwarder shipped a payload) shows a booted ADP actually works. + +> **Detection provenance (clarification, 2026-05-31).** Before this category landed, the *only* place +> a config-driven boot panic was ever observed was local **`snouty validate`** — a single-config +> smoke run done at launch time, outside any Antithesis timeline (e.g. an oversized +> `dogstatsd_string_interner_size` → `capacity would overflow isize::MAX`). No **in-run** mechanism +> caught it: a panicking ADP cannot self-assert, and the bootstrap `assert_reachable!` is silent on +> the dead branch. `eventually_adp_alive` (below) is what makes such boot/load crashes +> **in-run-detectable** — `snouty validate` catching one static config is *not* the same as an +> Antithesis shot finding it across the drawn-config space. 
+ +### adp-stays-alive — ADP boots and stays serving (self-inflicted-crash liveness) +> **Status (2026-05-31): LANDED** as the `eventually_adp_alive` test command +> (`test/antithesis/harness/src/bin/eventually_adp_alive.rs`). Valid in *this* harness because the adp +> image is a bare binary + boot wrapper (no s6 supervisor) — unlike the production image, where API +> liveness is vacuously green (note R1; use restart-count there). +| | | +|---|---| +| **Type** | Liveness (ADP eventually serves), evaluated in a faults-paused window | +| **Property** | After the per-replay config is applied and the workload runs, ADP's unprivileged API (`:5100`) is reachable **and** the DogStatsD listener socket exists; if either fails to come up in a quiet period, ADP died of its own config/load, not an injected fault. | +| **Invariant** | The `eventually_` command (faults already paused) polls `:5100` and the DSD socket for ~60×1s, then `assert_always!(api_reachable && socket_present, …)`. Fault-induced down recovers in the quiet period → passes; a deterministic config/load crash crash-loops or stays dead → never binds → fails. | +| **Antithesis Angle** | The crash is config-driven (drawn `dogstatsd_*` boundary values); a deterministic boot panic stays down across the whole quiet period regardless of restart policy, so the quiet period cleanly separates a real bug from a transient node fault. | + +### adp-keeps-delivering — ADP still processes and delivers after load (functional liveness) +> **Status (2026-05-31): PARTIAL.** The in-SUT good-function half **landed**: an `assert_sometimes!` +> at the forwarder's 2xx site (`lib/saluki-components/src/common/datadog/io.rs`, behind the new +> `saluki-components/antithesis` feature) fires when ADP ships a payload — proving a booted ADP runs +> the whole pipeline, and giving Antithesis a replay checkpoint anchored on a healthy state. 
The +> stronger **per-branch wedge detector** (an `assert_always!(delivered_recently && reachable)` in a +> faults-paused `finally_`) is **still net-new** — the landed `Sometimes` does not fail a branch where +> ADP accepted load then wedged. +| | | +|---|---| +| **Type** | Liveness (accepted load is eventually delivered), faults-paused window | +| **Property** | After the load drivers, in a quiet period the mock intake has received metrics *and* ADP still serves `:5100` — ADP is not merely up but not wedged. | +| **Invariant** | _Landed:_ in-SUT `Sometimes(forwarder shipped a payload)`. _Pending:_ `finally_` command that, in the faults-paused window, polls the mock intake's dump endpoint and `:5100` and asserts `Always(delivered_recently && reachable)` — catches "alive but stuck" that a bare reachability check (and a run-wide `Sometimes`) both miss. | + +**Open Questions (Category H)** +- Does Antithesis's built-in container-exit detection already see ADP boot-panics here? Runs show it + not firing — confirm via log search before assuming Category H is the only catch. `(needs human input)` +- Terminal `finally_`/`eventually_` only checks end-of-branch; a mid-run crash the workload papers + over needs an `ANTITHESIS_STOP_FAULTS` liveness loop (deferred). + ## Catalog-wide notes - **Default config is hostile to the bounded-memory family:** memory limiter disabled @@ -677,12 +732,14 @@ partner's documented focus (the "Tag Filter RC Relay Stress Test"). These proper (`malformed-dsd-no-crash`, `malformed-event-sc-no-crash`, `replay-no-panic-on-malformed-capture`, the aggregate-crash pair) must assert SUT-side `Unreachable` at panic sites — or assert on restart-count — **never** container liveness. 
-- **(R2, updated 2026-05-30) The Antithesis Rust SDK is now wired into ADP** behind the `antithesis` - cargo feature (`antithesis_init()` + a bootstrap `assert_reachable!`), and the harness binaries - carry workload-side anchors — so the "fork ADP + add the SDK + build an instrumented image" - prerequisite is largely satisfied (the wiring is proven). ~17 properties still need their net-new - in-process SUT-side **invariant** assertions landed on top of that scaffold; the ~10 workload-only - properties (forwarder delivery, retry-queue bounds, shutdown, config-gate, RSS) can run first. +- **(R2, updated 2026-05-31) The Antithesis Rust SDK is wired into ADP** behind the `antithesis` + cargo feature (`antithesis_init()` + a bootstrap `assert_reachable!`), and as of 2026-05-31 the + **first in-SUT property assertion** is landed: an `assert_sometimes!` at the forwarder 2xx site in + `saluki-components` (its own new `antithesis` feature, enabled transitively by + `agent-data-plane/antithesis`). So in-process instrumentation is no longer just the bootstrap probe + — the path from a catalog property to a real SUT-side assertion is proven end-to-end. ~16 properties + still need their net-new in-process SUT-side **invariant** assertions; the remaining workload-only + properties (retry-queue bounds, shutdown, config-gate, RSS) can run first. - **(R3) No-loss properties must use TCP or UDS ingress, not UDP** — UDP's inherent packet loss confounds any "accepted == delivered" reconciliation (`no-silent-interconnect-drop`, `forwarder-eventual-delivery`, `disk-persisted-retry-survives-restart`, `shutdown-drains-no-loss`, @@ -696,6 +753,20 @@ partner's documented focus (the "Tag Filter RC Relay Stress Test"). These proper properties and `config-runtime-update-not-revalidated`) require the **config-stream add-on topology** (Core Agent or stub) — they pass vacuously in standalone mode because the config watcher never fires. 
+- **(R5, updated 2026-05-31) Liveness observability is valid in this harness, and the catch is now + landed (cf. R1's production case).** The harness `adp` image is a bare binary + boot wrapper + (`deploy/Dockerfile` adp stage: `ENTRYPOINT ["/entrypoint.sh"]` → `agent-data-plane run`) with **no + s6 supervisor**, so external `:5100` / DSD-socket liveness is a real signal for Category H — a + deterministic config/load crash leaves them permanently unbound. As of 2026-05-31 the + `eventually_adp_alive` command realizes this, and the workload no longer gates on adp health + (`docker-compose.yaml`: `service_started`, not `service_healthy`) so the check runs even when ADP is + down. R1's "never use container liveness" applies to the production s6 image; if the harness ever + adopts an auto-restart image, switch Category H to a restart-count assertion. +- **(R6) Observed fault availability contradicts the Scope note.** Despite faults being recorded as + tenant-enabled below, runs launched with the `basic_test` webhook have injected **zero** fault + events — the `node - kill/pause/stop/throttle` and `clock - skip` total-fault-event properties + report `0/0`. So Category H's fault-gating is currently moot (no faults to mistake for a crash) but + is the right design for when faults fire; the webhook's fault configuration needs confirming. 
## Scope (confirmed with user, 2026-05-28) diff --git a/test/antithesis/scratchbook/property-relationships.md b/test/antithesis/scratchbook/property-relationships.md index 866de00617..4cca3536b0 100644 --- a/test/antithesis/scratchbook/property-relationships.md +++ b/test/antithesis/scratchbook/property-relationships.md @@ -1,7 +1,7 @@ --- sut_path: /home/ssm-user/src/saluki -commit: fc4bb29728814ddf9321572b954ec28f58faeb53 -updated: 2026-05-30 +commit: 2e4ae1b8be45143882f0dbeb5e74998021c5faf9 +updated: 2026-05-31 external_references: - path: https://datadoghq.atlassian.net/wiki/spaces/DADP/ why: ADP Confluence space — headline guarantees and gap analyses that seed properties. @@ -15,7 +15,7 @@ external_references: # Property Relationships -Lightweight clustering of the 35 catalog properties by shared code paths, failure mechanisms, and +Lightweight clustering of the 37 catalog properties by shared code paths, failure mechanisms, and suspected dominance. Slugs match `property-catalog.md`. ## Cluster 1 — Bounded memory (the determinism story) @@ -160,7 +160,28 @@ Properties: `events-sc-no-silent-loss`, `malformed-event-sc-no-crash`, `events-s - **Anti-vacuity dependency:** `events-sc-pipeline-reachable` is the R4 anchor that keeps the other two from passing trivially under a metrics-dominated workload — a hard dependency, not just a relation. -## Shared-scenario pairs (R10 — count is not 35 independent test efforts) +## Cluster 10 — Liveness & availability observers (added 2026-05-31) + +Properties: `adp-stays-alive`, `adp-keeps-delivering`. + +- **Status (2026-05-31): landed.** `adp-stays-alive` is realized as the external `eventually_adp_alive` + command; the good-function half of `adp-keeps-delivering` is realized as an in-SUT + `assert_sometimes!` at the forwarder 2xx site. The per-branch "alive but wedged" `assert_always` for + `adp-keeps-delivering` remains net-new. 
+- **The external catch for every crash property.** Clusters 1, 4, 5, and 6 predict crashes + (interner/RSS overflow, aggregate panics, config-driven startup refusal, malformed-input panics); + Cluster 10 is what turns those into counterexamples, with the *death* check from *outside* the SUT + (a panicking ADP cannot self-assert) and the *good-function* check from inside. `adp-stays-alive` + dominates — `adp-keeps-delivering` adds the "alive but wedged" facet; its landed half lives at the + forwarder 2xx site, and its pending per-branch wedge detector would share the `finally_verify_delivery` + site. +- **Fault-gated, not fault-blind.** Both evaluate in a faults-paused window (`eventually_`/`finally_` + or `ANTITHESIS_STOP_FAULTS`), which is the line between a self-inflicted crash (persists → fail) + and an injected node fault (recovers → pass). This is a hard design dependency, not a relation. +- **Image dependency (R5).** Valid only because the harness adp image has no auto-restart supervisor; + on a production s6 image these become restart-count assertions (R1). + +## Shared-scenario pairs (R10 — count is not 37 independent test efforts) These pairs share a fault scenario / assertion site and should be implemented together; treat them as one test effort each for portfolio-sizing: diff --git a/test/antithesis/scratchbook/sut-analysis.md b/test/antithesis/scratchbook/sut-analysis.md index d3d6e76198..33f5cf86e5 100644 --- a/test/antithesis/scratchbook/sut-analysis.md +++ b/test/antithesis/scratchbook/sut-analysis.md @@ -1,7 +1,7 @@ --- sut_path: /home/ssm-user/src/saluki -commit: fc4bb29728814ddf9321572b954ec28f58faeb53 -updated: 2026-05-30 +commit: 2e4ae1b8be45143882f0dbeb5e74998021c5faf9 +updated: 2026-05-31 external_references: - path: https://datadoghq.atlassian.net/wiki/spaces/DADP/ why: ADP Confluence space — headline guarantees, Phase 1 bug bash, gap analyses, weekly summaries. 
@@ -305,3 +305,34 @@ into aggregation**, and **timing/interleaving** in the forwarder and interner. T mirrors the correctness harness — ADP + a controllable mock intake + a deterministic load generator — but adds Antithesis fault injection (network, process, clock) that the harness lacks. See `property-catalog.md` and `deployment-topology.md`. + +### 10a. Liveness observability & fault-gating (added 2026-05-31) + +The harness generates configs/load that *should* crash ADP, but nothing observed whether ADP is +alive, so those crashes were invisible **in-run** (Category H closes this; **landed 2026-05-31**). +Two facts shape the design: + +- **Where liveness is observable, and which check goes where:** the process/container, the + unprivileged API on `:5100`, the DogStatsD socket, and end-to-end delivery at the mock intake. + - *Death-liveness must be external:* a panicking process cannot self-assert, so "did ADP stay up?" + is observed from a workload-side command (`eventually_adp_alive`). + - *Good-function can be in-SUT:* a *booted* ADP can report that it works. `saluki-components` gained + an `antithesis` feature + SDK dep for an `assert_sometimes!` at the forwarder 2xx site — proving + the pipeline ran. (The death case and the good-function case are different questions; only the + former is forced external.) +- **Detection provenance (clarification):** the config-driven boot panic (oversized interner → + `isize::MAX`) was first seen via local `snouty validate` on a *single* config — not by an in-run + timeline. `validate`-rejects-a-config is not the same as a shot finding the bug across the drawn + config space; `eventually_adp_alive` is the in-run catch that was missing. +- **Image matters:** the harness adp image is a bare binary + boot wrapper (`deploy/Dockerfile` adp + stage, no s6 supervisor), so external `:5100` liveness is a *valid, non-vacuous* signal here — a + deterministic config/load crash leaves `:5100` permanently unbound. 
The production s6 image would + auto-restart and make this vacuous (catalog R1); there, use a restart-count assertion. +- **Distinguishing self-inflicted death from injected faults:** evaluate liveness in a **quiet + period** (`eventually_`/`finally_` run faults-paused; `ANTITHESIS_STOP_FAULTS` for mid-run). A + fault-killed container is restored during the quiet period → passes; a self-inflicted crash + persists → fails. This is exactly "trigger on panic-on-startup / crash-from-load, not on node + faults." +- **Caveat:** runs to date inject **zero** faults (the `node - *` / `clock - skip` total-event + properties report `0/0`), so the gating is currently moot; it is still the right design for when + faults are enabled. The launch webhook's fault configuration still needs to be confirmed.