Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion bin/agent-data-plane/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ workspace = true
[features]
default = []
fips = ["saluki-app/tls-fips", "saluki-components/fips"]
antithesis = ["dep:antithesis_sdk", "antithesis_sdk/full", "dep:antithesis-instrumentation"]
antithesis = ["dep:antithesis_sdk", "antithesis_sdk/full", "dep:antithesis-instrumentation", "saluki-components/antithesis"]

[dependencies]
antithesis-instrumentation = { workspace = true, optional = true }
Expand Down
2 changes: 2 additions & 0 deletions lib/saluki-components/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,10 @@ workspace = true
default = []
config-test-support = []
fips = ["saluki-io/fips"]
antithesis = ["dep:antithesis_sdk", "antithesis_sdk/full"]

[dependencies]
antithesis_sdk = { workspace = true, optional = true }
arc-swap = { workspace = true }
async-trait = { workspace = true }
axum = { workspace = true }
Expand Down
13 changes: 13 additions & 0 deletions lib/saluki-components/src/common/datadog/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -546,6 +546,19 @@ async fn process_http_response(
if status.is_success() {
debug!(endpoint_url, %status, "Request completed.");

// Reaching a successful intake response means the whole pipeline
// ran. This is a useful signal for process health but also
// acts as a checkpoint anchor for Antithesis replay: at this point
// there is a nominally functional system.
Comment thread
blt marked this conversation as resolved.
Comment thread
blt marked this conversation as resolved.
Comment thread
blt marked this conversation as resolved.
//
// No-op outside the `antithesis` feature build.
#[cfg(feature = "antithesis")]
antithesis_sdk::assert_sometimes!(
true,
"ADP forwarded a payload to the intake",
&serde_json::json!({ "domain": domain })
);

telemetry.track_successful_transaction(&metadata, domain);
} else {
telemetry.track_permanently_failed_transaction(&metadata, Some(status), domain);
Expand Down
31 changes: 22 additions & 9 deletions test/antithesis/deploy/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,14 @@ RUN --mount=type=bind,source=rust-toolchain.toml,target=/tmp/rust-toolchain.toml
# ---------------------------------------------------------------------------
# Build the instrumented Agent Data Plane.
#
# Coverage instrumentation uses the modern Antithesis Rust flow (post-2026-05-22): the
# `antithesis-instrumentation` crate (referenced once in main.rs behind the `antithesis` feature)
# provides the runtime shim, and these RUSTFLAGS enable LLVM sancov coverage. `--build-id` is
# required for symbolization; the release profile sets `debug = true`, so the binary keeps DWARF
# for /symbols. LTO is disabled to keep sancov instrumentation predictable.
# Coverage instrumentation uses the modern Antithesis Rust flow
# (post-2026-05-22): the `antithesis-instrumentation` crate (referenced once in
# main.rs behind the `antithesis` feature) provides the runtime shim, and these
# RUSTFLAGS enable LLVM sancov coverage. `--build-id` is required for
# symbolization; the release profile sets `debug = true`, so the binary keeps
# DWARF for /symbols. LTO is disabled to keep sancov instrumentation
# predictable. `panic = "abort"` (antithesis build only) turns any ADP panic
# into SIGABRT, caught as a hard crash.
# ---------------------------------------------------------------------------
FROM build-base AS adp-builder
ENV APP_FULL_NAME="Agent Data Plane" \
Expand All @@ -55,6 +58,7 @@ RUN --mount=type=cache,target=/adp/target,id=antithesis-adp-target \
--mount=type=cache,target=/root/.cargo/git,id=cargo-git \
cargo build --release --package agent-data-plane --features antithesis \
--target x86_64-unknown-linux-gnu \
--config 'profile.release.panic="abort"' \
--config 'target.x86_64-unknown-linux-gnu.rustflags=["--cfg","tokio_unstable","-Ccodegen-units=1","-Cpasses=sancov-module","-Cllvm-args=-sanitizer-coverage-level=3","-Cllvm-args=-sanitizer-coverage-trace-pc-guard","-Clink-args=-Wl,--build-id"]' && \
cp /adp/target/x86_64-unknown-linux-gnu/release/agent-data-plane /usr/local/bin/agent-data-plane && \
echo "Validating Antithesis instrumentation symbols..." && \
Expand All @@ -74,11 +78,14 @@ RUN --mount=type=cache,target=/tools/target,id=antithesis-tools-target \
--mount=type=cache,target=/root/.cargo/git,id=cargo-git \
cargo build --release \
--bin datadog-intake --bin millstone \
--bin parallel_driver_send_dogstatsd --bin finally_verify_delivery && \
--bin parallel_driver_send_dogstatsd --bin finally_verify_delivery --bin eventually_adp_alive \
--bin first_sample_config && \
cp /tools/target/release/datadog-intake /usr/local/bin/datadog-intake && \
cp /tools/target/release/millstone /usr/local/bin/millstone && \
cp /tools/target/release/parallel_driver_send_dogstatsd /usr/local/bin/parallel_driver_send_dogstatsd && \
cp /tools/target/release/finally_verify_delivery /usr/local/bin/finally_verify_delivery
cp /tools/target/release/finally_verify_delivery /usr/local/bin/finally_verify_delivery && \
cp /tools/target/release/eventually_adp_alive /usr/local/bin/eventually_adp_alive && \
cp /tools/target/release/first_sample_config /usr/local/bin/first_sample_config

# ---------------------------------------------------------------------------
# Runtime: Agent Data Plane (SUT).
Expand All @@ -92,8 +99,12 @@ RUN apt-get update && \
COPY --from=adp-builder /usr/local/bin/agent-data-plane /usr/local/bin/agent-data-plane
# Expose DWARF/build-id symbols to Antithesis for symbolization (one-hop symlink to the unstripped binary).
RUN mkdir -p /symbols && ln -s /usr/local/bin/agent-data-plane /symbols/agent-data-plane
# main.rs requires the bootstrap config file to exist at the default path; ship a minimal standalone config.
# main.rs requires the bootstrap config file to exist at the default path; ship a minimal standalone
# config as a fallback. The boot wrapper overwrites it with the per-replay config written by the
# `first_sample_config` workload command onto the shared `agent-config` volume.
COPY test/antithesis/deploy/adp/datadog.yaml /etc/datadog-agent/datadog.yaml
Comment thread
blt marked this conversation as resolved.
Comment thread
blt marked this conversation as resolved.
# Boot wrapper: waits for the drawn config sentinel, copies the config into place, then execs ADP.
COPY --chmod=755 test/antithesis/deploy/adp/entrypoint.sh /entrypoint.sh
# ADP's control-plane secure API requires an IPC TLS cert (a single PEM holding both certificate and
# private key) that the Core Agent normally generates. In standalone mode there is no Core Agent, so
# generate a self-signed cert+key. An empty auth_token satisfies the IPC auth config at startup.
Expand All @@ -103,7 +114,7 @@ RUN openssl req -x509 -newkey rsa:2048 -nodes -days 3650 \
cat /tmp/ipc_cert.pem /tmp/ipc_key.pem > /etc/datadog-agent/ipc_cert.pem && \
rm -f /tmp/ipc_cert.pem /tmp/ipc_key.pem && \
touch /etc/datadog-agent/auth_token
ENTRYPOINT ["/usr/local/bin/agent-data-plane"]
ENTRYPOINT ["/entrypoint.sh"]
CMD ["run"]

# ---------------------------------------------------------------------------
Expand All @@ -128,7 +139,9 @@ COPY --from=tools-builder /usr/local/bin/millstone /usr/local/bin/millstone
COPY --chmod=755 test/antithesis/deploy/workload/setup-complete.sh /opt/antithesis/setup-complete.sh
COPY test/antithesis/deploy/workload/test/ /opt/antithesis/test/
# Inject the compiled test-command binaries into the "main" test template.
COPY --from=tools-builder --chmod=755 /usr/local/bin/first_sample_config /opt/antithesis/test/v1/main/first_sample_config
COPY --from=tools-builder --chmod=755 /usr/local/bin/parallel_driver_send_dogstatsd /opt/antithesis/test/v1/main/parallel_driver_send_dogstatsd
COPY --from=tools-builder --chmod=755 /usr/local/bin/finally_verify_delivery /opt/antithesis/test/v1/main/finally_verify_delivery
COPY --from=tools-builder --chmod=755 /usr/local/bin/eventually_adp_alive /opt/antithesis/test/v1/main/eventually_adp_alive
COPY --chmod=755 test/antithesis/deploy/workload/entrypoint.sh /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
21 changes: 21 additions & 0 deletions test/antithesis/deploy/adp/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/usr/bin/env bash
set -euo pipefail

# Boot wrapper for the Agent Data Plane container.
#
# The `first_sample_config` workload command writes this timeline's
# datadog.yaml and a `ready` sentinel to the shared config volume. This script
# polls for the sentinel, installs the config at the default path, and then
# replaces itself with a single ADP process.
#
# The wait deliberately has no timeout: exiting non-zero here would be read as
# an ADP crash. The message on stderr marks the start of the wait, so a release
# that never arrives shows up in triage as "waiting" with no boot instead of a
# silent hang.

CONFIG_DIR="${AGENT_CONFIG_DIR:-/agent-config}"
ready_sentinel="${CONFIG_DIR}/ready"

echo "adp: waiting for ${ready_sentinel} (released by first_sample_config)" >&2
until [ -f "${ready_sentinel}" ]; do
    sleep 1
done

cp "${CONFIG_DIR}/datadog.yaml" /etc/datadog-agent/datadog.yaml

# Hand the process over to ADP, forwarding the container's arguments unchanged.
exec /usr/local/bin/agent-data-plane "$@"
16 changes: 4 additions & 12 deletions test/antithesis/deploy/docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,11 @@ services:
DD_DATA_PLANE_DOGSTATSD_ENABLED: "true"
volumes:
- dogstatsd-socket:/var/run/datadog
# first_sample_config (workload) writes this timeline's datadog.yaml + ready sentinel here.
- agent-config:/agent-config:ro
depends_on:
intake:
condition: service_healthy
healthcheck:
# ADP's unprivileged API listens on TCP :5100 once the internal supervisor is up.
test: ["CMD-SHELL", "bash -c 'exec 3<>/dev/tcp/localhost/5100'"]
interval: 2s
timeout: 2s
retries: 60

workload:
container_name: workload
Expand All @@ -62,19 +58,15 @@ services:
image: workload:latest
environment:
NO_COLOR: "1"
ADP_HOST: "adp"
ADP_API_PORT: "5100"
DSD_SOCKET: "/var/run/datadog/dsd.socket"
INTAKE_ADDR: "intake:2049"
INTAKE_HOST: "intake"
INTAKE_PORT: "2049"
volumes:
- dogstatsd-socket:/var/run/datadog
- agent-config:/agent-config
depends_on:
adp:
condition: service_healthy
intake:
condition: service_healthy

volumes:
dogstatsd-socket:
agent-config:
48 changes: 3 additions & 45 deletions test/antithesis/deploy/workload/entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,51 +3,9 @@ set -euo pipefail

# Workload client entrypoint.
#
# By the time this runs, docker-compose has gated startup on the `adp` and `intake` services being
# healthy (depends_on: condition: service_healthy). We re-confirm reachability defensively, emit the
# Antithesis `setup_complete` signal, then idle so Antithesis can run test commands from the test
# template at /opt/antithesis/test/v1/.
# Gated on intake-healthy (compose `depends_on`). Emit `setup_complete`, then
# idle so Antithesis runs the test commands.
Comment thread
blt marked this conversation as resolved.

ADP_HOST="${ADP_HOST:-adp}"
ADP_API_PORT="${ADP_API_PORT:-5100}"
DSD_SOCKET="${DSD_SOCKET:-/var/run/datadog/dsd.socket}"
INTAKE_HOST="${INTAKE_HOST:-intake}"
INTAKE_PORT="${INTAKE_PORT:-2049}"

wait_for_tcp() {
local host="$1" port="$2" name="$3" tries=60
echo "Waiting for ${name} (${host}:${port})..."
while (( tries-- > 0 )); do
if (exec 3<>"/dev/tcp/${host}/${port}") 2>/dev/null; then
echo "${name} is reachable."
return 0
fi
sleep 1
done
echo "Timed out waiting for ${name} (${host}:${port})." >&2
return 1
}

wait_for_socket() {
local path="$1" name="$2" tries=60
echo "Waiting for ${name} (${path})..."
while (( tries-- > 0 )); do
if [[ -S "${path}" ]]; then
echo "${name} is reachable."
return 0
fi
sleep 1
done
echo "Timed out waiting for ${name} (${path})." >&2
return 1
}

wait_for_tcp "${ADP_HOST}" "${ADP_API_PORT}" "agent-data-plane API"
wait_for_socket "${DSD_SOCKET}" "agent-data-plane DogStatsD socket"
wait_for_tcp "${INTAKE_HOST}" "${INTAKE_PORT}" "datadog-intake"

echo "System is ready. Emitting setup_complete."
/opt/antithesis/setup-complete.sh

echo "Workload client idle; awaiting Antithesis test commands."
echo "setup_complete emitted; workload idle, awaiting Antithesis test commands."
exec tail -f /dev/null
4 changes: 4 additions & 0 deletions test/antithesis/harness/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,12 @@ clap = { workspace = true, features = [
"std",
"usage",
] }
num-traits = { workspace = true }
rand = { workspace = true }
rand_distr = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
serde_yaml = { workspace = true }

[lints.clippy]
all = "deny"
Expand Down
75 changes: 75 additions & 0 deletions test/antithesis/harness/src/bin/eventually_adp_alive.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
//! Antithesis `eventually_` liveness check: ADP booted and became reachable
//! within a bounded window.
//!
//! `eventually_` commands run in a fault-quiet period, so a node-fault-induced
//! kill of ADP does not trip this check, but a self-inflicted process exit
//! does. The check therefore fires on ADP's own bugs rather than on Antithesis
//! fault injection.
//!
//! We check two signals. First that ADP is reachable on :5100 and second that
//! it created a `DogStatsD` listener socket.

use std::net::{TcpStream, ToSocketAddrs};
use std::os::unix::fs::FileTypeExt;
use std::path::PathBuf;
use std::thread::sleep;
use std::time::Duration;

use antithesis_sdk::prelude::*;
use clap::{builder::NonEmptyStringValueParser, Parser};
use serde_json::json;

// Runtime configuration for the `eventually_adp_alive` check, parsed by clap
// from command-line flags with environment-variable fallbacks.
#[derive(Debug, Parser)]
#[command(name = "eventually_adp_alive")]
struct Config {
// Address of ADP's API that `main` probes over TCP. Set with `--adp-api-addr`
// or `ADP_API_ADDR`; defaults to `adp:5100`. Parsed with
// `NonEmptyStringValueParser`, so an empty value is rejected.
#[arg(
long = "adp-api-addr",
env = "ADP_API_ADDR",
default_value = "adp:5100",
value_parser = NonEmptyStringValueParser::new()
)]
adp_api_addr: String,
// Filesystem path where ADP's DogStatsD listener socket is expected. Set with
// `--dsd-socket` or `DSD_SOCKET`; defaults to `/var/run/datadog/dsd.socket`.
#[arg(
long = "dsd-socket",
env = "DSD_SOCKET",
default_value = "/var/run/datadog/dsd.socket"
)]
dsd_socket: PathBuf,
}

fn main() -> anyhow::Result<()> {
antithesis_init();
let config = Config::try_parse()?;

let mut api_reachable = false;
let mut socket_present = false;
// Check that the adp-api is reachable and the DogStatsD socket exists for
// about 60 seconds. A 1s connect timeout keeps the poll cadence bounded
// even when the API host is unresponsive.
for _ in 0..60 {
api_reachable = config
.adp_api_addr
.to_socket_addrs()
.ok()
.and_then(|mut addrs| addrs.next())
.is_some_and(|addr| TcpStream::connect_timeout(&addr, Duration::from_secs(1)).is_ok());
socket_present = config.dsd_socket.metadata().is_ok_and(|m| m.file_type().is_socket());
if api_reachable && socket_present {
break;
}
sleep(Duration::from_secs(1));
}
Comment thread
blt marked this conversation as resolved.

assert_always!(
api_reachable && socket_present,
"ADP booted: API reachable and DogStatsD socket present",
&json!({
"adp_api_addr": config.adp_api_addr,
"dsd_socket": config.dsd_socket.display().to_string(),
"api_reachable": api_reachable,
"socket_present": socket_present,
})
);

Ok(())
}
Loading
Loading