From 728c300d6b633ea1817a444969e2a94c8ea625a7 Mon Sep 17 00:00:00 2001 From: "Brian L. Troutwine" Date: Tue, 2 Jun 2026 14:31:51 +0000 Subject: [PATCH] =?UTF-8?q?test(antithesis):=20enable=20forwarder=20disk?= =?UTF-8?q?=20persistence=20=E2=80=94=20flags=20a=20log-amplification=20bu?= =?UTF-8?q?g?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sample forwarder_storage_max_size_in_bytes 50/50 on/off with forwarder_storage_path on a persistent compose volume, so the on-disk retry queue and restart-recovery paths run for the first time. BUG this branch surfaces: with persistence on, a network partition fills the disk-backed retry queue, and the forwarder logs error! per failed retry attempt (io.rs:462/472/421). Over a large backlog that is unbounded log amplification — it floods per-moment output, tripping 'very high output ... fail to materialize' at cx=134896 on run 4ecf6d1b, which masks other findings. The same path also opens the non-atomic torn-write hunt at persisted.rs:184 under node termination. --- test/antithesis/deploy/docker-compose.yaml | 3 +++ .../src/bin/first_sample_config/config.rs | 22 +++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/test/antithesis/deploy/docker-compose.yaml b/test/antithesis/deploy/docker-compose.yaml index 4a54c2b95b..b215f2f5a9 100644 --- a/test/antithesis/deploy/docker-compose.yaml +++ b/test/antithesis/deploy/docker-compose.yaml @@ -41,6 +41,8 @@ services: - dogstatsd-socket:/var/run/datadog # first_sample_config (workload) writes this timeline's datadog.yaml + ready sentinel here. - agent-config:/agent-config:ro + # Forwarder on-disk retry queue. Persists across node termination so restart can recover it. + - forwarder-storage:/var/lib/adp-storage depends_on: intake: condition: service_healthy @@ -69,3 +71,4 @@ services: volumes: dogstatsd-socket: agent-config: + forwarder-storage: diff --git a/test/antithesis/harness/src/bin/first_sample_config/config.rs b/test/antithesis/harness/src/bin/first_sample_config/config.rs index a1abe76b17..8591512074 100644 --- a/test/antithesis/harness/src/bin/first_sample_config/config.rs +++ b/test/antithesis/harness/src/bin/first_sample_config/config.rs @@ -176,6 +176,16 @@ fn sample_buffer_size(rng: &mut R) -> u64 { } } +/// Forwarder on-disk retry cap. Half the time a real size so disk persistence is +/// on and the persisted-retry path runs, half the time 0 for in-memory-only. +fn sample_storage_max_bytes(rng: &mut R) -> u64 { + if rng.random_ratio(1, 2) { + 0 + } else { + rng.random_range(1_048_576..=268_435_456) + } +} + impl DogStatsdConfig { /// Sample the `DogStatsD` options from `rng`, taking the socket from the /// environment. @@ -230,6 +240,16 @@ pub(crate) struct DatadogConfig { /// with [`Probe`] so it often lands small enough for the workload to reach /// and exercise the cap, and occasionally large to probe the headroom. aggregate_context_limit: u64, + /// Forwarder on-disk retry cap, `forwarder_storage_max_size_in_bytes`. ADP + /// defaults to 0, which disables disk persistence and leaves the persisted + /// retry path dead. Sampled half the time nonzero to turn persistence on, + /// half the time 0 to cover the in-memory-only path. + #[serde(rename = "forwarder_storage_max_size_in_bytes")] + forwarder_storage_max_size_bytes: u64, + /// Forwarder storage directory, `forwarder_storage_path`. A mounted volume + /// that survives node termination, so a restart can recover the queue. + #[serde(rename = "forwarder_storage_path")] + forwarder_storage_path: &'static str, /// `DogStatsD` options, flattened to top-level `dogstatsd_*` keys. #[serde(flatten)] dogstatsd: DogStatsdConfig, @@ -248,6 +268,8 @@ impl DatadogConfig { dd_url: dd_url.to_owned(), log_level: rng.random(), aggregate_context_limit: Probe.sample(rng), + forwarder_storage_max_size_bytes: sample_storage_max_bytes(rng), + forwarder_storage_path: "/var/lib/adp-storage", dogstatsd: DogStatsdConfig::sample(rng, dogstatsd_socket), } }