From d665ab80f91d76979daa2d6ce4dca0f12a77c9fc Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Thu, 21 May 2026 14:42:18 -0400 Subject: [PATCH 01/56] feat(airlock): add native process driver for non-containerized tests Introduces NativeProcess and NativeProcessConfig in airlock, mirroring the relevant surface of the Docker Driver but spawning a local binary instead of a container. Captured stdout/stderr lines flow through a small LogSink trait so consumers can bridge to their own log buffer types without coupling airlock to panoramic-specific code. This is the foundation for running ADP integration tests natively on macOS, where ADP runs as a real macOS process rather than inside a Linux container. --- Cargo.lock | 1 + bin/correctness/airlock/Cargo.toml | 3 + bin/correctness/airlock/src/lib.rs | 1 + bin/correctness/airlock/src/native.rs | 228 ++++++++++++++++++++++++++ 4 files changed, 233 insertions(+) create mode 100644 bin/correctness/airlock/src/native.rs diff --git a/Cargo.lock b/Cargo.lock index c31c548538e..65eaf3e3712 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -81,6 +81,7 @@ dependencies = [ "home", "saluki-error", "tokio", + "tokio-util", "tracing", ] diff --git a/bin/correctness/airlock/Cargo.toml b/bin/correctness/airlock/Cargo.toml index c7fb63c5843..f6e160e719b 100644 --- a/bin/correctness/airlock/Cargo.toml +++ b/bin/correctness/airlock/Cargo.toml @@ -15,8 +15,11 @@ home = { workspace = true } saluki-error = { workspace = true } tokio = { workspace = true, features = [ "fs", + "io-util", "macros", + "process", "rt", "rt-multi-thread", ] } +tokio-util = { workspace = true } tracing = { workspace = true } diff --git a/bin/correctness/airlock/src/lib.rs b/bin/correctness/airlock/src/lib.rs index 896cb080303..bcf68aa6d38 100644 --- a/bin/correctness/airlock/src/lib.rs +++ b/bin/correctness/airlock/src/lib.rs @@ -1,3 +1,4 @@ pub mod config; pub mod docker; pub mod driver; +pub mod native; diff --git a/bin/correctness/airlock/src/native.rs 
b/bin/correctness/airlock/src/native.rs new file mode 100644 index 00000000000..6f9c924a5ea --- /dev/null +++ b/bin/correctness/airlock/src/native.rs @@ -0,0 +1,228 @@ +//! Native process driver for non-containerized integration tests. +//! +//! This module mirrors the relevant surface of the Docker [`Driver`][crate::driver::Driver] but +//! spawns a local binary instead of a container. It exists so that integration tests can run on +//! macOS hosts where ADP is exercised as a real macOS process rather than inside a Linux +//! container. +//! +//! Only the small subset of the Docker driver surface needed by the panoramic native runner is +//! implemented: spawn, log capture, exit watching, and cleanup. + +use std::{collections::HashMap, path::PathBuf, process::Stdio, sync::Arc, time::Duration}; + +use saluki_error::{generic_error, ErrorContext as _, GenericError}; +use tokio::{ + io::{AsyncBufReadExt as _, AsyncRead, BufReader}, + process::{Child, Command}, + sync::Mutex, + task::JoinHandle, +}; +use tokio_util::sync::CancellationToken; +use tracing::{debug, warn}; + +/// Configuration for a native process to spawn. +#[derive(Clone)] +pub struct NativeProcessConfig { + /// Display name used for logs and reporting. + pub name: String, + /// Absolute path to the binary to execute. + pub binary_path: PathBuf, + /// Arguments passed to the binary. + pub args: Vec, + /// Environment variables to set for the process. + pub env: HashMap, + /// Working directory for the process. If `None`, inherits the caller's working directory. + pub working_dir: Option, +} + +impl NativeProcessConfig { + /// Creates a new configuration with the given display name and binary path. + pub fn new(name: impl Into, binary_path: impl Into) -> Self { + Self { + name: name.into(), + binary_path: binary_path.into(), + args: Vec::new(), + env: HashMap::new(), + working_dir: None, + } + } + + /// Sets the arguments for the process. 
+ pub fn with_args(mut self, args: Vec) -> Self { + self.args = args; + self + } + + /// Sets all environment variables for the process at once. + pub fn with_env_map(mut self, env: HashMap) -> Self { + self.env = env; + self + } + + /// Sets the working directory for the process. + #[allow(dead_code)] + pub fn with_working_dir(mut self, dir: PathBuf) -> Self { + self.working_dir = Some(dir); + self + } +} + +/// A trait-object-friendly sink for log lines captured from a native process. +/// +/// This is intentionally minimal so consumers can implement it on their own log buffer type +/// without coupling `airlock` to consumer-specific code. +pub trait LogSink: Send + Sync { + /// Pushes a captured log line. `is_stderr` is `true` for lines that came from the + /// process's stderr stream, `false` for stdout. + fn push_line(&mut self, line: String, is_stderr: bool); +} + +/// A spawned native process and its supporting tasks. +/// +/// `NativeProcess` owns the child process plus background tasks that pump stdout/stderr lines +/// into a shared sink and observe the child's exit. Calling [`cleanup`][Self::cleanup] kills the +/// child, joins the background tasks, and cancels the exit token. +pub struct NativeProcess { + name: String, + child: Option, + exit_token: CancellationToken, + log_tasks: Vec>, + exit_task: Option>, +} + +impl NativeProcess { + /// Spawns the process described by `config`. The provided `log_sink` receives each line of + /// captured stdout/stderr; `exit_token` is currently cancelled only by [`Self::cleanup`].
+ pub async fn spawn( + config: NativeProcessConfig, log_sink: Arc>, exit_token: CancellationToken, + ) -> Result { + if !config.binary_path.exists() { + return Err(generic_error!( + "Binary not found at expected path: {}", + config.binary_path.display() + )); + } + + let mut cmd = Command::new(&config.binary_path); + cmd.args(&config.args) + .envs(&config.env) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .kill_on_drop(true); + if let Some(ref wd) = config.working_dir { + cmd.current_dir(wd); + } + + let mut child = cmd + .spawn() + .with_error_context(|| format!("Failed to spawn '{}'.", config.binary_path.display()))?; + + let stdout = child + .stdout + .take() + .ok_or_else(|| generic_error!("Failed to capture stdout."))?; + let stderr = child + .stderr + .take() + .ok_or_else(|| generic_error!("Failed to capture stderr."))?; + + let stdout_task = spawn_log_pump(stdout, log_sink.clone(), false); + let stderr_task = spawn_log_pump(stderr, log_sink, true); + + // We don't move the child here, so the actual exit observation happens in `cleanup` or + // `wait_with_timeout`. The exit_task is kept as a placeholder so future implementations + // can attach a SIGCHLD-style notifier without changing the public API. + let name_for_watcher = config.name.clone(); + let exit_token_for_watcher = exit_token.clone(); + let exit_task = tokio::spawn(async move { + debug!(name = %name_for_watcher, "Native process exit watcher placeholder; exit observation happens in cleanup."); + exit_token_for_watcher.cancelled().await; + }); + + Ok(Self { + name: config.name, + child: Some(child), + exit_token, + log_tasks: vec![stdout_task, stderr_task], + exit_task: Some(exit_task), + }) + } + + /// Returns the display name of the process. + pub fn name(&self) -> &str { + &self.name + } + + /// Returns a handle to the cancellation token that fires when the process exits. 
+ pub fn exit_token(&self) -> CancellationToken { + self.exit_token.clone() + } + + /// Waits for the process to exit, killing it if `timeout` elapses first. + /// + /// Returns the exit code if available, `None` if the process was terminated by signal. + #[allow(dead_code)] + pub async fn wait_with_timeout(&mut self, timeout: Duration) -> Result, GenericError> { + let child = self + .child + .as_mut() + .ok_or_else(|| generic_error!("Process already cleaned up."))?; + match tokio::time::timeout(timeout, child.wait()).await { + Ok(Ok(status)) => Ok(status.code()), + Ok(Err(e)) => Err(generic_error!("Failed to wait for process: {}", e)), + Err(_) => { + let _ = child.kill().await; + let _ = child.wait().await; + Err(generic_error!("Process did not exit within timeout.")) + } + } + } + + /// Kills the child, joins background tasks, and cancels the exit token. + pub async fn cleanup(mut self) { + if let Some(mut child) = self.child.take() { + let _ = child.kill().await; + let _ = child.wait().await; + } + self.exit_token.cancel(); + if let Some(handle) = self.exit_task.take() { + let _ = handle.await; + } + for handle in self.log_tasks.drain(..) { + let _ = handle.await; + } + } +} + +impl Drop for NativeProcess { + fn drop(&mut self) { + if self.child.is_some() { + warn!( + name = %self.name, + "NativeProcess dropped without explicit cleanup; child will be killed via kill_on_drop." 
+ ); + } + } +} + +fn spawn_log_pump(reader: R, sink: Arc>, is_stderr: bool) -> JoinHandle<()> +where + R: AsyncRead + Unpin + Send + 'static, +{ + let mut lines = BufReader::new(reader).lines(); + tokio::spawn(async move { + loop { + match lines.next_line().await { + Ok(Some(line)) => { + let mut sink = sink.lock().await; + sink.push_line(line, is_stderr); + } + Ok(None) => break, + Err(e) => { + debug!(error = %e, "Log pump read error; stopping."); + break; + } + } + } + }) +} From 67f1776daa0b3782ea02d318a0862305b8b7a96a Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Thu, 21 May 2026 14:48:01 -0400 Subject: [PATCH 02/56] feat(panoramic): add native_macos runtime for integration tests Adds a 'runtimes' field to IntegrationConfig (default ['docker']). At test discovery time, configs that declare multiple runtimes expand into one Test instance per runtime, each named '{base_name}/{runtime}'. The Test trait impl for IntegrationConfig dispatches to either the existing Docker IntegrationRunner or the new NativeIntegrationRunner based on the resolved runtime. NativeIntegrationRunner spawns the ADP binary directly via tokio::process, piping stdout/stderr into the same LogBuffer the Docker path uses so existing assertions work unchanged. The binary path is discovered via the ADP_BINARY_PATH env var, defaulting to target/release/agent-data-plane. The Docker code path is untouched; declaring 'runtimes: [docker]' (or omitting the field) produces the existing behavior. 
--- bin/correctness/panoramic/src/config.rs | 80 +++++- bin/correctness/panoramic/src/main.rs | 1 + .../panoramic/src/native_runner.rs | 249 ++++++++++++++++++ 3 files changed, 325 insertions(+), 5 deletions(-) create mode 100644 bin/correctness/panoramic/src/native_runner.rs diff --git a/bin/correctness/panoramic/src/config.rs b/bin/correctness/panoramic/src/config.rs index 9120370ef51..541f08614c9 100644 --- a/bin/correctness/panoramic/src/config.rs +++ b/bin/correctness/panoramic/src/config.rs @@ -114,11 +114,36 @@ pub struct IntegrationConfig { /// List of assertion steps to run. pub assertions: Vec, + /// Runtimes under which this test runs. + /// + /// Each value must be either `"docker"` (the default) or `"native_macos"`. When multiple + /// runtimes are declared, the test discovery layer expands the config into one independent + /// test case per runtime, named `{name}/{runtime}`. + #[serde(default = "default_integration_runtimes")] + pub runtimes: Vec, + + /// Resolved runtime for this specific test instance after discovery-time expansion. + /// + /// At parse time, this is always empty. The discovery layer sets it when expanding a + /// multi-runtime config into per-runtime instances. + #[serde(skip)] + pub resolved_runtime: String, + /// Base path for resolving relative file paths. #[serde(skip)] pub base_path: PathBuf, } +fn default_integration_runtimes() -> Vec { + vec!["docker".to_string()] +} + +/// Runtime identifier for integration tests that run as native (non-containerized) processes. +pub const NATIVE_MACOS_RUNTIME: &str = "native_macos"; + +/// Runtime identifier for integration tests that run inside a Docker container. +pub const DOCKER_RUNTIME: &str = "docker"; + /// Container configuration for a test case. 
#[derive(Clone, Debug, Deserialize)] pub struct ContainerConfig { @@ -350,7 +375,11 @@ impl AssertionStep { #[async_trait] impl Test for IntegrationConfig { fn name(&self) -> String { - self.name.clone() + if self.resolved_runtime.is_empty() || self.runtimes.len() <= 1 { + self.name.clone() + } else { + format!("{}/{}", self.name, self.resolved_runtime) + } } fn suite(&self) -> TestSuite { @@ -367,13 +396,33 @@ impl Test for IntegrationConfig { fn images(&self) -> BTreeMap<&str, String> { let mut m = BTreeMap::new(); - m.insert("container", self.container.image.clone()); + // The native_macos runtime doesn't require any container image. + if self.resolved_runtime != NATIVE_MACOS_RUNTIME { + m.insert("container", self.container.image.clone()); + } m } + fn runtime(&self) -> String { + if self.resolved_runtime.is_empty() { + DOCKER_RUNTIME.to_string() + } else { + self.resolved_runtime.clone() + } + } + async fn run(&self, tctx: TestContext) -> TestResult { - let mut runner = crate::runner::IntegrationRunner::new(self.clone(), tctx); - runner.run().await + match self.resolved_runtime.as_str() { + NATIVE_MACOS_RUNTIME => { + let mut runner = crate::native_runner::NativeIntegrationRunner::new(self.clone(), tctx); + runner.run().await + } + // Default to the existing Docker path for "docker" or unset. 
+ _ => { + let mut runner = crate::runner::IntegrationRunner::new(self.clone(), tctx); + runner.run().await + } + } } } @@ -685,7 +734,28 @@ fn try_load_test(config_path: &Path, dir_path: &Path) -> Result { let config = IntegrationConfig::from_yaml(config_path)?; - Ok(vec![Box::new(config)]) + if config.runtimes.is_empty() { + return Err(generic_error!( + "integration test '{}' has empty runtimes list", + config.name + )); + } + let mut tests: Vec> = Vec::new(); + for runtime in &config.runtimes { + if runtime != DOCKER_RUNTIME && runtime != NATIVE_MACOS_RUNTIME { + return Err(generic_error!( + "integration test '{}' declares unknown runtime '{}' (expected '{}' or '{}')", + config.name, + runtime, + DOCKER_RUNTIME, + NATIVE_MACOS_RUNTIME + )); + } + let mut variant = config.clone(); + variant.resolved_runtime = runtime.clone(); + tests.push(Box::new(variant)); + } + Ok(tests) } "correctness" => { let config_path_str = config_path diff --git a/bin/correctness/panoramic/src/main.rs b/bin/correctness/panoramic/src/main.rs index 9899aa0c536..1d926616ef9 100644 --- a/bin/correctness/panoramic/src/main.rs +++ b/bin/correctness/panoramic/src/main.rs @@ -32,6 +32,7 @@ use self::events::{create_event_channel, TestEvent}; mod reporter; use self::reporter::{OutputFormat, Reporter, TestResult, TestSuiteResult}; +mod native_runner; mod runner; mod test; mod tui; diff --git a/bin/correctness/panoramic/src/native_runner.rs b/bin/correctness/panoramic/src/native_runner.rs new file mode 100644 index 00000000000..75a1bf350e6 --- /dev/null +++ b/bin/correctness/panoramic/src/native_runner.rs @@ -0,0 +1,249 @@ +//! Native-process integration test runner. +//! +//! This runner is the parallel of [`crate::runner::IntegrationRunner`] but for tests declared +//! with the `native_macos` runtime. Instead of building a Docker container, it spawns a binary +//! directly via [`airlock::native::NativeProcess`] and feeds its stdout/stderr into the same +//!
[`LogBuffer`][crate::assertions::LogBuffer] used by the Docker path so the assertions work +//! unchanged. +//! +//! # Scope +//! +//! Initial scope is ADP-standalone tests: a single binary, no Core Agent, no IPC. The binary +//! path is discovered via the `ADP_BINARY_PATH` env var, falling back to +//! `target/release/agent-data-plane` (resolved relative to the current working directory). + +use std::{ + collections::HashMap, + path::PathBuf, + sync::Arc, + time::{Duration, Instant}, +}; + +use airlock::native::{LogSink, NativeProcess, NativeProcessConfig}; +use saluki_error::{ErrorContext as _, GenericError}; +use tokio::sync::{Mutex, RwLock}; +use tokio_util::sync::CancellationToken; +use tracing::{debug, error, info}; + +use crate::{ + assertions::{create_assertion, AssertionContext, AssertionResult, LogBuffer}, + config::{AssertionStep, IntegrationConfig}, + reporter::{PhaseTiming, TestResult}, + test::{Test, TestContext}, +}; + +const ADP_BINARY_ENV_VAR: &str = "ADP_BINARY_PATH"; +const DEFAULT_ADP_BINARY_PATH: &str = "target/release/agent-data-plane"; + +/// Runner for a single native-process integration test case. +pub(crate) struct NativeIntegrationRunner { + test_case: IntegrationConfig, + tctx: TestContext, + log_buffer: Arc>, +} + +impl NativeIntegrationRunner { + /// Creates a new runner for the given test case. + pub(crate) fn new(test_case: IntegrationConfig, tctx: TestContext) -> Self { + Self { + test_case, + tctx, + log_buffer: Arc::new(RwLock::new(LogBuffer::default())), + } + } + + /// Runs the test case and returns the result. + pub(crate) async fn run(&mut self) -> TestResult { + let started = Instant::now(); + let test_name = self.test_case.name(); + let mut phase_timings = Vec::new(); + + info!(test = %test_name, "Starting native integration test case."); + + // Phase: resolve binary path. 
+ let binary_path = match resolve_adp_binary_path() { + Ok(p) => p, + Err(e) => return make_error_result(test_name, started, "resolve_binary", e, phase_timings), + }; + debug!(test = %test_name, binary = %binary_path.display(), "Resolved ADP binary path."); + + // Phase: spawn the process. + let spawn_start = Instant::now(); + let exit_token = CancellationToken::new(); + let log_sink: Arc> = Arc::new(Mutex::new(NativeLogSink { + buf: self.log_buffer.clone(), + })); + + let process_config = NativeProcessConfig::new(self.test_case.name.clone(), binary_path) + .with_args(vec!["run".to_string()]) + .with_env_map(self.test_case.container.env.clone()); + + let process = match NativeProcess::spawn(process_config, log_sink, exit_token.clone()).await { + Ok(p) => p, + Err(e) => { + phase_timings.push(PhaseTiming { + phase: "spawn".to_string(), + duration: spawn_start.elapsed(), + }); + return make_error_result(test_name, started, "spawn", e, phase_timings); + } + }; + phase_timings.push(PhaseTiming { + phase: "spawn".to_string(), + duration: spawn_start.elapsed(), + }); + + info!(test = %test_name, "Native process started."); + + // Phase: run assertions. + let assertion_start = Instant::now(); + let assertion_results = self + .run_assertions(process.name().to_string(), exit_token.clone()) + .await; + phase_timings.push(PhaseTiming { + phase: "assertions".to_string(), + duration: assertion_start.elapsed(), + }); + + // Phase: cleanup. 
+ let cleanup_start = Instant::now(); + process.cleanup().await; + phase_timings.push(PhaseTiming { + phase: "cleanup".to_string(), + duration: cleanup_start.elapsed(), + }); + + let passed = assertion_results.iter().all(|r| r.passed); + TestResult { + name: test_name, + passed, + duration: started.elapsed(), + assertion_results, + error: None, + phase_timings, + assertion_details: Vec::new(), + } + } + + async fn run_assertions( + &self, process_display_name: String, exit_token: CancellationToken, + ) -> Vec { + let mut results = Vec::new(); + let cancel_token = self.tctx.test_cancel_token(); + + for step in &self.test_case.assertions { + match step { + AssertionStep::Single(cfg) => { + let assertion = match create_assertion(cfg) { + Ok(a) => a, + Err(e) => { + results.push(AssertionResult { + name: "create_assertion".to_string(), + passed: false, + message: format!("Failed to create assertion: {}", e), + duration: Duration::ZERO, + }); + continue; + } + }; + let ctx = AssertionContext { + log_buffer: self.log_buffer.clone(), + container_exit_token: exit_token.clone(), + cancel_token: cancel_token.clone(), + container_name: process_display_name.clone(), + port_mappings: HashMap::new(), + }; + results.push(assertion.check(&ctx).await); + } + AssertionStep::Parallel { parallel } => { + let mut futures = Vec::with_capacity(parallel.len()); + for cfg in parallel { + match create_assertion(cfg) { + Ok(a) => { + let ctx = AssertionContext { + log_buffer: self.log_buffer.clone(), + container_exit_token: exit_token.clone(), + cancel_token: cancel_token.clone(), + container_name: process_display_name.clone(), + port_mappings: HashMap::new(), + }; + futures.push(async move { a.check(&ctx).await }); + } + Err(e) => { + results.push(AssertionResult { + name: "create_assertion".to_string(), + passed: false, + message: format!("Failed to create parallel assertion: {}", e), + duration: Duration::ZERO, + }); + } + } + } + let parallel_results = 
futures::future::join_all(futures).await; + results.extend(parallel_results); + } + } + } + + results + } +} + +fn resolve_adp_binary_path() -> Result { + let raw = std::env::var(ADP_BINARY_ENV_VAR) + .ok() + .map(PathBuf::from) + .unwrap_or_else(|| PathBuf::from(DEFAULT_ADP_BINARY_PATH)); + + raw.canonicalize().with_error_context(|| { + format!( + "ADP binary not found at '{}'. Set {} or run `cargo build --release --bin agent-data-plane`.", + raw.display(), + ADP_BINARY_ENV_VAR + ) + }) +} + +fn make_error_result( + name: String, started: Instant, phase: &str, e: GenericError, phase_timings: Vec, +) -> TestResult { + error!(test = %name, error = %e, phase, "Native integration test setup failed."); + TestResult { + name, + passed: false, + duration: started.elapsed(), + assertion_results: vec![], + error: Some(format!("Failed in phase '{}': {}", phase, e)), + phase_timings, + assertion_details: vec![], + } +} + +/// Bridges [`airlock::native::LogSink`] to the panoramic [`LogBuffer`]. +struct NativeLogSink { + buf: Arc>, +} + +impl LogSink for NativeLogSink { + fn push_line(&mut self, line: String, is_stderr: bool) { + // Try a non-blocking write first. If contended, spawn a task to defer the write so we + // don't stall the log pump (which is itself a tokio task). 
+ if let Ok(mut buf) = self.buf.try_write() { + if is_stderr { + buf.stderr.push(line); + } else { + buf.stdout.push(line); + } + } else { + let buf = self.buf.clone(); + tokio::spawn(async move { + let mut buf = buf.write().await; + if is_stderr { + buf.stderr.push(line); + } else { + buf.stdout.push(line); + } + }); + } + } +} From edca625e0a5dadadb44da9f296df58c63dd993ac Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Thu, 21 May 2026 14:49:34 -0400 Subject: [PATCH 03/56] test(integration): enable basic-startup on native_macos runtime basic-startup is a single-process standalone ADP test (no Core Agent required), making it the simplest integration test to validate the new native_macos runtime end-to-end. The same config now runs in both Docker (on Linux) and as a native process (on macOS). --- test/integration/cases/basic-startup/config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/test/integration/cases/basic-startup/config.yaml b/test/integration/cases/basic-startup/config.yaml index d84e802305f..cd1e1adc7c6 100644 --- a/test/integration/cases/basic-startup/config.yaml +++ b/test/integration/cases/basic-startup/config.yaml @@ -2,6 +2,7 @@ type: integration name: "basic-startup" description: "Verifies ADP starts successfully and remains stable" timeout: 120s +runtimes: [docker, native_macos] container: image: "saluki-images/datadog-agent:testing-devel" From 5c6d5df278c281eb4542bdfa6488f192a899599b Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Thu, 21 May 2026 14:52:38 -0400 Subject: [PATCH 04/56] build: add test-integration-macos make target for native macOS tests Adds two new Makefile targets: - build-adp-native: builds agent-data-plane in release mode for the current host. On macOS this produces a native macOS binary. - test-integration-macos: builds panoramic and ADP, then runs panoramic filtered to a single native_macos integration test (defaulting to basic-startup/native_macos, overridable via CASE=...). 
This works on any macOS host with cargo + rustc, no Docker required. Verified locally on Apple Silicon with basic-startup/native_macos passing in ~11s. --- Makefile | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/Makefile b/Makefile index 49ecf52ebd7..c05672de7b2 100644 --- a/Makefile +++ b/Makefile @@ -569,6 +569,21 @@ list-integration-tests: build-panoramic list-integration-tests: ## Lists available ADP integration tests @target/release/panoramic list -d $(shell pwd)/test/integration/cases +.PHONY: build-adp-native +build-adp-native: check-rust-build-tools +build-adp-native: ## Builds the agent-data-plane binary natively for the current host (release profile) + @echo "[*] Building agent-data-plane (release, native host target)..." + @cargo build --release --bin agent-data-plane + +.PHONY: test-integration-macos +test-integration-macos: build-panoramic build-adp-native +test-integration-macos: ## Runs ADP integration tests natively on macOS (no Docker) + @echo "[*] Running native macOS integration tests..." 
+ @ADP_BINARY_PATH=$(shell pwd)/target/release/agent-data-plane \ + target/release/panoramic run -d $(shell pwd)/test/integration/cases \ + -t $(if $(CASE),$(CASE),basic-startup/native_macos) --no-tui \ + $(if $(PANORAMIC_LOG_DIR),-l $(PANORAMIC_LOG_DIR)) + .PHONY: ensure-rust-miri ensure-rust-miri: ifeq ($(shell command -v rustup >/dev/null || echo not-found), not-found) From 7539f276702b531ed371eb9ce894d0bf3085c9e5 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Thu, 21 May 2026 14:53:37 -0400 Subject: [PATCH 05/56] docs: add macOS native integration tests implementation plan --- ...26-05-21-macos-native-integration-tests.md | 941 ++++++++++++++++++ 1 file changed, 941 insertions(+) create mode 100644 docs/superpowers/plans/2026-05-21-macos-native-integration-tests.md diff --git a/docs/superpowers/plans/2026-05-21-macos-native-integration-tests.md b/docs/superpowers/plans/2026-05-21-macos-native-integration-tests.md new file mode 100644 index 00000000000..85a35d80070 --- /dev/null +++ b/docs/superpowers/plans/2026-05-21-macos-native-integration-tests.md @@ -0,0 +1,941 @@ +# macOS Native Integration Tests Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Enable a single integration test (`basic-startup`) to run as a native macOS process via panoramic, in a way that works both on the existing bare-metal `macos:sonoma-arm64` CI runner and locally on a developer's macOS machine. + +**Architecture:** Reuse the existing `Test::runtime()` mechanism (currently `"docker"` or `"kubernetes_in_docker"`) by adding a new `"native_macos"` runtime. At test discovery time, expand each `IntegrationConfig` with multiple declared runtimes into one `Test` instance per runtime. 
A new `NativeIntegrationRunner` in panoramic handles the `native_macos` case by spawning the ADP binary directly via `tokio::process::Command` and using the existing assertion framework. The existing Docker path is untouched. + +**Tech Stack:** Rust, Tokio, the existing `panoramic` test runner and `airlock` driver crate. + +**Scope:** Only `basic-startup` on macOS. Only the standalone ADP path (no Core Agent, no IPC). No Tart wrapper in this PR — `make` target works directly on a macOS host; a follow-up PR can add Tart for non-macOS local dev. No CI job in this PR — that's a follow-up that depends on a separate `build-adp-macos-binary` job. This PR proves the end-to-end design works locally on macOS. + +--- + +## File Structure + +**New files:** +- `bin/correctness/airlock/src/native.rs` — `NativeProcess` abstraction (spawn, log capture, exit watch, cleanup) +- `bin/correctness/panoramic/src/native_runner.rs` — `NativeIntegrationRunner` analogous to the existing `IntegrationRunner` but for the native-process path +- `docs/superpowers/plans/2026-05-21-macos-native-integration-tests.md` — this plan + +**Modified files:** +- `bin/correctness/airlock/src/lib.rs` — export `native` module +- `bin/correctness/panoramic/src/config.rs` — add `runtimes: Vec<String>` field to `IntegrationConfig`; dispatch `run()` based on the per-instance runtime +- `bin/correctness/panoramic/src/test.rs` — at discovery, expand multi-runtime integration configs into one `Test` per runtime +- `bin/correctness/panoramic/src/main.rs` — wire the new module +- `bin/correctness/panoramic/src/assertions/mod.rs` — `AssertionContext` already has `container_name` and `port_mappings`; for native, `container_name` doubles as the process display name and `port_mappings` is identity (no remapping needed). No code change expected here, but verify.
+- `test/integration/cases/basic-startup/config.yaml` — add `runtimes: [docker, native_macos]` +- `Makefile` — add `test-integration-macos` target + +**Files NOT touched in this PR (deferred):** +- `bin/correctness/panoramic/src/runner.rs` (the existing `IntegrationRunner`) — leave alone to keep the Linux path zero-risk +- `bin/correctness/panoramic/src/assertions/file_contains.rs` — only one test in the corpus uses it, not basic-startup +- `tooling/generate-correctness-pipeline.sh` — CI pipeline gen, deferred to a follow-up +- `.gitlab/` files — CI integration deferred + +--- + +## Conventions + +- The ADP binary location is discovered via the `ADP_BINARY_PATH` env var, falling back to `target/release/agent-data-plane` relative to the panoramic working directory. +- Per-test process output (stdout + stderr) is captured into the existing `LogBuffer` exactly the same way the Docker path does, so the existing assertions work unchanged. +- The native runner respects the existing `TestContext` cancel token and writes per-test logs into the existing `log_dir` structure. + +--- + +## Task 1: Add the `NativeProcess` abstraction in `airlock` + +**Files:** +- Create: `bin/correctness/airlock/src/native.rs` +- Modify: `bin/correctness/airlock/src/lib.rs:1-3` +- Test: covered by integration end-to-end (no unit test for this initial slice; the structure is mostly straight `tokio::process` wiring that's easier to exercise through panoramic) + +- [ ] **Step 1: Create the `native.rs` module skeleton** + +Create `bin/correctness/airlock/src/native.rs`: + +```rust +//! Native process driver for non-containerized integration tests. +//! +//! This module mirrors the surface of the Docker [`Driver`][crate::driver::Driver] but spawns a +//! local binary instead of a container. It exists so that integration tests can run on macOS +//! hosts where ADP is exercised as a real macOS process rather than inside a Linux container. +//! +//! 
Only the small subset of the Docker driver surface needed by the panoramic +//! `NativeIntegrationRunner` is implemented: spawn, log capture, exit watching, and cleanup. + +use std::{ + collections::HashMap, + path::PathBuf, + process::Stdio, + sync::Arc, + time::Duration, +}; + +use saluki_error::{generic_error, ErrorContext as _, GenericError}; +use tokio::{ + io::{AsyncBufReadExt as _, BufReader}, + process::{Child, Command}, + sync::Mutex, + task::JoinHandle, +}; +use tokio_util::sync::CancellationToken; +use tracing::{debug, warn}; + +/// Configuration for a native process to spawn. +#[derive(Clone)] +pub struct NativeProcessConfig { + /// Display name used for logs and reporting. + pub name: String, + /// Absolute path to the binary to execute. + pub binary_path: PathBuf, + /// Arguments passed to the binary. + pub args: Vec, + /// Environment variables to set for the process. + pub env: HashMap, + /// Working directory for the process. If `None`, inherits panoramic's working directory. + pub working_dir: Option, +} + +impl NativeProcessConfig { + /// Creates a new configuration with the given display name and binary path. + pub fn new(name: impl Into, binary_path: impl Into) -> Self { + Self { + name: name.into(), + binary_path: binary_path.into(), + args: Vec::new(), + env: HashMap::new(), + working_dir: None, + } + } + + /// Sets the arguments for the process. + pub fn with_args(mut self, args: Vec) -> Self { + self.args = args; + self + } + + /// Sets an environment variable for the process. + pub fn with_env(mut self, key: impl Into, value: impl Into) -> Self { + self.env.insert(key.into(), value.into()); + self + } + + /// Sets all environment variables for the process at once. + pub fn with_env_map(mut self, env: HashMap) -> Self { + self.env = env; + self + } + + /// Sets the working directory for the process. 
+ pub fn with_working_dir(mut self, dir: PathBuf) -> Self { + self.working_dir = Some(dir); + self + } +} + +/// A spawned native process and its supporting tasks. +/// +/// `NativeProcess` owns the child process plus background tasks that pump stdout/stderr lines +/// into a shared buffer and observe the child's exit. Dropping or explicitly calling +/// [`cleanup`][Self::cleanup] kills the child and joins the background tasks. +pub struct NativeProcess { + name: String, + child: Option, + exit_token: CancellationToken, + log_tasks: Vec>, + exit_task: Option>, +} + +impl NativeProcess { + /// Spawns the process described by `config`. The provided `log_sink` receives each line of + /// captured stdout/stderr; the provided `exit_token` is cancelled when the process exits. + pub async fn spawn( + config: NativeProcessConfig, + log_sink: Arc>, + exit_token: CancellationToken, + ) -> Result { + if !config.binary_path.exists() { + return Err(generic_error!( + "Binary not found at expected path: {}", + config.binary_path.display() + )); + } + + let mut cmd = Command::new(&config.binary_path); + cmd.args(&config.args) + .envs(&config.env) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .kill_on_drop(true); + if let Some(ref wd) = config.working_dir { + cmd.current_dir(wd); + } + + let mut child = cmd + .spawn() + .with_error_context(|| format!("Failed to spawn '{}'.", config.binary_path.display()))?; + + let stdout = child + .stdout + .take() + .ok_or_else(|| generic_error!("Failed to capture stdout."))?; + let stderr = child + .stderr + .take() + .ok_or_else(|| generic_error!("Failed to capture stderr."))?; + + let stdout_task = spawn_log_pump(stdout, log_sink.clone(), false); + let stderr_task = spawn_log_pump(stderr, log_sink, true); + + let exit_token_for_watcher = exit_token.clone(); + let name_for_watcher = config.name.clone(); + let exit_task = tokio::spawn(async move { + // Wait for the child to exit; we cannot move it out of the struct here, so the + // exit 
watching is done in `cleanup`. This task is only used as a placeholder if + // we add SIGCHLD-style observation later. For now, the exit token is fired in + // `cleanup` after `child.wait().await`. + debug!(name = %name_for_watcher, "Native process exit watcher placeholder."); + exit_token_for_watcher.cancelled().await; + }); + + Ok(Self { + name: config.name, + child: Some(child), + exit_token, + log_tasks: vec![stdout_task, stderr_task], + exit_task: Some(exit_task), + }) + } + + /// Returns the display name of the process. + pub fn name(&self) -> &str { + &self.name + } + + /// Returns a handle to the cancellation token that fires when the process exits. + pub fn exit_token(&self) -> CancellationToken { + self.exit_token.clone() + } + + /// Waits for the process to exit. If `timeout` elapses first, the process is killed. + /// + /// Returns the exit code if available, or `None` if the process was killed by signal. + pub async fn wait_with_timeout(&mut self, timeout: Duration) -> Result, GenericError> { + let child = self + .child + .as_mut() + .ok_or_else(|| generic_error!("Process already cleaned up."))?; + match tokio::time::timeout(timeout, child.wait()).await { + Ok(Ok(status)) => Ok(status.code()), + Ok(Err(e)) => Err(generic_error!("Failed to wait for process: {}", e)), + Err(_) => { + let _ = child.kill().await; + let _ = child.wait().await; + Err(generic_error!("Process did not exit within timeout.")) + } + } + } + + /// Kills the child, joins background tasks, and cancels the exit token. + pub async fn cleanup(mut self) { + if let Some(mut child) = self.child.take() { + let _ = child.kill().await; + let _ = child.wait().await; + } + self.exit_token.cancel(); + if let Some(handle) = self.exit_task.take() { + let _ = handle.await; + } + for handle in self.log_tasks.drain(..) 
{ + let _ = handle.await; + } + } +} + +impl Drop for NativeProcess { + fn drop(&mut self) { + if self.child.is_some() { + warn!( + name = %self.name, + "NativeProcess dropped without explicit cleanup; child will be killed via kill_on_drop." + ); + } + } +} + +/// A trait-object-friendly sink for log lines captured from a native process. +/// +/// This is intentionally minimal so panoramic's existing `LogBuffer` can implement it without +/// `airlock` depending on any panoramic-specific types. +pub trait LogSink: Send + Sync { + fn push_line(&mut self, line: String, is_stderr: bool); +} + +fn spawn_log_pump( + reader: R, + sink: Arc>, + is_stderr: bool, +) -> JoinHandle<()> +where + R: tokio::io::AsyncRead + Unpin + Send + 'static, +{ + let mut lines = BufReader::new(reader).lines(); + tokio::spawn(async move { + loop { + match lines.next_line().await { + Ok(Some(line)) => { + let mut sink = sink.lock().await; + sink.push_line(line, is_stderr); + } + Ok(None) => break, + Err(e) => { + debug!(error = %e, "Log pump read error; stopping."); + break; + } + } + } + }) +} +``` + +- [ ] **Step 2: Export the module from `airlock`** + +Modify `bin/correctness/airlock/src/lib.rs`: + +```rust +pub mod config; +pub mod docker; +pub mod driver; +pub mod native; +``` + +- [ ] **Step 3: Compile and verify** + +Run: `cd bin/correctness && cargo check -p airlock` +Expected: clean compile, no errors. + +- [ ] **Step 4: Commit** + +```bash +git add bin/correctness/airlock/src/native.rs bin/correctness/airlock/src/lib.rs +git commit -m "feat(airlock): add native process driver for non-containerized tests" +``` + +--- + +## Task 2: Bridge the existing `LogBuffer` to the `LogSink` trait + +**Files:** +- Modify: `bin/correctness/panoramic/src/assertions/mod.rs` (around `LogBuffer` definition) + +The Docker path populates `LogBuffer` via bollard `LogOutput`. The native path needs to populate the same `LogBuffer` via the `LogSink` trait so the existing assertions work unchanged. 
+ +- [ ] **Step 1: Inspect the current `LogBuffer`** + +Run: `grep -n "pub struct LogBuffer\|impl LogBuffer\|push" bin/correctness/panoramic/src/assertions/mod.rs` +Note the existing API so the trait implementation matches. + +- [ ] **Step 2: Implement `LogSink` for `LogBuffer`** + +Add to `bin/correctness/panoramic/src/assertions/mod.rs`, after the existing `impl LogBuffer { ... }` block: + +```rust +impl airlock::native::LogSink for LogBuffer { + fn push_line(&mut self, line: String, is_stderr: bool) { + // Match the existing Docker log capture format: each entry is the raw line. The + // is_stderr flag is currently informational only. + let _ = is_stderr; + self.lines.push(line); + } +} +``` + +(Adjust field name `lines` if the struct uses something different — verify in Step 1.) + +- [ ] **Step 3: Verify it compiles** + +Run: `cd bin/correctness && cargo check -p panoramic` +Expected: clean compile. + +- [ ] **Step 4: Commit** + +```bash +git add bin/correctness/panoramic/src/assertions/mod.rs +git commit -m "feat(panoramic): implement LogSink for LogBuffer" +``` + +--- + +## Task 3: Add `runtimes` field to `IntegrationConfig` and expand at discovery + +**Files:** +- Modify: `bin/correctness/panoramic/src/config.rs` (`IntegrationConfig` struct, deserialization, `Test` impl) +- Modify: `bin/correctness/panoramic/src/test.rs` (`try_load_test` for `integration` type) + +- [ ] **Step 1: Add the `runtimes` field** + +Modify the `IntegrationConfig` struct in `bin/correctness/panoramic/src/config.rs`: + +```rust +#[derive(Clone, Debug, Deserialize)] +pub struct IntegrationConfig { + pub name: String, + + #[serde(default)] + pub description: Option, + + pub timeout: HumanDuration, + + pub container: ContainerConfig, + + pub assertions: Vec, + + /// Runtimes under which this test runs. + /// + /// Each value must be either `"docker"` (the default) or `"native_macos"`. 
When multiple + /// runtimes are declared, the test discovery layer expands the config into one independent + /// test case per runtime, named `{name}/{runtime}`. + #[serde(default = "default_runtimes")] + pub runtimes: Vec, + + /// Resolved runtime for this specific test instance after discovery-time expansion. + /// + /// At parse time, this is always empty. The discovery layer sets it when expanding a + /// multi-runtime config into per-runtime instances. + #[serde(skip)] + pub resolved_runtime: String, + + #[serde(skip)] + pub base_path: PathBuf, +} + +fn default_runtimes() -> Vec { + vec!["docker".to_string()] +} +``` + +- [ ] **Step 2: Surface the per-instance runtime via the `Test` trait impl** + +In the same file, update the `Test` impl for `IntegrationConfig`: + +```rust +#[async_trait] +impl Test for IntegrationConfig { + fn name(&self) -> String { + if self.resolved_runtime.is_empty() || self.runtimes.len() <= 1 { + self.name.clone() + } else { + format!("{}/{}", self.name, self.resolved_runtime) + } + } + + fn suite(&self) -> TestSuite { + TestSuite::Integration + } + + fn description(&self) -> Option { + self.description.clone() + } + + fn timeout(&self) -> Duration { + self.timeout.0 + } + + fn images(&self) -> BTreeMap<&str, String> { + let mut m = BTreeMap::new(); + // The native_macos runtime doesn't require any container image. + if self.resolved_runtime != "native_macos" { + m.insert("container", self.container.image.clone()); + } + m + } + + fn runtime(&self) -> String { + if self.resolved_runtime.is_empty() { + "docker".to_string() + } else { + self.resolved_runtime.clone() + } + } + + async fn run(&self, tctx: TestContext) -> TestResult { + match self.resolved_runtime.as_str() { + "native_macos" => { + let mut runner = crate::native_runner::NativeIntegrationRunner::new(self.clone(), tctx); + runner.run().await + } + // Default to the existing Docker path for "docker" or unset. 
+ _ => { + let mut runner = crate::runner::IntegrationRunner::new(self.clone(), tctx); + runner.run().await + } + } + } +} +``` + +- [ ] **Step 3: Expand multi-runtime configs at discovery** + +Modify `try_load_test` in `bin/correctness/panoramic/src/test.rs` for the `"integration"` arm: + +```rust +"integration" => { + let config = IntegrationConfig::from_yaml(config_path)?; + if config.runtimes.is_empty() { + return Err(generic_error!("integration test '{}' has empty runtimes list", config.name)); + } + let mut tests: Vec> = Vec::new(); + for runtime in &config.runtimes { + if runtime != "docker" && runtime != "native_macos" { + return Err(generic_error!( + "integration test '{}' declares unknown runtime '{}' (expected 'docker' or 'native_macos')", + config.name, + runtime + )); + } + let mut variant = config.clone(); + variant.resolved_runtime = runtime.clone(); + tests.push(Box::new(variant)); + } + Ok(tests) +} +``` + +- [ ] **Step 4: Verify compilation (panoramic will fail until Task 4 lands)** + +Run: `cd bin/correctness && cargo check -p panoramic 2>&1 | tail -20` +Expected: FAIL on missing `crate::native_runner` module — that's the next task. + +- [ ] **Step 5: Commit** + +```bash +git add bin/correctness/panoramic/src/config.rs bin/correctness/panoramic/src/test.rs +git commit -m "feat(panoramic): add runtimes field to integration test config" +``` + +--- + +## Task 4: Add the `NativeIntegrationRunner` + +**Files:** +- Create: `bin/correctness/panoramic/src/native_runner.rs` +- Modify: `bin/correctness/panoramic/src/main.rs` (declare the module) + +- [ ] **Step 1: Create the runner module** + +Create `bin/correctness/panoramic/src/native_runner.rs`: + +```rust +//! Native-process integration test runner. +//! +//! This runner is the parallel of [`crate::runner::IntegrationRunner`] but for tests declared +//! with `runtime: native_macos`. Instead of building a Docker container, it spawns a binary +//! 
directly via [`airlock::native::NativeProcess`] and feeds its stdout/stderr into the same +//! [`LogBuffer`][crate::assertions::LogBuffer] used by the Docker path so the assertions work +//! unchanged. +//! +//! Scope (initial): only ADP-standalone tests. The binary is `agent-data-plane`, located via +//! the `ADP_BINARY_PATH` env var (falling back to `target/release/agent-data-plane`). + +use std::{ + collections::HashMap, + path::PathBuf, + sync::Arc, + time::{Duration, Instant}, +}; + +use airlock::native::{NativeProcess, NativeProcessConfig}; +use saluki_error::{generic_error, ErrorContext as _, GenericError}; +use tokio::sync::{Mutex, RwLock}; +use tokio_util::sync::CancellationToken; +use tracing::{debug, error, info, warn}; + +use crate::{ + assertions::{create_assertion, AssertionContext, AssertionResult, LogBuffer}, + config::{AssertionStep, IntegrationConfig}, + reporter::{PhaseTiming, TestResult}, + test::TestContext, +}; + +const ADP_BINARY_ENV_VAR: &str = "ADP_BINARY_PATH"; +const DEFAULT_ADP_BINARY_PATH: &str = "target/release/agent-data-plane"; + +/// Runner for a single native-process integration test case. +pub(crate) struct NativeIntegrationRunner { + test_case: IntegrationConfig, + tctx: TestContext, + log_buffer: Arc>, +} + +impl NativeIntegrationRunner { + /// Creates a new runner for the given test case. + pub(crate) fn new(test_case: IntegrationConfig, tctx: TestContext) -> Self { + Self { + test_case, + tctx, + log_buffer: Arc::new(RwLock::new(LogBuffer::default())), + } + } + + /// Runs the test case and returns the result. 
+ pub(crate) async fn run(&mut self) -> TestResult { + let started = Instant::now(); + let test_name = self.test_case.name(); + let mut phase_timings = Vec::new(); + + info!(test = %test_name, "Starting native integration test case."); + + // Phase: resolve binary path + let binary_path = match resolve_adp_binary_path() { + Ok(p) => p, + Err(e) => { + return make_error_result(test_name, started, "resolve_binary", e); + } + }; + debug!(test = %test_name, binary = %binary_path.display(), "Resolved ADP binary path."); + + // Phase: spawn process + let spawn_start = Instant::now(); + let exit_token = CancellationToken::new(); + + // Bridge the LogBuffer behind a Mutex. We have to take ownership of the + // buffer via an Arc> compatible shape; the simplest path is to construct a + // separate sink struct that pushes into the shared LogBuffer. + let sink_buf = self.log_buffer.clone(); + let log_sink: Arc> = + Arc::new(Mutex::new(NativeLogSink { buf: sink_buf })); + + let process_config = NativeProcessConfig::new(self.test_case.name.clone(), binary_path) + .with_args(vec!["run".to_string()]) + .with_env_map(self.test_case.container.env.clone()); + + let process = match NativeProcess::spawn(process_config, log_sink, exit_token.clone()).await { + Ok(p) => p, + Err(e) => { + phase_timings.push(PhaseTiming { + phase: "spawn".to_string(), + duration: spawn_start.elapsed(), + }); + return make_error_result(test_name, started, "spawn", e); + } + }; + phase_timings.push(PhaseTiming { + phase: "spawn".to_string(), + duration: spawn_start.elapsed(), + }); + + info!(test = %test_name, "Native process started."); + + // Phase: run assertions + let assertion_start = Instant::now(); + let assertion_results = self + .run_assertions(process.name().to_string(), exit_token.clone()) + .await; + phase_timings.push(PhaseTiming { + phase: "assertions".to_string(), + duration: assertion_start.elapsed(), + }); + + // Phase: cleanup + let cleanup_start = Instant::now(); + process.cleanup().await; 
+ phase_timings.push(PhaseTiming { + phase: "cleanup".to_string(), + duration: cleanup_start.elapsed(), + }); + + let passed = assertion_results.iter().all(|r| r.passed); + TestResult { + name: test_name, + passed, + duration: started.elapsed(), + assertion_results: assertion_results.clone(), + error: None, + phase_timings, + assertion_details: assertion_results, + } + } + + async fn run_assertions( + &self, + process_display_name: String, + exit_token: CancellationToken, + ) -> Vec { + let mut results = Vec::new(); + let cancel_token = self.tctx.test_cancel_token(); + + for step in &self.test_case.assertions { + match step { + AssertionStep::Single(cfg) => { + let assertion = create_assertion(cfg.clone(), &self.test_case); + let ctx = AssertionContext { + log_buffer: self.log_buffer.clone(), + container_exit_token: exit_token.clone(), + cancel_token: cancel_token.clone(), + container_name: process_display_name.clone(), + port_mappings: HashMap::new(), + }; + results.push(assertion.check(&ctx).await); + } + AssertionStep::Parallel { parallel } => { + let futures: Vec<_> = parallel + .iter() + .map(|cfg| { + let assertion = create_assertion(cfg.clone(), &self.test_case); + let ctx = AssertionContext { + log_buffer: self.log_buffer.clone(), + container_exit_token: exit_token.clone(), + cancel_token: cancel_token.clone(), + container_name: process_display_name.clone(), + port_mappings: HashMap::new(), + }; + async move { assertion.check(&ctx).await } + }) + .collect(); + let parallel_results = futures::future::join_all(futures).await; + results.extend(parallel_results); + } + } + } + + results + } +} + +fn resolve_adp_binary_path() -> Result { + let explicit = std::env::var(ADP_BINARY_ENV_VAR).ok(); + let path = match explicit { + Some(p) => PathBuf::from(p), + None => PathBuf::from(DEFAULT_ADP_BINARY_PATH), + }; + + let canonical = path.canonicalize().with_error_context(|| { + format!( + "ADP binary not found at '{}'. 
Set {} or build via `cargo build --release --bin agent-data-plane`.", + path.display(), + ADP_BINARY_ENV_VAR + ) + })?; + Ok(canonical) +} + +fn make_error_result(name: String, started: Instant, phase: &str, e: GenericError) -> TestResult { + error!(test = %name, error = %e, phase, "Native integration test setup failed."); + TestResult { + name, + passed: false, + duration: started.elapsed(), + assertion_results: vec![], + error: Some(format!("Failed in phase '{}': {}", phase, e)), + phase_timings: vec![], + assertion_details: vec![], + } +} + +/// Bridge from `airlock::native::LogSink` to the panoramic `LogBuffer`. +struct NativeLogSink { + buf: Arc>, +} + +impl airlock::native::LogSink for NativeLogSink { + fn push_line(&mut self, line: String, is_stderr: bool) { + // Try a non-blocking write first. If the lock is contended, hand the line to a + // spawned task (see below) instead of blocking — assertions hold the read lock only + // briefly, so contention should be rare. + if let Ok(mut buf) = self.buf.try_write() { + buf.push_line(line, is_stderr); + } else { + // Fall back: spawn a task to do the write so we don't block this caller. We're + // already inside a tokio task here (the log pump), so blocking would stall it. + // NOTE(review): a line written through this fallback may be appended after lines + // pushed later, so line ordering is not guaranteed under contention. + let buf = self.buf.clone(); + tokio::spawn(async move { + buf.write().await.push_line(line, is_stderr); + }); + } + } +} +``` + +- [ ] **Step 2: Declare the module in `main.rs`** + +Modify `bin/correctness/panoramic/src/main.rs`: + +Find the existing `mod runner;` line and add `mod native_runner;` after it. + +- [ ] **Step 3: Verify compilation** + +Run: `cd bin/correctness && cargo check -p panoramic 2>&1 | tail -30` +Expected: clean compile. + +If compile errors mention `AssertionContext` field names, the field names need to match the existing struct definition — check `bin/correctness/panoramic/src/assertions/mod.rs` for the exact shape and adjust. 
+ +- [ ] **Step 4: Commit** + +```bash +git add bin/correctness/panoramic/src/native_runner.rs bin/correctness/panoramic/src/main.rs +git commit -m "feat(panoramic): add NativeIntegrationRunner for native_macos runtime" +``` + +--- + +## Task 5: Wire up `basic-startup` for the new runtime + +**Files:** +- Modify: `test/integration/cases/basic-startup/config.yaml` + +- [ ] **Step 1: Add the runtime opt-in** + +Modify `test/integration/cases/basic-startup/config.yaml` so the top-level keys read: + +```yaml +type: integration +name: "basic-startup" +description: "Verifies ADP starts successfully and remains stable" +timeout: 90s +runtimes: [docker, native_macos] + +container: + image: "saluki-images/datadog-agent:testing-devel" + env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test" + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_STANDALONE_MODE: "true" + +assertions: + - type: log_contains + pattern: "Agent Data Plane starting" + timeout: 5s + - parallel: + - type: process_stable_for + duration: 10s + - type: log_not_contains + pattern: "panic|PANIC" + regex: true + during: 10s +``` + +- [ ] **Step 2: Discovery-only sanity check** + +Run: `cargo run --release --bin panoramic -- list -d test/integration/cases` +Expected output includes: +``` +basic-startup/docker +basic-startup/native_macos +``` +…and all other tests show up exactly once with their original names. + +- [ ] **Step 3: Commit** + +```bash +git add test/integration/cases/basic-startup/config.yaml +git commit -m "test(integration): enable basic-startup on native_macos runtime" +``` + +--- + +## Task 6: Add the `test-integration-macos` make target + +**Files:** +- Modify: `Makefile` + +- [ ] **Step 1: Inspect existing integration test targets** + +Run: `grep -n "test-integration\|test-integration-quick\|build-panoramic" Makefile` +Note the existing pattern. 
+ +- [ ] **Step 2: Add the new targets** + +Append to `Makefile`, near the existing `test-integration` rule: + +```makefile +.PHONY: build-adp-macos +build-adp-macos: ## Builds the ADP binary natively for macOS (release profile) + @echo "[*] Building agent-data-plane (release, native macOS target)..." + @cargo build --release --bin agent-data-plane + +.PHONY: test-integration-macos +test-integration-macos: build-panoramic build-adp-macos +test-integration-macos: ## Runs macOS native integration tests (no Docker) + @echo "[*] Running macOS native integration tests..." + @ADP_BINARY_PATH=$(shell pwd)/target/release/agent-data-plane \ + target/release/panoramic run -d $(shell pwd)/test/integration/cases \ + -t basic-startup/native_macos --no-tui \ + $(if $(PANORAMIC_LOG_DIR),-l $(PANORAMIC_LOG_DIR)) +``` + +- [ ] **Step 3: Run the new target end-to-end** + +Run: `make test-integration-macos` + +Expected output: +- Panoramic launches one test (`basic-startup/native_macos`). +- The `log_contains` assertion for `"Agent Data Plane starting"` passes within 5s. +- The `process_stable_for` and `log_not_contains` assertions complete after ~10s. +- Test result: PASS. + +If the test fails, check: +- The `agent-data-plane` binary exists at `target/release/agent-data-plane`. +- No other ADP process is already bound to default ports (`lsof -i :8125 -i :8135`). +- The log buffer is actually receiving lines (look at `PANORAMIC_LOG_DIR` output if set). + +- [ ] **Step 4: Commit** + +```bash +git add Makefile +git commit -m "build: add test-integration-macos make target" +``` + +--- + +## Task 7: Verify the Docker path still works for the same test + +This is the regression check that ensures we didn't break anything on Linux. 
+ +- [ ] **Step 1: Confirm the docker variant still shows up in discovery** + +Already covered by Task 5 Step 2, but re-confirm: + +Run: `cargo run --release --bin panoramic -- list -d test/integration/cases | grep basic-startup` +Expected: +``` +basic-startup/docker +basic-startup/native_macos +``` + +- [ ] **Step 2: Run the docker variant locally if Docker is available** + +Skip if Docker isn't available locally. Otherwise: + +Run: `target/release/panoramic run -d test/integration/cases -t basic-startup/docker --no-tui` +Expected: existing Docker path runs unchanged and passes. + +- [ ] **Step 3: Run unit tests for affected crates** + +Run: `cargo test -p airlock -p panoramic 2>&1 | tail -20` +Expected: all tests pass, no regressions. + +- [ ] **Step 4: Run formatter and clippy** + +Run: `make fmt && make check-clippy 2>&1 | tail -30` +Expected: clean. + +--- + +## Self-review checklist + +- **Spec coverage:** Single test on macOS running natively via panoramic — Task 5 + Task 6. Docker path preserved — Task 7. CI and Tart wrapper are explicitly deferred and not in spec for this PR. +- **No placeholders:** All code blocks are concrete, with one known exception: the `exit_task` spawned in Task 1's `NativeProcess::spawn` is a placeholder, and the exit token is only cancelled from `cleanup` rather than when the child actually exits. +- **Type consistency:** `NativeProcessConfig` defined in Task 1 is used in Task 4; field names match. `LogSink` defined in Task 1 is implemented in Task 2 (for `LogBuffer`) and in Task 4 (for `NativeLogSink`). `AssertionContext` field names are expected to match the existing struct shape (to be verified in Task 4 Step 3 — adjust if the compile fails). +- **One risk to flag in execution:** the `AssertionContext` struct definition lives in `assertions/mod.rs` and may have a different field shape than what's written in Task 4's code. The first thing to verify when implementing Task 4 is the exact `AssertionContext` definition; adapt the `run_assertions` call sites in `native_runner.rs` to match. + +--- + +## What this PR explicitly does NOT do + +- No CI job — that requires building ADP as an artifact and a new `.gitlab/test.yml` entry; deferred. 
+- No Tart wrapper script — deferred to a follow-up so non-macOS developers can run macOS tests locally. +- No conversion of the other 16 standalone integration tests — only `basic-startup` is wired up. +- No converged (Agent + ADP) tests — they need Agent install plumbing and IPC, which is its own scope. +- No refactor of the existing Docker `IntegrationRunner`. The two runners remain parallel for now; merging via a shared trait is a follow-up. From a64af8fea3a1d2119087f34f3455bb4831a3a176 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Thu, 21 May 2026 16:44:55 -0400 Subject: [PATCH 06/56] fix(panoramic): native runner provisions per-test datadog.yaml ADP's bootstrap loader requires the configuration file to exist, defaulting to /opt/datadog-agent/etc/datadog.yaml on macOS. On a clean macOS host without an installed Datadog Agent (which a CI runner would be), ADP fails immediately with 'No such file or directory'. Fix by having NativeIntegrationRunner create a per-test temp directory with an empty datadog.yaml and passing it to ADP via -c. Tests communicate config through env vars, so the file itself is intentionally empty. Also splits the Makefile target so 'test-integration-macos-run' can be invoked against pre-built binaries (useful for CI build-once-run-many) while 'test-integration-macos' remains the build+run convenience wrapper for local use. Quotes $(CURDIR) so the run target survives paths with spaces. --- Makefile | 12 +++--- .../panoramic/src/native_runner.rs | 37 ++++++++++++++++++- 2 files changed, 43 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index c05672de7b2..7ef93f6596d 100644 --- a/Makefile +++ b/Makefile @@ -575,15 +575,17 @@ build-adp-native: ## Builds the agent-data-plane binary natively for the current @echo "[*] Building agent-data-plane (release, native host target)..." 
@cargo build --release --bin agent-data-plane -.PHONY: test-integration-macos -test-integration-macos: build-panoramic build-adp-native -test-integration-macos: ## Runs ADP integration tests natively on macOS (no Docker) +.PHONY: test-integration-macos-run +test-integration-macos-run: ## Runs native macOS integration tests using already-built binaries (assumes target/release/{panoramic,agent-data-plane} exist) @echo "[*] Running native macOS integration tests..." - @ADP_BINARY_PATH=$(shell pwd)/target/release/agent-data-plane \ - target/release/panoramic run -d $(shell pwd)/test/integration/cases \ + @ADP_BINARY_PATH="$(CURDIR)/target/release/agent-data-plane" \ + target/release/panoramic run -d "$(CURDIR)/test/integration/cases" \ -t $(if $(CASE),$(CASE),basic-startup/native_macos) --no-tui \ $(if $(PANORAMIC_LOG_DIR),-l $(PANORAMIC_LOG_DIR)) +.PHONY: test-integration-macos +test-integration-macos: build-panoramic build-adp-native test-integration-macos-run ## Builds and runs ADP integration tests natively on macOS (no Docker) + .PHONY: ensure-rust-miri ensure-rust-miri: ifeq ($(shell command -v rustup >/dev/null || echo not-found), not-found) diff --git a/bin/correctness/panoramic/src/native_runner.rs b/bin/correctness/panoramic/src/native_runner.rs index 75a1bf350e6..8c77657e1ea 100644 --- a/bin/correctness/panoramic/src/native_runner.rs +++ b/bin/correctness/panoramic/src/native_runner.rs @@ -20,6 +20,7 @@ use std::{ }; use airlock::native::{LogSink, NativeProcess, NativeProcessConfig}; +use rand::distr::SampleString as _; use saluki_error::{ErrorContext as _, GenericError}; use tokio::sync::{Mutex, RwLock}; use tokio_util::sync::CancellationToken; @@ -67,6 +68,29 @@ impl NativeIntegrationRunner { }; debug!(test = %test_name, binary = %binary_path.display(), "Resolved ADP binary path."); + // Create a per-test state directory and seed it with an empty datadog.yaml. 
ADP's + // bootstrap loader requires the file to exist; tests communicate config through env + // vars, so the file itself is intentionally empty. + let state_dir = match create_test_state_dir() { + Ok(d) => d, + Err(e) => return make_error_result(test_name, started, "prepare_state_dir", e, phase_timings), + }; + let config_path = state_dir.join("datadog.yaml"); + if let Err(e) = std::fs::write(&config_path, b"") { + return make_error_result( + test_name, + started, + "prepare_state_dir", + saluki_error::generic_error!( + "Failed to write empty datadog.yaml at '{}': {}", + config_path.display(), + e + ), + phase_timings, + ); + } + debug!(test = %test_name, state_dir = %state_dir.display(), "Prepared per-test state directory."); + // Phase: spawn the process. let spawn_start = Instant::now(); let exit_token = CancellationToken::new(); @@ -74,8 +98,9 @@ impl NativeIntegrationRunner { buf: self.log_buffer.clone(), })); + let config_path_str = config_path.to_string_lossy().into_owned(); let process_config = NativeProcessConfig::new(self.test_case.name.clone(), binary_path) - .with_args(vec!["run".to_string()]) + .with_args(vec!["-c".to_string(), config_path_str, "run".to_string()]) .with_env_map(self.test_case.container.env.clone()); let process = match NativeProcess::spawn(process_config, log_sink, exit_token.clone()).await { @@ -204,6 +229,16 @@ fn resolve_adp_binary_path() -> Result { }) } +fn create_test_state_dir() -> Result { + let suffix = rand::distr::Alphanumeric + .sample_string(&mut rand::rng(), 8) + .to_lowercase(); + let dir = std::env::temp_dir().join(format!("panoramic-native-{}", suffix)); + std::fs::create_dir_all(&dir) + .with_error_context(|| format!("Failed to create state directory '{}'.", dir.display()))?; + Ok(dir) +} + fn make_error_result( name: String, started: Instant, phase: &str, e: GenericError, phase_timings: Vec, ) -> TestResult { From 8069d7999ef80bd51cb1b9b785141500a48ebba2 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Tue, 26 
May 2026 11:47:55 -0400 Subject: [PATCH 07/56] feat(panoramic): add --runtime filter to the run subcommand Adds a new --runtime option to 'panoramic run' that restricts the test run to tests whose Test::runtime() matches the given value. Composes with the existing -t name filter (AND semantics): a test must match BOTH filters to be selected when both are set. Updates the Makefile's test-integration-macos-run target to default to '--runtime native_macos -p 1' (run all native_macos tests serially) instead of hardcoding basic-startup. Tests can still be selected individually with CASE=<test-name>/native_macos. --- Makefile | 4 ++-- bin/correctness/panoramic/src/cli.rs | 5 +++++ bin/correctness/panoramic/src/main.rs | 19 ++++++++++++++++--- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 7ef93f6596d..56c2385f4f5 100644 --- a/Makefile +++ b/Makefile @@ -576,11 +576,11 @@ build-adp-native: ## Builds the agent-data-plane binary natively for the current @cargo build --release --bin agent-data-plane .PHONY: test-integration-macos-run -test-integration-macos-run: ## Runs native macOS integration tests using already-built binaries (assumes target/release/{panoramic,agent-data-plane} exist) +test-integration-macos-run: ## Runs native macOS integration tests using already-built binaries (assumes target/release/{panoramic,agent-data-plane} exist). Defaults to all native_macos tests; override with CASE=<test-name>/native_macos. @echo "[*] Running native macOS integration tests..." 
@ADP_BINARY_PATH="$(CURDIR)/target/release/agent-data-plane" \ target/release/panoramic run -d "$(CURDIR)/test/integration/cases" \ - -t $(if $(CASE),$(CASE),basic-startup/native_macos) --no-tui \ + $(if $(CASE),-t $(CASE),--runtime native_macos) --no-tui -p 1 \ $(if $(PANORAMIC_LOG_DIR),-l $(PANORAMIC_LOG_DIR)) .PHONY: test-integration-macos diff --git a/bin/correctness/panoramic/src/cli.rs b/bin/correctness/panoramic/src/cli.rs index 6ffe6c0c22b..2c8cd270bad 100644 --- a/bin/correctness/panoramic/src/cli.rs +++ b/bin/correctness/panoramic/src/cli.rs @@ -30,6 +30,11 @@ pub struct RunCommand { #[argh(option, short = 't')] pub tests: Option, + /// run only tests with the given runtime (e.g., `docker`, `native_macos`, `kubernetes_in_docker`). + /// Can be combined with `-t` to further restrict by name. + #[argh(option)] + pub runtime: Option, + /// number of tests to run in parallel #[argh(option, short = 'p', default = "4")] pub parallelism: usize, diff --git a/bin/correctness/panoramic/src/main.rs b/bin/correctness/panoramic/src/main.rs index 1d926616ef9..30c65c7df83 100644 --- a/bin/correctness/panoramic/src/main.rs +++ b/bin/correctness/panoramic/src/main.rs @@ -188,9 +188,22 @@ async fn run_tests(cmd: cli::RunCommand, use_tui: bool) -> ExitCode { .with_fail_fast(cmd.fail_fast) .with_event_sender(tx); - if let Some(ref filter_str) = cmd.tests { - let names: Vec = filter_str.split(',').map(|s| s.trim().to_string()).collect(); - args = args.with_filter(Box::new(move |t: &dyn test::Test| names.iter().any(|n| *n == t.name()))); + // Combine the optional --runtime filter and the optional -t name filter into a single + // predicate. A test passes if it matches BOTH constraints (i.e., AND semantics). When neither + // is set, no filter is installed and every discovered test runs. 
+ let name_filter: Option> = cmd + .tests + .as_ref() + .map(|s| s.split(',').map(|n| n.trim().to_string()).collect()); + let runtime_filter: Option = cmd.runtime.clone(); + if name_filter.is_some() || runtime_filter.is_some() { + args = args.with_filter(Box::new(move |t: &dyn test::Test| { + let name_ok = name_filter + .as_ref() + .is_none_or(|names| names.iter().any(|n| *n == t.name())); + let runtime_ok = runtime_filter.as_ref().is_none_or(|r| t.runtime() == *r); + name_ok && runtime_ok + })); } // Spawn the test runner task (same code path for both modes). From 4080ede1b844f3dd58142aed1313c6af27336184 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Tue, 26 May 2026 11:56:33 -0400 Subject: [PATCH 08/56] feat(panoramic): populate identity port_mappings in native runner + enable 13 more standalone integration tests on native_macos Adds 'runtimes: [docker, native_macos]' to 13 of the standalone integration tests. Combined with basic-startup (enabled earlier on this branch), that brings the native_macos coverage to 14 of 17 standalone tests. Also populates the AssertionContext's port_mappings in the native runner using the test config's 'exposed_ports' as identity mappings (host port == 'container' port, since native has no port remapping). The Docker path uses this map to translate container ports to Docker-allocated host ports; on native we just need every probed port to appear in the map for the existing port_listening assertion to work unchanged. Tests enabled in this commit: - adp-memory-mode-disabled - adp-memory-mode-permissive-exceeds-limit - adp-memory-mode-permissive-within-limit - adp-memory-mode-strict-within-limit - adp-no-pipelines-exit - dogstatsd-autoscale-udp - dogstatsd-default-bind - dogstatsd-enabled - dogstatsd-non-local-overrides-bind-host - otlp-traces-enabled - privileged-api-endpoints - telemetry-endpoint - unprivileged-api-endpoints Verified all 14 pass via 'make test-integration-macos' (134s total, serial). 
Tests intentionally NOT enabled (each needs adaptation): - adp-memory-mode-strict-exceeds-limit (asserts on s6 supervisor log) - dogstatsd-bind-host (uses 'hostname -i' which works differently on macOS) - dogstatsd-bind-custom-hostname (writes to /etc/hosts via PANORAMIC_DYNAMIC) --- .../panoramic/src/native_runner.rs | 22 ++++++++++++++++--- .../adp-memory-mode-disabled/config.yaml | 1 + .../config.yaml | 1 + .../config.yaml | 1 + .../config.yaml | 1 + .../cases/adp-no-pipelines-exit/config.yaml | 1 + .../cases/dogstatsd-autoscale-udp/config.yaml | 1 + .../cases/dogstatsd-default-bind/config.yaml | 1 + .../cases/dogstatsd-enabled/config.yaml | 1 + .../config.yaml | 1 + .../cases/otlp-traces-enabled/config.yaml | 1 + .../privileged-api-endpoints/config.yaml | 1 + .../cases/telemetry-endpoint/config.yaml | 1 + .../unprivileged-api-endpoints/config.yaml | 1 + 14 files changed, 32 insertions(+), 3 deletions(-) diff --git a/bin/correctness/panoramic/src/native_runner.rs b/bin/correctness/panoramic/src/native_runner.rs index 8c77657e1ea..74e830da05f 100644 --- a/bin/correctness/panoramic/src/native_runner.rs +++ b/bin/correctness/panoramic/src/native_runner.rs @@ -28,7 +28,7 @@ use tracing::{debug, error, info}; use crate::{ assertions::{create_assertion, AssertionContext, AssertionResult, LogBuffer}, - config::{AssertionStep, IntegrationConfig}, + config::{parse_port_spec, AssertionStep, IntegrationConfig}, reporter::{PhaseTiming, TestResult}, test::{Test, TestContext}, }; @@ -150,11 +150,27 @@ impl NativeIntegrationRunner { } } + /// Builds the port mappings for assertions. In the Docker runner this maps container ports + /// to host ports allocated by Docker. On native there is no remapping: a port declared in + /// `exposed_ports` is reachable on the host at the same number. We populate identity entries + /// so the existing `port_listening` assertion (which expects every probed port to appear in + /// the mapping) works unchanged. 
+ fn build_port_mappings(&self) -> HashMap { + let mut mappings = HashMap::new(); + for spec in &self.test_case.container.exposed_ports { + if let Ok((port, protocol)) = parse_port_spec(spec) { + mappings.insert(format!("{}/{}", port, protocol), port); + } + } + mappings + } + async fn run_assertions( &self, process_display_name: String, exit_token: CancellationToken, ) -> Vec { let mut results = Vec::new(); let cancel_token = self.tctx.test_cancel_token(); + let port_mappings = self.build_port_mappings(); for step in &self.test_case.assertions { match step { @@ -176,7 +192,7 @@ impl NativeIntegrationRunner { container_exit_token: exit_token.clone(), cancel_token: cancel_token.clone(), container_name: process_display_name.clone(), - port_mappings: HashMap::new(), + port_mappings: port_mappings.clone(), }; results.push(assertion.check(&ctx).await); } @@ -190,7 +206,7 @@ impl NativeIntegrationRunner { container_exit_token: exit_token.clone(), cancel_token: cancel_token.clone(), container_name: process_display_name.clone(), - port_mappings: HashMap::new(), + port_mappings: port_mappings.clone(), }; futures.push(async move { a.check(&ctx).await }); } diff --git a/test/integration/cases/adp-memory-mode-disabled/config.yaml b/test/integration/cases/adp-memory-mode-disabled/config.yaml index bacc3c82186..187ca95a3d3 100644 --- a/test/integration/cases/adp-memory-mode-disabled/config.yaml +++ b/test/integration/cases/adp-memory-mode-disabled/config.yaml @@ -2,6 +2,7 @@ type: integration name: "adp-memory-mode-disabled" description: "Verifies that memory limiting is disabled by default and bounds verification is skipped" timeout: 120s +runtimes: [docker, native_macos] container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/adp-memory-mode-permissive-exceeds-limit/config.yaml b/test/integration/cases/adp-memory-mode-permissive-exceeds-limit/config.yaml index 46a4a2c6c6d..0682c8a4e57 100644 --- 
a/test/integration/cases/adp-memory-mode-permissive-exceeds-limit/config.yaml +++ b/test/integration/cases/adp-memory-mode-permissive-exceeds-limit/config.yaml @@ -2,6 +2,7 @@ type: integration name: "adp-memory-mode-permissive-exceeds-limit" description: "Verifies that permissive mode emits a best-effort warning when the calculated bounds exceed the configured limit, but the process still starts" timeout: 120s +runtimes: [docker, native_macos] container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/adp-memory-mode-permissive-within-limit/config.yaml b/test/integration/cases/adp-memory-mode-permissive-within-limit/config.yaml index 9c20ae03486..9ab1d0dfd85 100644 --- a/test/integration/cases/adp-memory-mode-permissive-within-limit/config.yaml +++ b/test/integration/cases/adp-memory-mode-permissive-within-limit/config.yaml @@ -2,6 +2,7 @@ type: integration name: "adp-memory-mode-permissive-within-limit" description: "Verifies that permissive mode succeeds and verifies bounds when the calculated bounds fit within the configured limit" timeout: 120s +runtimes: [docker, native_macos] container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/adp-memory-mode-strict-within-limit/config.yaml b/test/integration/cases/adp-memory-mode-strict-within-limit/config.yaml index 9ea964109fe..f40d2ac6e3f 100644 --- a/test/integration/cases/adp-memory-mode-strict-within-limit/config.yaml +++ b/test/integration/cases/adp-memory-mode-strict-within-limit/config.yaml @@ -2,6 +2,7 @@ type: integration name: "adp-memory-mode-strict-within-limit" description: "Verifies that strict mode succeeds and verifies bounds when the calculated bounds fit within the configured limit" timeout: 120s +runtimes: [docker, native_macos] container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/adp-no-pipelines-exit/config.yaml b/test/integration/cases/adp-no-pipelines-exit/config.yaml 
index ee9d312b83b..d876d5680f8 100644 --- a/test/integration/cases/adp-no-pipelines-exit/config.yaml +++ b/test/integration/cases/adp-no-pipelines-exit/config.yaml @@ -2,6 +2,7 @@ type: integration name: "adp-no-pipelines-exit" description: "Verify ADP exits with error when no data pipelines enabled" timeout: 120s +runtimes: [docker, native_macos] container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/dogstatsd-autoscale-udp/config.yaml b/test/integration/cases/dogstatsd-autoscale-udp/config.yaml index ae4aa348ef6..915f50aa6fb 100644 --- a/test/integration/cases/dogstatsd-autoscale-udp/config.yaml +++ b/test/integration/cases/dogstatsd-autoscale-udp/config.yaml @@ -2,6 +2,7 @@ type: integration name: "dogstatsd-autoscale-udp" description: "Verifies DogStatsD UDP listener autoscaling (SO_REUSEPORT) starts cleanly on Linux" timeout: 120s +runtimes: [docker, native_macos] container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/dogstatsd-default-bind/config.yaml b/test/integration/cases/dogstatsd-default-bind/config.yaml index c67013c4590..03a4151066f 100644 --- a/test/integration/cases/dogstatsd-default-bind/config.yaml +++ b/test/integration/cases/dogstatsd-default-bind/config.yaml @@ -12,6 +12,7 @@ type: integration name: "dogstatsd-default-bind" description: "Verifies DogStatsD binds to 127.0.0.1 by default when bind_host is not configured" timeout: 120s +runtimes: [docker, native_macos] container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/dogstatsd-enabled/config.yaml b/test/integration/cases/dogstatsd-enabled/config.yaml index 2ddb06ed3fe..6feecc21088 100644 --- a/test/integration/cases/dogstatsd-enabled/config.yaml +++ b/test/integration/cases/dogstatsd-enabled/config.yaml @@ -2,6 +2,7 @@ type: integration name: "dogstatsd-enabled" description: "Verifies DogStatsD pipeline starts and listens on UDP port" timeout: 120s +runtimes: 
[docker, native_macos] container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/dogstatsd-non-local-overrides-bind-host/config.yaml b/test/integration/cases/dogstatsd-non-local-overrides-bind-host/config.yaml index 45fe2f72a1a..a3b6286e8ea 100644 --- a/test/integration/cases/dogstatsd-non-local-overrides-bind-host/config.yaml +++ b/test/integration/cases/dogstatsd-non-local-overrides-bind-host/config.yaml @@ -14,6 +14,7 @@ type: integration name: "dogstatsd-non-local-overrides-bind-host" description: "Verifies dogstatsd_non_local_traffic takes precedence over bind_host" timeout: 120s +runtimes: [docker, native_macos] container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/otlp-traces-enabled/config.yaml b/test/integration/cases/otlp-traces-enabled/config.yaml index 3e2d271adc9..f3cff643ae0 100644 --- a/test/integration/cases/otlp-traces-enabled/config.yaml +++ b/test/integration/cases/otlp-traces-enabled/config.yaml @@ -2,6 +2,7 @@ type: integration name: "otlp-traces-enabled" description: "Verifies OTLP pipeline starts with native trace handling and proxying for metrics/logs" timeout: 120s +runtimes: [docker, native_macos] container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/privileged-api-endpoints/config.yaml b/test/integration/cases/privileged-api-endpoints/config.yaml index c721dac799a..d8e1e4ac80b 100644 --- a/test/integration/cases/privileged-api-endpoints/config.yaml +++ b/test/integration/cases/privileged-api-endpoints/config.yaml @@ -2,6 +2,7 @@ type: integration name: "privileged-api-endpoints" description: "Verifies the logging and metrics override routes are exposed on the privileged API after the workers assert them dynamically." 
timeout: 120s +runtimes: [docker, native_macos] container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/telemetry-endpoint/config.yaml b/test/integration/cases/telemetry-endpoint/config.yaml index 869c44a5cfa..ac13b2c420c 100644 --- a/test/integration/cases/telemetry-endpoint/config.yaml +++ b/test/integration/cases/telemetry-endpoint/config.yaml @@ -2,6 +2,7 @@ type: integration name: "telemetry-endpoint" description: "Verifies the internal telemetry routes are exposed on the unprivileged API endpoint" timeout: 120s +runtimes: [docker, native_macos] container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/unprivileged-api-endpoints/config.yaml b/test/integration/cases/unprivileged-api-endpoints/config.yaml index c33fe8ecace..3889b408349 100644 --- a/test/integration/cases/unprivileged-api-endpoints/config.yaml +++ b/test/integration/cases/unprivileged-api-endpoints/config.yaml @@ -2,6 +2,7 @@ type: integration name: "unprivileged-api-endpoints" description: "Verifies the /ready, /live, and /memory/status endpoints are accessible on the unprivileged API" timeout: 120s +runtimes: [docker, native_macos] container: image: "saluki-images/datadog-agent:testing-devel" From 2766c2d8a84b3e39ba869ea7675edd39eab708eb Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Tue, 26 May 2026 13:39:11 -0400 Subject: [PATCH 09/56] feat(panoramic): support converged tests in native_macos runtime Adds the ability for native_macos integration tests to spawn the Datadog Core Agent alongside ADP, sharing a per-test config directory so they authenticate over IPC the same way they would in production. Test configs opt in by setting 'requires_core_agent: true'. When set, the native runner: 1. Resolves the Core Agent binary (CORE_AGENT_BINARY_PATH env var, defaulting to /opt/datadog-agent/bin/agent/agent). 2. 
Spawns the Agent against the per-test config dir, in a new process group so its trace-agent and process-agent child processes can be reaped together on cleanup. 3. Waits up to 60s for the Agent to write 'auth_token' and 'ipc_cert.pem' into the config dir. 4. Spawns ADP with DD_AUTH_TOKEN_FILE_PATH pointing at the per-test auth_token, so ADP's IPC client uses the same per-test credentials and ADP's API server uses the matching cert. 5. On cleanup, SIGTERM then SIGKILL the entire Agent process group (parent + trace-agent + process-agent) to prevent orphans holding ports between tests. The 'requires_core_agent' field is informational on the existing docker runtime, which always runs both processes via s6. --- Cargo.lock | 1 + bin/correctness/airlock/Cargo.toml | 3 + bin/correctness/airlock/src/native.rs | 53 +++++++- bin/correctness/panoramic/src/config.rs | 12 ++ .../panoramic/src/native_runner.rs | 128 +++++++++++++++++- 5 files changed, 191 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 65eaf3e3712..ec91ddbc2ce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -79,6 +79,7 @@ dependencies = [ "bollard", "futures", "home", + "libc", "saluki-error", "tokio", "tokio-util", diff --git a/bin/correctness/airlock/Cargo.toml b/bin/correctness/airlock/Cargo.toml index f6e160e719b..fe97714f592 100644 --- a/bin/correctness/airlock/Cargo.toml +++ b/bin/correctness/airlock/Cargo.toml @@ -13,6 +13,9 @@ bollard = { workspace = true, features = ["http", "pipe"] } futures = { workspace = true } home = { workspace = true } saluki-error = { workspace = true } + +[target.'cfg(unix)'.dependencies] +libc = { workspace = true } tokio = { workspace = true, features = [ "fs", "io-util", diff --git a/bin/correctness/airlock/src/native.rs b/bin/correctness/airlock/src/native.rs index 6f9c924a5ea..108267f31cd 100644 --- a/bin/correctness/airlock/src/native.rs +++ b/bin/correctness/airlock/src/native.rs @@ -33,6 +33,12 @@ pub struct NativeProcessConfig { pub env: HashMap, 
/// Working directory for the process. If `None`, inherits the caller's working directory. pub working_dir: Option, + /// If `true`, the spawned process is placed into a new process group with itself as the + /// group leader, and [`cleanup`][NativeProcess::cleanup] signals the entire group instead of + /// only the immediate child. This is essential when the spawned binary forks helpers that + /// outlive their parent (e.g., the Datadog Core Agent spawns `trace-agent` and + /// `process-agent` which orphan onto launchd if only the parent is killed). + pub use_process_group: bool, } impl NativeProcessConfig { @@ -44,9 +50,19 @@ impl NativeProcessConfig { args: Vec::new(), env: HashMap::new(), working_dir: None, + use_process_group: false, } } + /// Places the spawned process in a new process group with itself as the group leader. + /// + /// Use this for binaries that fork long-lived helper processes that would otherwise orphan + /// when the parent is killed. + pub fn with_process_group(mut self) -> Self { + self.use_process_group = true; + self + } + /// Sets the arguments for the process. pub fn with_args(mut self, args: Vec) -> Self { self.args = args; @@ -85,6 +101,9 @@ pub trait LogSink: Send + Sync { pub struct NativeProcess { name: String, child: Option, + /// PGID to signal on cleanup when the spawned process is a process group leader. `None` + /// when [`NativeProcessConfig::use_process_group`] was `false`. + process_group: Option, exit_token: CancellationToken, log_tasks: Vec>, exit_task: Option>, @@ -112,11 +131,25 @@ impl NativeProcess { if let Some(ref wd) = config.working_dir { cmd.current_dir(wd); } + if config.use_process_group { + // Place the spawned process in a new process group so we can later signal all of + // its descendants together. 
+ #[cfg(unix)] + cmd.process_group(0); + } let mut child = cmd .spawn() .with_error_context(|| format!("Failed to spawn '{}'.", config.binary_path.display()))?; + // When using a process group, capture the PGID. We made the child the group leader + // (process_group(0)), so PGID == child PID. + let process_group = if config.use_process_group { + child.id().map(|pid| pid as i32) + } else { + None + }; + let stdout = child .stdout .take() @@ -142,6 +175,7 @@ impl NativeProcess { Ok(Self { name: config.name, child: Some(child), + process_group, exit_token, log_tasks: vec![stdout_task, stderr_task], exit_task: Some(exit_task), @@ -178,8 +212,25 @@ impl NativeProcess { } } - /// Kills the child, joins background tasks, and cancels the exit token. + /// Kills the child (and its process group, if configured), joins background tasks, and + /// cancels the exit token. pub async fn cleanup(mut self) { + // If we asked for a process group, first send SIGTERM to the entire group. This gives + // descendants (e.g., trace-agent, process-agent spawned by the Datadog Core Agent) a + // chance to shut down cleanly before we hard-kill them. After a brief grace period we + // send SIGKILL to the group to guarantee no orphans remain. + #[cfg(unix)] + if let Some(pgid) = self.process_group { + // SAFETY: killpg with a valid pgid is a safe syscall; we ignore the return value. 
+ unsafe { + libc::killpg(pgid, libc::SIGTERM); + } + tokio::time::sleep(Duration::from_millis(500)).await; + unsafe { + libc::killpg(pgid, libc::SIGKILL); + } + } + if let Some(mut child) = self.child.take() { let _ = child.kill().await; let _ = child.wait().await; diff --git a/bin/correctness/panoramic/src/config.rs b/bin/correctness/panoramic/src/config.rs index 541f08614c9..77cd9f958e1 100644 --- a/bin/correctness/panoramic/src/config.rs +++ b/bin/correctness/panoramic/src/config.rs @@ -122,6 +122,18 @@ pub struct IntegrationConfig { #[serde(default = "default_integration_runtimes")] pub runtimes: Vec, + /// Whether this test requires a Core Agent process to be running alongside ADP. + /// + /// When `true`, the native runtime spawns the Datadog Core Agent as a side process before + /// starting ADP, sharing a per-test config directory so they communicate over IPC the same + /// way they would in production. When `false` (the default), only ADP is spawned (standalone + /// mode). + /// + /// On the `docker` runtime this field is informational — the converged image always runs + /// both processes via s6. + #[serde(default)] + pub requires_core_agent: bool, + /// Resolved runtime for this specific test instance after discovery-time expansion. /// /// At parse time, this is always empty. 
The discovery layer sets it when expanding a diff --git a/bin/correctness/panoramic/src/native_runner.rs b/bin/correctness/panoramic/src/native_runner.rs index 74e830da05f..51c51078206 100644 --- a/bin/correctness/panoramic/src/native_runner.rs +++ b/bin/correctness/panoramic/src/native_runner.rs @@ -36,6 +36,14 @@ use crate::{ const ADP_BINARY_ENV_VAR: &str = "ADP_BINARY_PATH"; const DEFAULT_ADP_BINARY_PATH: &str = "target/release/agent-data-plane"; +const CORE_AGENT_BINARY_ENV_VAR: &str = "CORE_AGENT_BINARY_PATH"; +const DEFAULT_CORE_AGENT_BINARY_PATH: &str = "/opt/datadog-agent/bin/agent/agent"; + +/// How long to wait for the Core Agent to write its `auth_token` and `ipc_cert.pem` before +/// giving up and failing the test. +const CORE_AGENT_IPC_READY_TIMEOUT: Duration = Duration::from_secs(60); +const CORE_AGENT_IPC_READY_POLL: Duration = Duration::from_millis(200); + /// Runner for a single native-process integration test case. pub(crate) struct NativeIntegrationRunner { test_case: IntegrationConfig, @@ -91,21 +99,95 @@ impl NativeIntegrationRunner { } debug!(test = %test_name, state_dir = %state_dir.display(), "Prepared per-test state directory."); - // Phase: spawn the process. - let spawn_start = Instant::now(); let exit_token = CancellationToken::new(); let log_sink: Arc> = Arc::new(Mutex::new(NativeLogSink { buf: self.log_buffer.clone(), })); + // Optional Phase: spawn the Core Agent (converged tests). + // + // Converged tests need both the Core Agent and ADP running side-by-side, sharing a + // config directory so they can authenticate over IPC. We spawn the Agent first against + // the per-test state dir, wait until it has written `auth_token` and `ipc_cert.pem`, + // then spawn ADP with `DD_AUTH_TOKEN_FILE_PATH` pointing at the per-test auth token so + // ADP's IPC client uses the same per-test credentials (and ADP's own API server uses + // the matching cert). 
+ let mut core_agent: Option = None; + if self.test_case.requires_core_agent { + let agent_spawn_start = Instant::now(); + let agent_binary = match resolve_core_agent_binary_path() { + Ok(p) => p, + Err(e) => return make_error_result(test_name, started, "resolve_core_agent", e, phase_timings), + }; + debug!(test = %test_name, binary = %agent_binary.display(), "Resolved Core Agent binary path."); + + let agent_config = NativeProcessConfig::new(format!("{}-core-agent", self.test_case.name), agent_binary) + .with_args(vec![ + "run".to_string(), + "-c".to_string(), + state_dir.to_string_lossy().into_owned(), + ]) + .with_env_map(self.test_case.container.env.clone()) + // The Core Agent forks `trace-agent` and `process-agent` helpers; without a process + // group they orphan onto launchd on cleanup and continue holding ports (e.g., 8126 + // for trace-agent), blocking subsequent tests. + .with_process_group(); + + let agent = match NativeProcess::spawn(agent_config, log_sink.clone(), exit_token.clone()).await { + Ok(p) => p, + Err(e) => { + phase_timings.push(PhaseTiming { + phase: "core_agent_spawn".to_string(), + duration: agent_spawn_start.elapsed(), + }); + return make_error_result(test_name, started, "core_agent_spawn", e, phase_timings); + } + }; + phase_timings.push(PhaseTiming { + phase: "core_agent_spawn".to_string(), + duration: agent_spawn_start.elapsed(), + }); + info!(test = %test_name, "Core Agent process started."); + + let wait_start = Instant::now(); + if let Err(e) = wait_for_agent_ipc_ready(&state_dir, CORE_AGENT_IPC_READY_TIMEOUT).await { + agent.cleanup().await; + phase_timings.push(PhaseTiming { + phase: "core_agent_ipc_ready".to_string(), + duration: wait_start.elapsed(), + }); + return make_error_result(test_name, started, "core_agent_ipc_ready", e, phase_timings); + } + phase_timings.push(PhaseTiming { + phase: "core_agent_ipc_ready".to_string(), + duration: wait_start.elapsed(), + }); + debug!(test = %test_name, "Core Agent IPC credentials 
present."); + core_agent = Some(agent); + } + + // Phase: spawn ADP. + let spawn_start = Instant::now(); let config_path_str = config_path.to_string_lossy().into_owned(); + let mut adp_env = self.test_case.container.env.clone(); + if self.test_case.requires_core_agent { + // Point ADP's IPC client at the per-test auth token (and by derivation, the + // per-test ipc_cert.pem in the same directory). + adp_env.insert( + "DD_AUTH_TOKEN_FILE_PATH".to_string(), + state_dir.join("auth_token").to_string_lossy().into_owned(), + ); + } let process_config = NativeProcessConfig::new(self.test_case.name.clone(), binary_path) .with_args(vec!["-c".to_string(), config_path_str, "run".to_string()]) - .with_env_map(self.test_case.container.env.clone()); + .with_env_map(adp_env); let process = match NativeProcess::spawn(process_config, log_sink, exit_token.clone()).await { Ok(p) => p, Err(e) => { + if let Some(agent) = core_agent.take() { + agent.cleanup().await; + } phase_timings.push(PhaseTiming { phase: "spawn".to_string(), duration: spawn_start.elapsed(), @@ -118,7 +200,7 @@ impl NativeIntegrationRunner { duration: spawn_start.elapsed(), }); - info!(test = %test_name, "Native process started."); + info!(test = %test_name, "ADP process started."); // Phase: run assertions. let assertion_start = Instant::now(); @@ -130,9 +212,13 @@ impl NativeIntegrationRunner { duration: assertion_start.elapsed(), }); - // Phase: cleanup. + // Phase: cleanup. ADP first, Core Agent second — in case the Agent's shutdown depends on + // ADP releasing connections gracefully. 
let cleanup_start = Instant::now(); process.cleanup().await; + if let Some(agent) = core_agent.take() { + agent.cleanup().await; + } phase_timings.push(PhaseTiming { phase: "cleanup".to_string(), duration: cleanup_start.elapsed(), @@ -245,6 +331,38 @@ fn resolve_adp_binary_path() -> Result { }) } +fn resolve_core_agent_binary_path() -> Result { + let raw = std::env::var(CORE_AGENT_BINARY_ENV_VAR) + .ok() + .map(PathBuf::from) + .unwrap_or_else(|| PathBuf::from(DEFAULT_CORE_AGENT_BINARY_PATH)); + + raw.canonicalize().with_error_context(|| { + format!( + "Core Agent binary not found at '{}'. Set {} or install the Datadog Agent (https://docs.datadoghq.com/agent/).", + raw.display(), + CORE_AGENT_BINARY_ENV_VAR + ) + }) +} + +async fn wait_for_agent_ipc_ready(state_dir: &std::path::Path, timeout: Duration) -> Result<(), GenericError> { + let auth_token = state_dir.join("auth_token"); + let ipc_cert = state_dir.join("ipc_cert.pem"); + let deadline = Instant::now() + timeout; + while Instant::now() < deadline { + if auth_token.is_file() && ipc_cert.is_file() { + return Ok(()); + } + tokio::time::sleep(CORE_AGENT_IPC_READY_POLL).await; + } + Err(saluki_error::generic_error!( + "Core Agent did not write 'auth_token' and 'ipc_cert.pem' to '{}' within {:?}.", + state_dir.display(), + timeout + )) +} + fn create_test_state_dir() -> Result { let suffix = rand::distr::Alphanumeric .sample_string(&mut rand::rng(), 8) From 257a7e19be7db1e73a250d657d47acfe0b9f7ba8 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Tue, 26 May 2026 13:40:04 -0400 Subject: [PATCH 10/56] test(integration): enable 3 converged tests on native_macos runtime Enables the first three converged tests in the native_macos runtime: - adp-rar-registration: ADP successfully registers with the Agent's Remote Agent Registry. - adp-rar-disabled: ADP handles registration failure gracefully when the Agent has the RAR disabled. 
- adp-config-check-warn: ADP warns (but does not exit) on medium-severity unsupported config keys. These all rely on the converged-spawn support added in the previous commit. Verified inside an ephemeral Tart macOS VM with a freshly installed Datadog Agent 7.78.0. Remaining converged tests are intentionally NOT enabled yet: - adp-cmd-port: needs investigation; requires a specific cmd_port that isn't being honored end-to-end on macOS yet. - adp-config-check-exit: asserts on the s6 supervisor's exit log, which has no native equivalent. - adp-config-stream: ADP waits indefinitely for config when the test uses the new config stream endpoint; needs investigation. - adp-logging-*: assert on Linux log paths (/var/log/datadog/...); macOS uses /opt/datadog-agent/logs/... so the assertions need platform-specific paths. --- test/integration/cases/adp-config-check-warn/config.yaml | 2 ++ test/integration/cases/adp-rar-disabled/config.yaml | 2 ++ test/integration/cases/adp-rar-registration/config.yaml | 2 ++ 3 files changed, 6 insertions(+) diff --git a/test/integration/cases/adp-config-check-warn/config.yaml b/test/integration/cases/adp-config-check-warn/config.yaml index 2b025c22c27..9e1b5cd8a4c 100644 --- a/test/integration/cases/adp-config-check-warn/config.yaml +++ b/test/integration/cases/adp-config-check-warn/config.yaml @@ -7,6 +7,8 @@ type: integration name: "adp-config-check-warn" description: "Verify config check warns on medium-severity incompatible keys without exiting" timeout: 120s +runtimes: [docker, native_macos] +requires_core_agent: true container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/adp-rar-disabled/config.yaml b/test/integration/cases/adp-rar-disabled/config.yaml index 7b32eafbda2..5f166802619 100644 --- a/test/integration/cases/adp-rar-disabled/config.yaml +++ b/test/integration/cases/adp-rar-disabled/config.yaml @@ -2,6 +2,8 @@ type: integration name: "adp-rar-disabled" description: "Verify ADP gracefully 
handles RAR being disabled on the Core Agent" timeout: 120s +runtimes: [docker, native_macos] +requires_core_agent: true container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/adp-rar-registration/config.yaml b/test/integration/cases/adp-rar-registration/config.yaml index f38324d535c..2d8580504c4 100644 --- a/test/integration/cases/adp-rar-registration/config.yaml +++ b/test/integration/cases/adp-rar-registration/config.yaml @@ -2,6 +2,8 @@ type: integration name: "adp-rar-registration" description: "Verify ADP successfully registers with Remote Agent Registry" timeout: 120s +runtimes: [docker, native_macos] +requires_core_agent: true container: image: "saluki-images/datadog-agent:testing-devel" From c365aaa6cada6e75d1c2a952e831e17efefea129 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Tue, 26 May 2026 13:57:03 -0400 Subject: [PATCH 11/56] fix(panoramic): set DD_AUTH_TOKEN_FILE_PATH on the Core Agent too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When ADP's bootstrap config flow uses the new config stream endpoint (DD_DATA_PLANE_USE_NEW_CONFIG_STREAM_ENDPOINT=true), the Agent's authoritative configuration is layered on top of ADP's env vars and takes precedence (per ConfigurationLoader's ordering: later sources win). Previously the native runner only set DD_AUTH_TOKEN_FILE_PATH on ADP, but the Agent's config stream still advertised the platform default (/opt/datadog-agent/etc/auth_token). ADP would honor the stream value for its post-bootstrap IPC clients, load the wrong cert, and fail TLS validation with 'invalid peer certificate: UnknownIssuer' — even though the cert and key on disk in the per-test state directory were correct. Fix: pass DD_AUTH_TOKEN_FILE_PATH to the Agent too so its config stream advertises the per-test path that both processes are actually using. 
Also enables adp-cmd-port and adp-config-stream on native_macos, which hit exactly this failure (they pass with the fix, observed in a Tart VM with a freshly provisioned Datadog Agent 7.78.0). --- bin/correctness/panoramic/src/native_runner.rs | 13 ++++++++++++- test/integration/cases/adp-cmd-port/config.yaml | 2 ++ .../integration/cases/adp-config-stream/config.yaml | 2 ++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/bin/correctness/panoramic/src/native_runner.rs b/bin/correctness/panoramic/src/native_runner.rs index 51c51078206..b7c58486f7e 100644 --- a/bin/correctness/panoramic/src/native_runner.rs +++ b/bin/correctness/panoramic/src/native_runner.rs @@ -121,13 +121,24 @@ impl NativeIntegrationRunner { }; debug!(test = %test_name, binary = %agent_binary.display(), "Resolved Core Agent binary path."); + // The Agent and ADP must agree on the auth_token / ipc_cert.pem path. The Agent's + // authoritative config (sent to ADP via the config stream) overrides ADP's env vars + // by design, so the Agent must itself be told about the per-test path — otherwise + // it advertises the platform default (`/opt/datadog-agent/etc/auth_token`), ADP + // follows that advice for its post-config-stream IPC clients, and TLS fails with + // UnknownIssuer because the platform default cert does not match what the per-test + // Agent is actually serving. 
+ let auth_token_path = state_dir.join("auth_token").to_string_lossy().into_owned(); + let mut agent_env = self.test_case.container.env.clone(); + agent_env.insert("DD_AUTH_TOKEN_FILE_PATH".to_string(), auth_token_path.clone()); + let agent_config = NativeProcessConfig::new(format!("{}-core-agent", self.test_case.name), agent_binary) .with_args(vec![ "run".to_string(), "-c".to_string(), state_dir.to_string_lossy().into_owned(), ]) - .with_env_map(self.test_case.container.env.clone()) + .with_env_map(agent_env) // The Core Agent forks `trace-agent` and `process-agent` helpers; without a process // group they orphan onto launchd on cleanup and continue holding ports (e.g., 8126 // for trace-agent), blocking subsequent tests. diff --git a/test/integration/cases/adp-cmd-port/config.yaml b/test/integration/cases/adp-cmd-port/config.yaml index 278dcf692e5..a65236c56ba 100644 --- a/test/integration/cases/adp-cmd-port/config.yaml +++ b/test/integration/cases/adp-cmd-port/config.yaml @@ -13,6 +13,8 @@ type: integration name: "adp-cmd-port" description: "Verifies ADP connects to the correct port when cmd_port is set" timeout: 120s +runtimes: [docker, native_macos] +requires_core_agent: true container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/adp-config-stream/config.yaml b/test/integration/cases/adp-config-stream/config.yaml index bcf48327eed..ba587c28066 100644 --- a/test/integration/cases/adp-config-stream/config.yaml +++ b/test/integration/cases/adp-config-stream/config.yaml @@ -2,6 +2,8 @@ type: integration name: "adp-config-stream" description: "Verify ADP receives configuration from Core Agent via config stream" timeout: 120s +runtimes: [docker, native_macos] +requires_core_agent: true container: image: "saluki-images/datadog-agent:testing-devel" From 68cd7c68bb4febc4a38f8ac4ab2fa3981762c53f Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Tue, 26 May 2026 14:10:16 -0400 Subject: [PATCH 12/56] feat(panoramic): add native 
code path to file_contains + propagate APP_* metadata to ADP native build Two related fixes needed for the adp-logging-* integration tests on the native_macos runtime: 1. The file_contains assertion previously always shelled out to 'docker exec cat ' to read files from the container. On native there is no container; files referenced by the test live on the host filesystem directly. Adds an is_native flag to AssertionContext (set by NativeIntegrationRunner) and branches the assertion to read via tokio::fs::read_to_string when native. 2. The Makefile's build-adp-native target was running 'cargo build' without the APP_FULL_NAME / APP_SHORT_NAME / APP_IDENTIFIER / APP_GIT_HASH / APP_VERSION / APP_BUILD_DATE env vars that the saluki-metadata build script reads. Without them, ADP logs as '| UNKNOWN |' instead of '| DATAPLANE |', and tests that look for the 'DATAPLANE' marker fail. Aligns with build-adp-base. --- Makefile | 8 ++++++- .../panoramic/src/assertions/file_contains.rs | 21 ++++++++++++++++++- .../panoramic/src/assertions/mod.rs | 4 ++++ .../panoramic/src/native_runner.rs | 2 ++ bin/correctness/panoramic/src/runner.rs | 1 + 5 files changed, 34 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 56c2385f4f5..7f9950da9a9 100644 --- a/Makefile +++ b/Makefile @@ -573,7 +573,13 @@ list-integration-tests: ## Lists available ADP integration tests build-adp-native: check-rust-build-tools build-adp-native: ## Builds the agent-data-plane binary natively for the current host (release profile) @echo "[*] Building agent-data-plane (release, native host target)..." 
- @cargo build --release --bin agent-data-plane + @APP_FULL_NAME="$(ADP_APP_FULL_NAME)" \ + APP_SHORT_NAME="$(ADP_APP_SHORT_NAME)" \ + APP_IDENTIFIER="$(ADP_APP_IDENTIFIER)" \ + APP_GIT_HASH="$(ADP_APP_GIT_HASH)" \ + APP_VERSION="$(ADP_APP_VERSION)" \ + APP_BUILD_DATE="$(ADP_APP_BUILD_DATE)" \ + cargo build --release --bin agent-data-plane .PHONY: test-integration-macos-run test-integration-macos-run: ## Runs native macOS integration tests using already-built binaries (assumes target/release/{panoramic,agent-data-plane} exist). Defaults to all native_macos tests; override with CASE=/native_macos. diff --git a/bin/correctness/panoramic/src/assertions/file_contains.rs b/bin/correctness/panoramic/src/assertions/file_contains.rs index 6758dc7c2f5..64d4df37377 100644 --- a/bin/correctness/panoramic/src/assertions/file_contains.rs +++ b/bin/correctness/panoramic/src/assertions/file_contains.rs @@ -95,7 +95,12 @@ impl Assertion for FileContainsAssertion { }; } - match read_file_in_container(&ctx.container_name, &self.path).await { + let read_result = if ctx.is_native { + read_file_local(&self.path).await + } else { + read_file_in_container(&ctx.container_name, &self.path).await + }; + match read_result { Ok(Some(content)) => { let matches = match &self.pattern { None => true, @@ -131,6 +136,20 @@ impl Assertion for FileContainsAssertion { } } +/// Reads a file from the host filesystem. +/// +/// Used by the `native_macos` runtime where ADP runs as a local process and writes log files to +/// real host paths. Returns the same shape as [`read_file_in_container`]: `Ok(Some(contents))` +/// when readable, `Ok(None)` when missing, `Err` for unexpected I/O failures. 
+async fn read_file_local(path: &str) -> Result, String> { + match tokio::fs::read_to_string(path).await { + Ok(contents) => Ok(Some(contents)), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(None), + Err(e) if e.kind() == std::io::ErrorKind::PermissionDenied => Ok(None), + Err(e) => Err(format!("Failed to read '{}': {}", path, e)), + } +} + /// Reads a file from inside the container via `docker exec cat `. /// /// Returns `Ok(Some(contents))` when the file exists and is readable, `Ok(None)` when the file is missing or diff --git a/bin/correctness/panoramic/src/assertions/mod.rs b/bin/correctness/panoramic/src/assertions/mod.rs index 9a6650fb960..2c6d3c8f557 100644 --- a/bin/correctness/panoramic/src/assertions/mod.rs +++ b/bin/correctness/panoramic/src/assertions/mod.rs @@ -102,6 +102,10 @@ pub struct AssertionContext { pub port_mappings: std::collections::HashMap, /// Name of the container being tested. pub container_name: String, + /// Whether the test is running natively (no container). When `true`, assertions that would + /// otherwise reach into a container (e.g., reading a file via `docker exec`) should operate + /// against the host filesystem / local process instead. + pub is_native: bool, } /// Trait for assertion implementations. 
diff --git a/bin/correctness/panoramic/src/native_runner.rs b/bin/correctness/panoramic/src/native_runner.rs index b7c58486f7e..e22ba8230ac 100644 --- a/bin/correctness/panoramic/src/native_runner.rs +++ b/bin/correctness/panoramic/src/native_runner.rs @@ -289,6 +289,7 @@ impl NativeIntegrationRunner { container_exit_token: exit_token.clone(), cancel_token: cancel_token.clone(), container_name: process_display_name.clone(), + is_native: true, port_mappings: port_mappings.clone(), }; results.push(assertion.check(&ctx).await); @@ -303,6 +304,7 @@ impl NativeIntegrationRunner { container_exit_token: exit_token.clone(), cancel_token: cancel_token.clone(), container_name: process_display_name.clone(), + is_native: true, port_mappings: port_mappings.clone(), }; futures.push(async move { a.check(&ctx).await }); diff --git a/bin/correctness/panoramic/src/runner.rs b/bin/correctness/panoramic/src/runner.rs index 355be1da870..63f4572419d 100644 --- a/bin/correctness/panoramic/src/runner.rs +++ b/bin/correctness/panoramic/src/runner.rs @@ -817,6 +817,7 @@ impl IntegrationRunner { cancel_token: self.tctx.test_cancel_token(), port_mappings: port_mappings.clone(), container_name: container_name.to_string(), + is_native: false, }; for (step_index, step) in self.test_case.assertions.iter().enumerate() { From d44faf2d4d0c9cb092d7d47763746fc45ed09b57 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Tue, 26 May 2026 14:11:15 -0400 Subject: [PATCH 13/56] test(integration): enable 5 more converged tests on native_macos Builds on the converged-spawn support and the DD_AUTH_TOKEN_FILE_PATH alignment + file_contains native code path landed in previous commits. 
Newly enabled: - adp-cmd-port: ADP connects to the Agent on a custom cmd_port (7777) - adp-config-stream: ADP reaches a healthy topology via the config stream from the Agent - adp-logging-default-path: ADP writes its log to the platform-default path under converged operation - adp-logging-ignores-core-agent-log-file: ADP does not honor the Core Agent's DD_LOG_FILE setting and keeps using its own log path - adp-logging-respects-data-plane-log-file: ADP honors DD_DATA_PLANE_LOG_FILE when set explicitly Total native_macos integration coverage is now 22 of 27 integration tests (17 standalone + 5 converged). Verified end-to-end inside an ephemeral Tart macOS VM with a freshly provisioned Datadog Agent 7.78.0 in 3m9s wall clock. Remaining 5 tests intentionally not enabled: - adp-disabled-exit, adp-config-check-exit: assert on the s6 supervisor log line in the converged Docker image; no s6 on native, so the assertion has no equivalent without a runner change. - adp-memory-mode-strict-exceeds-limit: same s6 supervisor log assertion. - dogstatsd-bind-host, dogstatsd-bind-custom-hostname: use PANORAMIC_DYNAMIC env shell hooks that run 'hostname -i' and 'echo ... >> /etc/hosts' — valid in a Linux container, not portable to a macOS host.
--- test/integration/cases/adp-logging-default-path/config.yaml | 2 ++ .../cases/adp-logging-ignores-core-agent-log-file/config.yaml | 2 ++ .../cases/adp-logging-respects-data-plane-log-file/config.yaml | 2 ++ 3 files changed, 6 insertions(+) diff --git a/test/integration/cases/adp-logging-default-path/config.yaml b/test/integration/cases/adp-logging-default-path/config.yaml index 0e2dd6b1d74..45b21e7b0e6 100644 --- a/test/integration/cases/adp-logging-default-path/config.yaml +++ b/test/integration/cases/adp-logging-default-path/config.yaml @@ -2,6 +2,8 @@ type: integration name: "adp-logging-default-path" description: "Verifies ADP writes to the platform-default log file path (/var/log/datadog/agent-data-plane.log) when no override is provided" timeout: 120s +runtimes: [docker, native_macos] +requires_core_agent: true container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/adp-logging-ignores-core-agent-log-file/config.yaml b/test/integration/cases/adp-logging-ignores-core-agent-log-file/config.yaml index e95585f335f..6dda23c78eb 100644 --- a/test/integration/cases/adp-logging-ignores-core-agent-log-file/config.yaml +++ b/test/integration/cases/adp-logging-ignores-core-agent-log-file/config.yaml @@ -2,6 +2,8 @@ type: integration name: "adp-logging-ignores-core-agent-log-file" description: "Verifies ADP ignores the Core Agent's `log_file` setting and continues to use its own per-subagent log file path" timeout: 120s +runtimes: [docker, native_macos] +requires_core_agent: true container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/adp-logging-respects-data-plane-log-file/config.yaml b/test/integration/cases/adp-logging-respects-data-plane-log-file/config.yaml index af06aead554..7d4abd43414 100644 --- a/test/integration/cases/adp-logging-respects-data-plane-log-file/config.yaml +++ b/test/integration/cases/adp-logging-respects-data-plane-log-file/config.yaml @@ -2,6 +2,8 @@ type: 
integration name: "adp-logging-respects-data-plane-log-file" description: "Verifies ADP honors the per-subagent `data_plane.log_file` setting when explicitly configured" timeout: 120s +runtimes: [docker, native_macos] +requires_core_agent: true container: image: "saluki-images/datadog-agent:testing-devel" From 0ac30a789a7406cb5d4c6c251c42f3eb94f3033b Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Tue, 26 May 2026 15:04:31 -0400 Subject: [PATCH 14/56] feat(panoramic): observe real native process exits + add runtime-aware adp_exits_with Three related changes that together unlock three more converged tests on the native_macos runtime. 1) NativeProcess now actually observes the spawned child's exit on its own (previously the exit token only fired during cleanup). The exit watcher moves the Child handle into a tokio task that calls wait() and records the exit code in a shared OnceLock cell. Cleanup now signals via libc::kill/killpg directly since the Child handle lives in the watcher. 2) Adds a new 'adp_exits_with' assertion that abstracts the runtime difference for 'did ADP exit with code N': - On docker (s6 wrapper): greps the captured log buffer for 'agent-data-plane exited with code N'. - On native_macos: reads the exit code from the per-process OnceLock cell populated by NativeProcess. Test configs use this once; both runtimes agree. 3) log_contains and log_not_contains now do a final post-exit read of the buffer instead of bailing on container_exit_token. log_contains was returning 'cancelled because container exited' before checking one last time; log_not_contains was missing the deserved success case for short-lived processes. ADP and the (optional) Core Agent get independent exit tokens on the native runner: the token passed to assertions is ADP's, so the Agent dying on its own does not falsely trigger 'process exited' for the test. The Agent's token is used only by NativeProcess internals. 
--- bin/correctness/airlock/src/native.rs | 114 ++++++++----- .../panoramic/src/assertions/adp_exits.rs | 149 +++++++++++++++++ .../panoramic/src/assertions/log_contains.rs | 38 ++++- .../panoramic/src/assertions/mod.rs | 9 ++ .../panoramic/src/assertions/process_exits.rs | 150 ++++++++++++------ bin/correctness/panoramic/src/config.rs | 23 ++- .../panoramic/src/native_runner.rs | 21 ++- bin/correctness/panoramic/src/runner.rs | 1 + .../cases/adp-rar-disabled/config.yaml | 7 +- 9 files changed, 414 insertions(+), 98 deletions(-) create mode 100644 bin/correctness/panoramic/src/assertions/adp_exits.rs diff --git a/bin/correctness/airlock/src/native.rs b/bin/correctness/airlock/src/native.rs index 108267f31cd..275c0da3a6f 100644 --- a/bin/correctness/airlock/src/native.rs +++ b/bin/correctness/airlock/src/native.rs @@ -8,18 +8,34 @@ //! Only the small subset of the Docker driver surface needed by the panoramic native runner is //! implemented: spawn, log capture, exit watching, and cleanup. -use std::{collections::HashMap, path::PathBuf, process::Stdio, sync::Arc, time::Duration}; +use std::{ + collections::HashMap, + path::PathBuf, + process::Stdio, + sync::{Arc, OnceLock}, + time::Duration, +}; use saluki_error::{generic_error, ErrorContext as _, GenericError}; use tokio::{ io::{AsyncBufReadExt as _, AsyncRead, BufReader}, - process::{Child, Command}, + process::Command, sync::Mutex, task::JoinHandle, }; use tokio_util::sync::CancellationToken; use tracing::{debug, warn}; +/// Shared cell that receives the exit code of a spawned [`NativeProcess`]. +/// +/// The cell is populated by the background exit watcher when the child exits on its own, or by +/// [`NativeProcess::cleanup`] when the test tears down. Consumers (e.g., the +/// `process_exits_with` assertion in panoramic) read the cell after the exit token fires. +/// +/// The inner `Option` is `None` if the process was terminated by signal rather than exiting +/// normally with a status code. 
+pub type ExitCodeCell = Arc>>; + /// Configuration for a native process to spawn. #[derive(Clone)] pub struct NativeProcessConfig { @@ -96,15 +112,21 @@ pub trait LogSink: Send + Sync { /// A spawned native process and its supporting tasks. /// /// `NativeProcess` owns the child process plus background tasks that pump stdout/stderr lines -/// into a shared sink and observe the child's exit. Calling [`cleanup`][Self::cleanup] kills the -/// child, joins the background tasks, and cancels the exit token. +/// into a shared sink and observe the child's exit. The provided exit token is cancelled when +/// the child process exits on its own (observed by the background watcher) or when +/// [`cleanup`][Self::cleanup] is called. The exit code is recorded in the shared +/// [`ExitCodeCell`] returned by [`exit_code_cell`][Self::exit_code_cell]. pub struct NativeProcess { name: String, - child: Option, /// PGID to signal on cleanup when the spawned process is a process group leader. `None` /// when [`NativeProcessConfig::use_process_group`] was `false`. process_group: Option, + /// The child process. Owned by the exit watcher; we communicate with it via signals. + /// + /// `None` once `cleanup` has reaped it (or never set if spawn failed before assignment). + child_pid: Option, exit_token: CancellationToken, + exit_code: ExitCodeCell, log_tasks: Vec>, exit_task: Option>, } @@ -144,8 +166,9 @@ impl NativeProcess { // When using a process group, capture the PGID. We made the child the group leader // (process_group(0)), so PGID == child PID. 
+ let child_pid = child.id(); let process_group = if config.use_process_group { - child.id().map(|pid| pid as i32) + child_pid.map(|pid| pid as i32) } else { None }; @@ -162,21 +185,35 @@ impl NativeProcess { let stdout_task = spawn_log_pump(stdout, log_sink.clone(), false); let stderr_task = spawn_log_pump(stderr, log_sink, true); - // We don't move the child here, so the actual exit observation happens in `cleanup` or - // `wait_with_timeout`. The exit_task is kept as a placeholder so future implementations - // can attach a SIGCHLD-style notifier without changing the public API. - let name_for_watcher = config.name.clone(); + // Real exit watcher: moves the child into the task, calls `wait()`, records the exit + // code, and fires the exit token so blocked assertions (process_stable_for / + // process_exits_with) unblock immediately rather than waiting for the test's own + // cleanup phase. + let exit_code: ExitCodeCell = Arc::new(OnceLock::new()); + let exit_code_for_watcher = exit_code.clone(); let exit_token_for_watcher = exit_token.clone(); + let name_for_watcher = config.name.clone(); let exit_task = tokio::spawn(async move { - debug!(name = %name_for_watcher, "Native process exit watcher placeholder; exit observation happens in cleanup."); - exit_token_for_watcher.cancelled().await; + match child.wait().await { + Ok(status) => { + let code = status.code(); + debug!(name = %name_for_watcher, ?code, "Native process exited."); + let _ = exit_code_for_watcher.set(code); + } + Err(e) => { + warn!(name = %name_for_watcher, error = %e, "Failed to wait on native process; treating as exited."); + let _ = exit_code_for_watcher.set(None); + } + } + exit_token_for_watcher.cancel(); }); Ok(Self { name: config.name, - child: Some(child), process_group, + child_pid, exit_token, + exit_code, log_tasks: vec![stdout_task, stderr_task], exit_task: Some(exit_task), }) @@ -192,24 +229,11 @@ impl NativeProcess { self.exit_token.clone() } - /// Waits for the process to exit, 
killing it if `timeout` elapses first. - /// - /// Returns the exit code if available, `None` if the process was terminated by signal. - #[allow(dead_code)] - pub async fn wait_with_timeout(&mut self, timeout: Duration) -> Result, GenericError> { - let child = self - .child - .as_mut() - .ok_or_else(|| generic_error!("Process already cleaned up."))?; - match tokio::time::timeout(timeout, child.wait()).await { - Ok(Ok(status)) => Ok(status.code()), - Ok(Err(e)) => Err(generic_error!("Failed to wait for process: {}", e)), - Err(_) => { - let _ = child.kill().await; - let _ = child.wait().await; - Err(generic_error!("Process did not exit within timeout.")) - } - } + /// Returns a clone of the shared exit-code cell. The cell is populated once the process + /// exits (either on its own or via cleanup). Consumers should wait on [`exit_token`] before + /// reading. + pub fn exit_code_cell(&self) -> ExitCodeCell { + self.exit_code.clone() } /// Kills the child (and its process group, if configured), joins background tasks, and @@ -229,16 +253,30 @@ impl NativeProcess { unsafe { libc::killpg(pgid, libc::SIGKILL); } + } else if let Some(pid) = self.child_pid { + // Fallback: just signal the direct child. The exit watcher owns the Child handle + // so we can't call kill() through it; use libc directly. + #[cfg(unix)] + unsafe { + libc::kill(pid as i32, libc::SIGTERM); + } + tokio::time::sleep(Duration::from_millis(200)).await; + #[cfg(unix)] + unsafe { + libc::kill(pid as i32, libc::SIGKILL); + } + #[cfg(not(unix))] + let _ = pid; } - if let Some(mut child) = self.child.take() { - let _ = child.kill().await; - let _ = child.wait().await; - } - self.exit_token.cancel(); + // The exit watcher will have observed the kill and set the exit code + fired the token. + // Join it so we don't leak the task. if let Some(handle) = self.exit_task.take() { let _ = handle.await; } + // Defensive: make sure the token is fired even if the watcher never set it (e.g., on a + // failed wait). 
+ self.exit_token.cancel(); for handle in self.log_tasks.drain(..) { let _ = handle.await; } @@ -247,10 +285,10 @@ impl NativeProcess { impl Drop for NativeProcess { fn drop(&mut self) { - if self.child.is_some() { + if self.exit_task.is_some() { warn!( name = %self.name, - "NativeProcess dropped without explicit cleanup; child will be killed via kill_on_drop." + "NativeProcess dropped without explicit cleanup; child may have been killed via kill_on_drop." ); } } diff --git a/bin/correctness/panoramic/src/assertions/adp_exits.rs b/bin/correctness/panoramic/src/assertions/adp_exits.rs new file mode 100644 index 00000000000..ccb5de614a6 --- /dev/null +++ b/bin/correctness/panoramic/src/assertions/adp_exits.rs @@ -0,0 +1,149 @@ +use std::time::{Duration, Instant}; + +use crate::{ + assertions::{Assertion, AssertionContext, AssertionResult}, + config::LogStream, +}; + +/// Assertion that checks ADP exited with a specific exit code, abstracting over the runtime. +/// +/// On the `docker` runtime ADP runs under s6, which keeps the container alive across ADP +/// restarts and logs `"agent-data-plane exited with code N"` from +/// `docker/s6-services/agent-data-plane/finish` when ADP exits. We grep the captured log buffer +/// for that line. +/// +/// On the `native_macos` runtime there is no supervisor. The native runner observes ADP's child +/// process exit directly and records the exit code in the shared cell on +/// [`AssertionContext::native_exit_code`]. 
+pub struct AdpExitsWithAssertion { + expected_code: i64, + timeout: Duration, +} + +impl AdpExitsWithAssertion { + pub fn new(expected_code: i64, timeout: Duration) -> Self { + Self { expected_code, timeout } + } +} + +#[async_trait::async_trait] +impl Assertion for AdpExitsWithAssertion { + fn name(&self) -> &'static str { + "adp_exits_with" + } + + fn description(&self) -> String { + format!("ADP exits with code {} within {:?}.", self.expected_code, self.timeout) + } + + async fn check(&self, ctx: &AssertionContext) -> AssertionResult { + let started = Instant::now(); + if ctx.is_native { + self.check_native(ctx, started).await + } else { + self.check_docker_via_supervisor_log(ctx, started).await + } + } +} + +impl AdpExitsWithAssertion { + async fn check_native(&self, ctx: &AssertionContext, started: Instant) -> AssertionResult { + let cell = match ctx.native_exit_code.as_ref() { + Some(c) => c.clone(), + None => { + return AssertionResult { + name: self.name().to_string(), + passed: false, + message: "Native exit code cell not provided in AssertionContext.".to_string(), + duration: started.elapsed(), + }; + } + }; + + // Wait until either the exit token fires (process exited or was cleaned up) or the + // timeout elapses. + tokio::select! 
{ + _ = ctx.container_exit_token.cancelled() => {} + _ = tokio::time::sleep(self.timeout) => { + return AssertionResult { + name: self.name().to_string(), + passed: false, + message: format!("ADP did not exit within {:?}.", self.timeout), + duration: started.elapsed(), + }; + } + } + + match cell.get() { + Some(Some(code)) => { + let code = *code as i64; + if code == self.expected_code { + AssertionResult { + name: self.name().to_string(), + passed: true, + message: format!("ADP exited with expected code {}.", code), + duration: started.elapsed(), + } + } else { + AssertionResult { + name: self.name().to_string(), + passed: false, + message: format!("ADP exited with code {}, expected {}.", code, self.expected_code), + duration: started.elapsed(), + } + } + } + Some(None) => AssertionResult { + name: self.name().to_string(), + passed: false, + message: "ADP was terminated by signal; no exit code available.".to_string(), + duration: started.elapsed(), + }, + None => AssertionResult { + name: self.name().to_string(), + passed: false, + message: "Exit token fired but exit code not yet recorded.".to_string(), + duration: started.elapsed(), + }, + } + } + + async fn check_docker_via_supervisor_log(&self, ctx: &AssertionContext, started: Instant) -> AssertionResult { + // s6 writes `agent-data-plane exited with code N` to the container's log stream when + // ADP exits. Poll the captured log buffer for that line. 
+ let pattern = format!("agent-data-plane exited with code {}", self.expected_code); + let deadline = Instant::now() + self.timeout; + loop { + if Instant::now() > deadline { + return AssertionResult { + name: self.name().to_string(), + passed: false, + message: format!( + "Did not observe ADP exit with code {} within {:?}.", + self.expected_code, self.timeout + ), + duration: started.elapsed(), + }; + } + if ctx.cancel_token.is_cancelled() { + return AssertionResult { + name: self.name().to_string(), + passed: false, + message: "Assertion cancelled.".to_string(), + duration: started.elapsed(), + }; + } + let buf = ctx.log_buffer.read().await; + if buf.contains_match(&pattern, false, &LogStream::Both) { + return AssertionResult { + name: self.name().to_string(), + passed: true, + message: format!("Observed ADP exit with expected code {}.", self.expected_code), + duration: started.elapsed(), + }; + } + drop(buf); + tokio::time::sleep(Duration::from_millis(200)).await; + } + } +} diff --git a/bin/correctness/panoramic/src/assertions/log_contains.rs b/bin/correctness/panoramic/src/assertions/log_contains.rs index 8faf6ed8e38..a5582173fee 100644 --- a/bin/correctness/panoramic/src/assertions/log_contains.rs +++ b/bin/correctness/panoramic/src/assertions/log_contains.rs @@ -63,11 +63,15 @@ impl Assertion for LogContainsAssertion { }; } - if ctx.cancel_token.is_cancelled() || ctx.container_exit_token.is_cancelled() { + // If the process exited, do a final read of the log buffer in case the line we are + // looking for landed before the exit. Only treat true external cancellation (e.g., + // Ctrl-C / test-suite timeout) as a hard cancel. 
+ let exited = ctx.container_exit_token.is_cancelled(); + if ctx.cancel_token.is_cancelled() { return AssertionResult { name: self.name().to_string(), passed: false, - message: "Assertion cancelled because container exited.".to_string(), + message: "Assertion cancelled.".to_string(), duration: started.elapsed(), }; } @@ -92,6 +96,17 @@ impl Assertion for LogContainsAssertion { ); } + // After the final post-exit read, if we still have not matched, the pattern is + // definitively absent. Stop polling rather than spinning until the deadline. + if exited { + return AssertionResult { + name: self.name().to_string(), + passed: false, + message: format!("Pattern '{}' not found in logs before process exited.", self.pattern), + duration: started.elapsed(), + }; + } + tokio::time::sleep(Duration::from_millis(100)).await; } } @@ -157,11 +172,15 @@ impl Assertion for LogNotContainsAssertion { }; } - if ctx.cancel_token.is_cancelled() || ctx.container_exit_token.is_cancelled() { + // If the process exited, do a final read of the log buffer in case the line we are + // looking for landed before the exit. Only treat true external cancellation (e.g., + // Ctrl-C / test-suite timeout) as a hard cancel. + let exited = ctx.container_exit_token.is_cancelled(); + if ctx.cancel_token.is_cancelled() { return AssertionResult { name: self.name().to_string(), passed: false, - message: "Assertion cancelled because container exited.".to_string(), + message: "Assertion cancelled.".to_string(), duration: started.elapsed(), }; } @@ -186,6 +205,17 @@ impl Assertion for LogNotContainsAssertion { } } + // If the process exited, the absence of the pattern is final — nothing more can + // be logged. Treat this as success (the pattern never appeared during the run). 
+ if exited { + return AssertionResult { + name: self.name().to_string(), + passed: true, + message: format!("Pattern '{}' not found in logs; process exited cleanly.", self.pattern), + duration: started.elapsed(), + }; + } + tokio::time::sleep(Duration::from_millis(100)).await; } } diff --git a/bin/correctness/panoramic/src/assertions/mod.rs b/bin/correctness/panoramic/src/assertions/mod.rs index 2c6d3c8f557..5a99f53930c 100644 --- a/bin/correctness/panoramic/src/assertions/mod.rs +++ b/bin/correctness/panoramic/src/assertions/mod.rs @@ -6,6 +6,7 @@ use tokio_util::sync::CancellationToken; use crate::config::{AssertionConfig, LogStream}; +mod adp_exits; mod file_contains; mod http_check; mod log_contains; @@ -13,6 +14,7 @@ mod port_listening; mod process_exits; mod process_stable; +pub use adp_exits::AdpExitsWithAssertion; pub use file_contains::FileContainsAssertion; pub use http_check::HttpCheckAssertion; pub use log_contains::{LogContainsAssertion, LogNotContainsAssertion}; @@ -106,6 +108,10 @@ pub struct AssertionContext { /// otherwise reach into a container (e.g., reading a file via `docker exec`) should operate /// against the host filesystem / local process instead. pub is_native: bool, + /// Exit code of the native target process, populated once it exits. `None` on the docker + /// path or while the process is still running; `Some(None)` if the process was killed by + /// signal; `Some(Some(code))` if it exited normally. + pub native_exit_code: Option, } /// Trait for assertion implementations. 
@@ -128,6 +134,9 @@ pub fn create_assertion(config: &AssertionConfig) -> Result, AssertionConfig::ProcessExitsWith { expected_code, timeout } => { Ok(Box::new(ProcessExitsWithAssertion::new(*expected_code, timeout.0))) } + AssertionConfig::AdpExitsWith { expected_code, timeout } => { + Ok(Box::new(AdpExitsWithAssertion::new(*expected_code, timeout.0))) + } AssertionConfig::PortListening { port, protocol, diff --git a/bin/correctness/panoramic/src/assertions/process_exits.rs b/bin/correctness/panoramic/src/assertions/process_exits.rs index 8efbf4b3175..a4e979bf5b1 100644 --- a/bin/correctness/panoramic/src/assertions/process_exits.rs +++ b/bin/correctness/panoramic/src/assertions/process_exits.rs @@ -31,58 +31,15 @@ impl Assertion for ProcessExitsWithAssertion { let started = Instant::now(); tokio::select! { - // Wait for the container to exit. _ = ctx.container_exit_token.cancelled() => { - // Container exited - check exit code via Docker API - let docker: bollard::Docker = match airlock::docker::connect() { - Ok(d) => d, - Err(e) => { - return AssertionResult { - name: self.name().to_string(), - passed: false, - message: format!("Failed to connect to Docker: {}", e), - duration: started.elapsed(), - }; - } - }; - - let info = docker.inspect_container(&ctx.container_name, None).await; - - match info { - Ok(container) => { - let exit_code = container.state - .and_then(|s| s.exit_code) - .unwrap_or(-1); - - if exit_code == self.expected_code { - AssertionResult { - name: self.name().to_string(), - passed: true, - message: format!("Process exited with expected code {}.", exit_code), - duration: started.elapsed(), - } - } else { - AssertionResult { - name: self.name().to_string(), - passed: false, - message: format!( - "Process exited with code {}, expected {}.", - exit_code, self.expected_code - ), - duration: started.elapsed(), - } - } - } - Err(e) => AssertionResult { - name: self.name().to_string(), - passed: false, - message: format!("Failed to inspect container: 
{}", e), - duration: started.elapsed(), - } + if ctx.is_native { + self.check_native(ctx, started) + } else { + self.check_docker(ctx, started).await } } - // Timeout waiting for the container to exit. + // Timeout waiting for the process to exit. _ = tokio::time::sleep(self.timeout) => { AssertionResult { name: self.name().to_string(), @@ -94,3 +51,100 @@ impl Assertion for ProcessExitsWithAssertion { } } } + +impl ProcessExitsWithAssertion { + fn check_native(&self, ctx: &AssertionContext, started: Instant) -> AssertionResult { + let cell = match ctx.native_exit_code.as_ref() { + Some(c) => c, + None => { + return AssertionResult { + name: self.name().to_string(), + passed: false, + message: "Native exit code cell not provided in AssertionContext.".to_string(), + duration: started.elapsed(), + }; + } + }; + let exit_code = match cell.get() { + Some(Some(code)) => *code as i64, + Some(None) => { + return AssertionResult { + name: self.name().to_string(), + passed: false, + message: "Process was terminated by signal; no exit code available.".to_string(), + duration: started.elapsed(), + }; + } + None => { + return AssertionResult { + name: self.name().to_string(), + passed: false, + message: "Exit token fired but exit code not yet recorded.".to_string(), + duration: started.elapsed(), + }; + } + }; + if exit_code == self.expected_code { + AssertionResult { + name: self.name().to_string(), + passed: true, + message: format!("Process exited with expected code {}.", exit_code), + duration: started.elapsed(), + } + } else { + AssertionResult { + name: self.name().to_string(), + passed: false, + message: format!( + "Process exited with code {}, expected {}.", + exit_code, self.expected_code + ), + duration: started.elapsed(), + } + } + } + + async fn check_docker(&self, ctx: &AssertionContext, started: Instant) -> AssertionResult { + let docker: bollard::Docker = match airlock::docker::connect() { + Ok(d) => d, + Err(e) => { + return AssertionResult { + name: 
self.name().to_string(), + passed: false, + message: format!("Failed to connect to Docker: {}", e), + duration: started.elapsed(), + }; + } + }; + + match docker.inspect_container(&ctx.container_name, None).await { + Ok(container) => { + let exit_code = container.state.and_then(|s| s.exit_code).unwrap_or(-1); + if exit_code == self.expected_code { + AssertionResult { + name: self.name().to_string(), + passed: true, + message: format!("Process exited with expected code {}.", exit_code), + duration: started.elapsed(), + } + } else { + AssertionResult { + name: self.name().to_string(), + passed: false, + message: format!( + "Process exited with code {}, expected {}.", + exit_code, self.expected_code + ), + duration: started.elapsed(), + } + } + } + Err(e) => AssertionResult { + name: self.name().to_string(), + passed: false, + message: format!("Failed to inspect container: {}", e), + duration: started.elapsed(), + }, + } + } +} diff --git a/bin/correctness/panoramic/src/config.rs b/bin/correctness/panoramic/src/config.rs index 77cd9f958e1..21adb0287e5 100644 --- a/bin/correctness/panoramic/src/config.rs +++ b/bin/correctness/panoramic/src/config.rs @@ -217,6 +217,21 @@ pub enum AssertionConfig { timeout: HumanDuration, }, + /// Check that ADP itself exits with a specific exit code, abstracting over the runtime's + /// observation mechanism. + /// + /// On the `docker` runtime the converged image wraps ADP under s6, which keeps the + /// container alive across ADP restarts and logs `agent-data-plane exited with code N` from + /// `docker/s6-services/agent-data-plane/finish`. This assertion greps the log buffer for + /// that line. On the `native_macos` runtime ADP is spawned directly; the assertion reads + /// the exit code recorded by the native runner when ADP's child process exited. + AdpExitsWith { + /// The expected exit code. + expected_code: i64, + /// Timeout for waiting for the exit to be observed. 
+ timeout: HumanDuration, + }, + /// Check that a port is listening. PortListening { /// The port number to check. @@ -333,7 +348,9 @@ impl AssertionConfig { crate::dynamic_vars::resolve_placeholders(p, vars); } } - AssertionConfig::ProcessStableFor { .. } | AssertionConfig::ProcessExitsWith { .. } => {} + AssertionConfig::ProcessStableFor { .. } + | AssertionConfig::ProcessExitsWith { .. } + | AssertionConfig::AdpExitsWith { .. } => {} } } @@ -356,7 +373,9 @@ impl AssertionConfig { crate::dynamic_vars::find_unresolved(p, &mut out); } } - AssertionConfig::ProcessStableFor { .. } | AssertionConfig::ProcessExitsWith { .. } => {} + AssertionConfig::ProcessStableFor { .. } + | AssertionConfig::ProcessExitsWith { .. } + | AssertionConfig::AdpExitsWith { .. } => {} } out } diff --git a/bin/correctness/panoramic/src/native_runner.rs b/bin/correctness/panoramic/src/native_runner.rs index e22ba8230ac..fc50f60db94 100644 --- a/bin/correctness/panoramic/src/native_runner.rs +++ b/bin/correctness/panoramic/src/native_runner.rs @@ -99,7 +99,13 @@ impl NativeIntegrationRunner { } debug!(test = %test_name, state_dir = %state_dir.display(), "Prepared per-test state directory."); - let exit_token = CancellationToken::new(); + // ADP and the (optional) Core Agent get independent exit tokens. The token passed to + // assertions is ADP's — we only care about ADP's exit lifecycle from the test's point + // of view; the Core Agent dying separately is an environmental fault, not a test + // signal. The Agent's token is used only by `NativeProcess` internals to fire when the + // Agent process truly exits. + let adp_exit_token = CancellationToken::new(); + let agent_exit_token = CancellationToken::new(); let log_sink: Arc> = Arc::new(Mutex::new(NativeLogSink { buf: self.log_buffer.clone(), })); @@ -144,7 +150,7 @@ impl NativeIntegrationRunner { // for trace-agent), blocking subsequent tests. 
.with_process_group(); - let agent = match NativeProcess::spawn(agent_config, log_sink.clone(), exit_token.clone()).await { + let agent = match NativeProcess::spawn(agent_config, log_sink.clone(), agent_exit_token.clone()).await { Ok(p) => p, Err(e) => { phase_timings.push(PhaseTiming { @@ -193,7 +199,7 @@ impl NativeIntegrationRunner { .with_args(vec!["-c".to_string(), config_path_str, "run".to_string()]) .with_env_map(adp_env); - let process = match NativeProcess::spawn(process_config, log_sink, exit_token.clone()).await { + let process = match NativeProcess::spawn(process_config, log_sink, adp_exit_token.clone()).await { Ok(p) => p, Err(e) => { if let Some(agent) = core_agent.take() { @@ -216,7 +222,11 @@ impl NativeIntegrationRunner { // Phase: run assertions. let assertion_start = Instant::now(); let assertion_results = self - .run_assertions(process.name().to_string(), exit_token.clone()) + .run_assertions( + process.name().to_string(), + adp_exit_token.clone(), + process.exit_code_cell(), + ) .await; phase_timings.push(PhaseTiming { phase: "assertions".to_string(), @@ -264,6 +274,7 @@ impl NativeIntegrationRunner { async fn run_assertions( &self, process_display_name: String, exit_token: CancellationToken, + exit_code_cell: airlock::native::ExitCodeCell, ) -> Vec { let mut results = Vec::new(); let cancel_token = self.tctx.test_cancel_token(); @@ -290,6 +301,7 @@ impl NativeIntegrationRunner { cancel_token: cancel_token.clone(), container_name: process_display_name.clone(), is_native: true, + native_exit_code: Some(exit_code_cell.clone()), port_mappings: port_mappings.clone(), }; results.push(assertion.check(&ctx).await); @@ -305,6 +317,7 @@ impl NativeIntegrationRunner { cancel_token: cancel_token.clone(), container_name: process_display_name.clone(), is_native: true, + native_exit_code: Some(exit_code_cell.clone()), port_mappings: port_mappings.clone(), }; futures.push(async move { a.check(&ctx).await }); diff --git 
a/bin/correctness/panoramic/src/runner.rs b/bin/correctness/panoramic/src/runner.rs index 63f4572419d..02ac477ef97 100644 --- a/bin/correctness/panoramic/src/runner.rs +++ b/bin/correctness/panoramic/src/runner.rs @@ -818,6 +818,7 @@ impl IntegrationRunner { port_mappings: port_mappings.clone(), container_name: container_name.to_string(), is_native: false, + native_exit_code: None, }; for (step_index, step) in self.test_case.assertions.iter().enumerate() { diff --git a/test/integration/cases/adp-rar-disabled/config.yaml b/test/integration/cases/adp-rar-disabled/config.yaml index 5f166802619..8d4d0603b84 100644 --- a/test/integration/cases/adp-rar-disabled/config.yaml +++ b/test/integration/cases/adp-rar-disabled/config.yaml @@ -2,8 +2,11 @@ type: integration name: "adp-rar-disabled" description: "Verify ADP gracefully handles RAR being disabled on the Core Agent" timeout: 120s -runtimes: [docker, native_macos] -requires_core_agent: true +# Docker-only: the assertion is that ADP stays up (handles failure gracefully via retry). In +# the converged image, s6 restarts ADP on every exit, so 'process stable for 10s' really +# means 'container stable for 10s'. There is no equivalent supervisor on the native_macos +# runtime, so this assertion does not translate; re-enable once the native runner grows a +# supervisor or the test is rewritten to assert on retry behavior directly. container: image: "saluki-images/datadog-agent:testing-devel" From 1d27828ea987fd8d2b17afcbfdce7445989958eb Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Tue, 26 May 2026 15:05:38 -0400 Subject: [PATCH 15/56] test(integration): enable 3 more tests on native_macos via adp_exits_with Newly enabled on the native_macos runtime: - adp-disabled-exit: ADP logs and exits when DD_DATA_PLANE_ENABLED=false. Needs requires_core_agent: true because remote_agent_enabled defaults to true; ADP must complete its IPC handshake with the Agent before reaching the 'not enabled, exiting' check. 
- adp-config-check-exit: replaces the docker-specific s6 supervisor log assertion ('agent-data-plane exited with code') with the new adp_exits_with assertion. The assertion works on both runtimes: docker still greps the s6 log; native reads the exit code cell. - adp-memory-mode-strict-exceeds-limit: same s6-log -> adp_exits_with swap. Total native_macos coverage is now 24 of 27 integration tests. Remaining 3 not enabled: - adp-rar-disabled: the assertion is 'container stable for 10s', but on docker that really means 's6 keeps restarting ADP for 10s.' There is no equivalent supervisor on native_macos; ADP exits when registration fails (which is expected, and the s6 wrapper recovers on its side). Re-enable once the native runner grows restart-on-failure semantics, or rewrite the assertion to probe retry behavior directly. - dogstatsd-bind-host, dogstatsd-bind-custom-hostname: use PANORAMIC_DYNAMIC env shell hooks that run 'hostname -i' and 'echo ... >> /etc/hosts' — Linux-container-isms that need a portable equivalent or a per-runtime DYNAMIC mechanism on the native runner. 
--- .../cases/adp-config-check-exit/config.yaml | 13 +++++++------ .../integration/cases/adp-disabled-exit/config.yaml | 2 ++ .../config.yaml | 10 ++++++---- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/test/integration/cases/adp-config-check-exit/config.yaml b/test/integration/cases/adp-config-check-exit/config.yaml index f9a7871e811..7fb4667775d 100644 --- a/test/integration/cases/adp-config-check-exit/config.yaml +++ b/test/integration/cases/adp-config-check-exit/config.yaml @@ -10,6 +10,8 @@ type: integration name: "adp-config-check-exit" description: "Verify config check exits ADP on high-severity incompatible key" timeout: 120s +runtimes: [docker, native_macos] +requires_core_agent: true container: image: "saluki-images/datadog-agent:testing-devel" @@ -29,10 +31,9 @@ assertions: - type: log_contains pattern: "incompatible configuration detected" timeout: 30s - # The bundled image runs ADP under s6, which logs this line from - # docker/s6-services/agent-data-plane/finish when the ADP process exits. - # Asserting it proves ADP actually terminated rather than logging the - # error and continuing. - - type: log_contains - pattern: "agent-data-plane exited with code" + # Verify ADP actually exited (not just logged the error and continued). On docker this + # observes the s6 supervisor's exit log; on native_macos it observes the process exit code + # directly. Expected code is 1 because the high-severity check returns a non-zero status. 
+ - type: adp_exits_with + expected_code: 1 timeout: 30s diff --git a/test/integration/cases/adp-disabled-exit/config.yaml b/test/integration/cases/adp-disabled-exit/config.yaml index 4cb1c58a3a2..7f651eac373 100644 --- a/test/integration/cases/adp-disabled-exit/config.yaml +++ b/test/integration/cases/adp-disabled-exit/config.yaml @@ -2,6 +2,8 @@ type: integration name: "adp-disabled-exit" description: "Verify ADP exits cleanly when data plane is not enabled" timeout: 120s +runtimes: [docker, native_macos] +requires_core_agent: true container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/adp-memory-mode-strict-exceeds-limit/config.yaml b/test/integration/cases/adp-memory-mode-strict-exceeds-limit/config.yaml index a39c1055572..4faf6831a0d 100644 --- a/test/integration/cases/adp-memory-mode-strict-exceeds-limit/config.yaml +++ b/test/integration/cases/adp-memory-mode-strict-exceeds-limit/config.yaml @@ -2,6 +2,7 @@ type: integration name: "adp-memory-mode-strict-exceeds-limit" description: "Verifies that strict mode causes ADP to exit with code 1 when the calculated bounds exceed the configured limit" timeout: 120s +runtimes: [docker, native_macos] container: image: "saluki-images/datadog-agent:testing-devel" @@ -17,8 +18,9 @@ container: DD_AGGREGATE_CONTEXT_LIMIT: "10000000" assertions: - # The container's s6 supervisor restarts ADP after every exit, so the container itself never - # exits. Match on the supervisor's exit log instead, which reports ADP's actual exit code. - - type: log_contains - pattern: "agent-data-plane exited with code 1" + # Observe ADP's actual exit code, regardless of runtime. On docker (s6) this greps the + # supervisor's exit log line; on native_macos this reads the exit code recorded by the + # native runner. 
+ - type: adp_exits_with + expected_code: 1 timeout: 30s From e308a668adbc44569d59e021e8dc4ac20ecabbab Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Tue, 26 May 2026 15:14:39 -0400 Subject: [PATCH 16/56] ci: wire up native_macos integration tests on the macOS runners Adds two GitLab CI jobs in the e2e stage: - test-integration-macos-arm64 (runs on macos:sonoma-arm64) - test-integration-macos-amd64 (runs on macos:sonoma-amd64) Both extend a shared .test-integration-macos-base mixin and invoke a new 'test-integration-macos-ci' make target that: 1. Builds panoramic and agent-data-plane natively for the runner's architecture. 2. Provisions the test environment via 'provision-macos-test-env', which is idempotent: - If /opt/datadog-agent is missing, downloads the matching DMG ($(uname -m) selects arm64 vs x86_64) and runs the installer (the postinstall step exits non-zero but the agent binary lands before that point, so we tolerate the failure). - If the IPC cert/auth_token are missing, runs the Agent briefly to bootstrap them. - chowns them to the current user only if not already readable so sudo isn't invoked needlessly on already-set-up systems. 3. Runs the native_macos integration suite via 'test-integration-macos-run'. Before each job, a defensive 'sudo pkill' clears any stranded datadog-agent processes from prior runs on the shared runner. Caching: the Datadog Agent install at /opt/datadog-agent persists across jobs on a given runner, so subsequent runs skip the (~15s) DMG download and install steps entirely. First-run overhead on a fresh runner is ~30-60s. The DMG version is pinned via MACOS_TEST_AGENT_VERSION (default 7.78.0) for reproducibility; bump it when integration tests need newer Agent behavior. 
The 3 tests not enabled on native_macos (adp-rar-disabled, dogstatsd-bind-host, dogstatsd-bind-custom-hostname) are filtered out naturally because they don't list 'native_macos' in their 'runtimes' field, and panoramic's --runtime filter only selects matching tests. --- .gitlab/e2e.yml | 36 ++++++++++++++++++++++++++++++++ Makefile | 55 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) diff --git a/.gitlab/e2e.yml b/.gitlab/e2e.yml index 330c78bd240..f7d05600b10 100644 --- a/.gitlab/e2e.yml +++ b/.gitlab/e2e.yml @@ -116,3 +116,39 @@ test-integration: - docker pull ${SALUKI_IMAGE_REPO_BASE}/bundled-agent-adp:${CI_COMMIT_SHA} - docker tag ${SALUKI_IMAGE_REPO_BASE}/bundled-agent-adp:${CI_COMMIT_SHA} saluki-images/datadog-agent:testing-devel - make test-integration-quick + +# Runs the subset of integration tests that have opted in to the `native_macos` runtime +# directly on a bare-metal macOS runner. No Docker, no virtualization: panoramic spawns ADP +# (and the Core Agent for converged tests) as real macOS processes against a per-test temp +# state directory. The Datadog Agent install at /opt/datadog-agent is provisioned by the +# Makefile target (idempotent: re-uses an existing install if the runner has one). +.test-integration-macos-base: + stage: e2e + needs: [] + retry: 2 + timeout: 30m + artifacts: + expire_in: 1 week + paths: + - integration-logs/ + when: always + variables: + PANORAMIC_LOG_DIR: integration-logs + before_script: + # Defensive: clean up any leftover Datadog Agent processes from prior runs on this shared + # runner before we begin. Otherwise a stranded trace-agent/process-agent can hold ports + # (e.g., 8126) and break the first converged test. 
+ - sudo pkill -9 -f /opt/datadog-agent/bin/agent/agent || true + - sudo pkill -9 -f /opt/datadog-agent/embedded/bin/ || true + script: + - make test-integration-macos-ci + +test-integration-macos-arm64: + extends: + - .macos-arm64-test-job + - .test-integration-macos-base + +test-integration-macos-amd64: + extends: + - .macos-amd64-test-job + - .test-integration-macos-base diff --git a/Makefile b/Makefile index 7f9950da9a9..d0cfa6cbf4f 100644 --- a/Makefile +++ b/Makefile @@ -592,6 +592,61 @@ test-integration-macos-run: ## Runs native macOS integration tests using already .PHONY: test-integration-macos test-integration-macos: build-panoramic build-adp-native test-integration-macos-run ## Builds and runs ADP integration tests natively on macOS (no Docker) +# Version of the Datadog Agent installed by `provision-macos-test-env`. Pinned for +# reproducibility; bump when the integration tests need newer Agent behavior. +MACOS_TEST_AGENT_VERSION ?= 7.78.0 +MACOS_TEST_AGENT_DMG_DIR ?= /tmp/saluki-dda-dmg-cache +MACOS_TEST_AGENT_DMG_URL ?= https://s3.amazonaws.com/dd-agent/datadog-agent-$(MACOS_TEST_AGENT_VERSION)-1.$(shell uname -m).dmg + +.PHONY: provision-macos-test-env +provision-macos-test-env: ## Idempotently installs the Datadog Agent at /opt/datadog-agent and bootstraps the IPC cert; required by converged native_macos integration tests. + @echo "[*] Provisioning macOS test environment..." + @if [ "$(shell uname -s)" != "Darwin" ]; then \ + echo "provision-macos-test-env only runs on macOS hosts" >&2; exit 1; \ + fi + @if [ ! -x /opt/datadog-agent/bin/agent/agent ]; then \ + echo "[*] Installing Datadog Agent $(MACOS_TEST_AGENT_VERSION)..."; \ + mkdir -p $(MACOS_TEST_AGENT_DMG_DIR); \ + DMG_PATH=$(MACOS_TEST_AGENT_DMG_DIR)/datadog-agent-$(MACOS_TEST_AGENT_VERSION).dmg; \ + if [ ! 
-f "$$DMG_PATH" ]; then \ + curl -fL "$(MACOS_TEST_AGENT_DMG_URL)" -o "$$DMG_PATH"; \ + fi; \ + sudo hdiutil detach /Volumes/datadog_agent 2>/dev/null || true; \ + sudo hdiutil attach "$$DMG_PATH" -mountpoint /Volumes/datadog_agent -nobrowse >/dev/null; \ + PKG=$$(find /Volumes/datadog_agent -name '*.pkg' | head -1); \ + echo "[*] Running installer (postinstall may fail; the binaries we need are written before postinstall runs)"; \ + sudo /usr/sbin/installer -pkg "$$PKG" -target / >/dev/null 2>&1 || true; \ + sudo hdiutil detach /Volumes/datadog_agent >/dev/null 2>&1; \ + test -x /opt/datadog-agent/bin/agent/agent; \ + else \ + echo "[*] Datadog Agent already installed at /opt/datadog-agent"; \ + fi + @if [ ! -f /opt/datadog-agent/etc/ipc_cert.pem ] || [ ! -f /opt/datadog-agent/etc/auth_token ]; then \ + echo "[*] Bootstrapping IPC cert + auth_token by running the Agent briefly..."; \ + sudo mkdir -p /opt/datadog-agent/run /opt/datadog-agent/etc; \ + sudo DD_API_KEY=bootstrap DD_HOSTNAME=bootstrap /opt/datadog-agent/bin/agent/agent run -c /opt/datadog-agent/etc >/tmp/saluki-agent-bootstrap.log 2>&1 & \ + AGENT_PID=$$!; \ + for i in $$(seq 1 30); do \ + sleep 1; \ + if [ -f /opt/datadog-agent/etc/ipc_cert.pem ] && [ -f /opt/datadog-agent/etc/auth_token ]; then break; fi; \ + done; \ + sudo kill $$AGENT_PID 2>/dev/null || true; \ + wait $$AGENT_PID 2>/dev/null || true; \ + test -f /opt/datadog-agent/etc/ipc_cert.pem; \ + else \ + echo "[*] IPC cert already present at /opt/datadog-agent/etc/ipc_cert.pem"; \ + fi + @echo "[*] Ensuring cert/auth_token readable by current user..." + @if ! cat /opt/datadog-agent/etc/ipc_cert.pem >/dev/null 2>&1 || ! cat /opt/datadog-agent/etc/auth_token >/dev/null 2>&1; then \ + sudo chown $$(whoami) /opt/datadog-agent/etc/ipc_cert.pem /opt/datadog-agent/etc/auth_token; \ + else \ + echo "[*] Files already readable by $$(whoami)."; \ + fi + @echo "[*] macOS test environment ready." 
+ +.PHONY: test-integration-macos-ci +test-integration-macos-ci: build-panoramic build-adp-native provision-macos-test-env test-integration-macos-run ## CI entry point: builds binaries, ensures Agent + cert are provisioned, then runs the native_macos integration tests + .PHONY: ensure-rust-miri ensure-rust-miri: ifeq ($(shell command -v rustup >/dev/null || echo not-found), not-found) From 44fd6e60135dc33cdd68b7d06eccd381cd71cf0e Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Tue, 26 May 2026 15:23:12 -0400 Subject: [PATCH 17/56] chore(docs): fix vale lint errors Replaces 'e.g.' with 'for example' in doc comments (Google.Latin), swaps an em dash for a semicolon in a runtime-field docstring (Google.EmDash), and adds 'launchd' to the technical vocabulary. Also drops docs/superpowers/plans/2026-05-21-macos-native-integration-tests.md; the plan was a one-time implementation artifact and the PR description on #1735 supersedes it. --- .../config/vocabularies/technical/accept.txt | 1 + bin/correctness/airlock/src/native.rs | 8 +- .../panoramic/src/assertions/mod.rs | 2 +- bin/correctness/panoramic/src/cli.rs | 2 +- bin/correctness/panoramic/src/config.rs | 2 +- ...26-05-21-macos-native-integration-tests.md | 941 ------------------ 6 files changed, 8 insertions(+), 948 deletions(-) delete mode 100644 docs/superpowers/plans/2026-05-21-macos-native-integration-tests.md diff --git a/.vale/styles/config/vocabularies/technical/accept.txt b/.vale/styles/config/vocabularies/technical/accept.txt index a4a23fc4af1..ad9315178e9 100644 --- a/.vale/styles/config/vocabularies/technical/accept.txt +++ b/.vale/styles/config/vocabularies/technical/accept.txt @@ -228,3 +228,4 @@ libtest mpmc dhat profiler +launchd diff --git a/bin/correctness/airlock/src/native.rs b/bin/correctness/airlock/src/native.rs index 275c0da3a6f..4d999ee499b 100644 --- a/bin/correctness/airlock/src/native.rs +++ b/bin/correctness/airlock/src/native.rs @@ -29,7 +29,7 @@ use tracing::{debug, warn}; /// Shared 
cell that receives the exit code of a spawned [`NativeProcess`]. /// /// The cell is populated by the background exit watcher when the child exits on its own, or by -/// [`NativeProcess::cleanup`] when the test tears down. Consumers (e.g., the +/// [`NativeProcess::cleanup`] when the test tears down. Consumers (for example, the /// `process_exits_with` assertion in panoramic) read the cell after the exit token fires. /// /// The inner `Option` is `None` if the process was terminated by signal rather than exiting @@ -52,7 +52,7 @@ pub struct NativeProcessConfig { /// If `true`, the spawned process is placed into a new process group with itself as the /// group leader, and [`cleanup`][NativeProcess::cleanup] signals the entire group instead of /// only the immediate child. This is essential when the spawned binary forks helpers that - /// outlive their parent (e.g., the Datadog Core Agent spawns `trace-agent` and + /// outlive their parent (for example, the Datadog Core Agent spawns `trace-agent` and /// `process-agent` which orphan onto launchd if only the parent is killed). pub use_process_group: bool, } @@ -240,7 +240,7 @@ impl NativeProcess { /// cancels the exit token. pub async fn cleanup(mut self) { // If we asked for a process group, first send SIGTERM to the entire group. This gives - // descendants (e.g., trace-agent, process-agent spawned by the Datadog Core Agent) a + // descendants (for example, trace-agent, process-agent spawned by the Datadog Core Agent) a // chance to shut down cleanly before we hard-kill them. After a brief grace period we // send SIGKILL to the group to guarantee no orphans remain. #[cfg(unix)] @@ -274,7 +274,7 @@ impl NativeProcess { if let Some(handle) = self.exit_task.take() { let _ = handle.await; } - // Defensive: make sure the token is fired even if the watcher never set it (e.g., on a + // Defensive: make sure the token is fired even if the watcher never set it (for example, on a // failed wait). 
self.exit_token.cancel(); for handle in self.log_tasks.drain(..) { diff --git a/bin/correctness/panoramic/src/assertions/mod.rs b/bin/correctness/panoramic/src/assertions/mod.rs index 5a99f53930c..afb582da0aa 100644 --- a/bin/correctness/panoramic/src/assertions/mod.rs +++ b/bin/correctness/panoramic/src/assertions/mod.rs @@ -105,7 +105,7 @@ pub struct AssertionContext { /// Name of the container being tested. pub container_name: String, /// Whether the test is running natively (no container). When `true`, assertions that would - /// otherwise reach into a container (e.g., reading a file via `docker exec`) should operate + /// otherwise reach into a container (for example, reading a file via `docker exec`) should operate /// against the host filesystem / local process instead. pub is_native: bool, /// Exit code of the native target process, populated once it exits. `None` on the docker diff --git a/bin/correctness/panoramic/src/cli.rs b/bin/correctness/panoramic/src/cli.rs index 2c8cd270bad..d4689d1fbd2 100644 --- a/bin/correctness/panoramic/src/cli.rs +++ b/bin/correctness/panoramic/src/cli.rs @@ -30,7 +30,7 @@ pub struct RunCommand { #[argh(option, short = 't')] pub tests: Option, - /// run only tests with the given runtime (e.g., `docker`, `native_macos`, `kubernetes_in_docker`). + /// run only tests with the given runtime (for example, `docker`, `native_macos`, `kubernetes_in_docker`). /// Can be combined with `-t` to further restrict by name. #[argh(option)] pub runtime: Option, diff --git a/bin/correctness/panoramic/src/config.rs b/bin/correctness/panoramic/src/config.rs index 21adb0287e5..b892dfbca35 100644 --- a/bin/correctness/panoramic/src/config.rs +++ b/bin/correctness/panoramic/src/config.rs @@ -129,7 +129,7 @@ pub struct IntegrationConfig { /// way they would in production. When `false` (the default), only ADP is spawned (standalone /// mode). 
/// - /// On the `docker` runtime this field is informational — the converged image always runs + /// On the `docker` runtime this field is informational; the converged image always runs /// both processes via s6. #[serde(default)] pub requires_core_agent: bool, diff --git a/docs/superpowers/plans/2026-05-21-macos-native-integration-tests.md b/docs/superpowers/plans/2026-05-21-macos-native-integration-tests.md deleted file mode 100644 index 85a35d80070..00000000000 --- a/docs/superpowers/plans/2026-05-21-macos-native-integration-tests.md +++ /dev/null @@ -1,941 +0,0 @@ -# macOS Native Integration Tests Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Enable a single integration test (`basic-startup`) to run as a native macOS process via panoramic, in a way that works both on the existing bare-metal `macos:sonoma-arm64` CI runner and locally on a developer's macOS machine. - -**Architecture:** Reuse the existing `Test::runtime()` mechanism (currently `"docker"` or `"kubernetes_in_docker"`) by adding a new `"native_macos"` runtime. At test discovery time, expand each `IntegrationConfig` with multiple declared runtimes into one `Test` instance per runtime. A new `NativeRunner` in panoramic handles the `native_macos` case by spawning the ADP binary directly via `tokio::process::Command` and using the existing assertion framework. The existing Docker path is untouched. - -**Tech Stack:** Rust, Tokio, the existing `panoramic` test runner and `airlock` driver crate. - -**Scope:** Only `basic-startup` on macOS. Only the standalone ADP path (no Core Agent, no IPC). No Tart wrapper in this PR — `make` target works directly on a macOS host; a follow-up PR can add Tart for non-macOS local dev. 
No CI job in this PR — that's a follow-up that depends on a separate `build-adp-macos-binary` job. This PR proves the end-to-end design works locally on macOS. - ---- - -## File Structure - -**New files:** -- `bin/correctness/airlock/src/native.rs` — `NativeProcess` abstraction (spawn, log capture, exit watch, cleanup) -- `bin/correctness/panoramic/src/native_runner.rs` — `NativeIntegrationRunner` analogous to the existing `IntegrationRunner` but for the native-process path -- `docs/superpowers/plans/2026-05-21-macos-native-integration-tests.md` — this plan - -**Modified files:** -- `bin/correctness/airlock/src/lib.rs` — export `native` module -- `bin/correctness/panoramic/src/config.rs` — add `runtimes: Vec` field to `IntegrationConfig`; dispatch `run()` based on the per-instance runtime -- `bin/correctness/panoramic/src/test.rs` — at discovery, expand multi-runtime integration configs into one `Test` per runtime -- `bin/correctness/panoramic/src/main.rs` — wire the new module -- `bin/correctness/panoramic/src/assertions/mod.rs` — `AssertionContext` already has `container_name` and `port_mappings`; for native, `container_name` doubles as the process display name and `port_mappings` is identity (no remapping needed). No code change expected here, but verify. 
-- `test/integration/cases/basic-startup/config.yaml` — add `runtimes: [docker, native_macos]` -- `Makefile` — add `test-integration-macos` target - -**Files NOT touched in this PR (deferred):** -- `bin/correctness/panoramic/src/runner.rs` (the existing `IntegrationRunner`) — leave alone to keep the Linux path zero-risk -- `bin/correctness/panoramic/src/assertions/file_contains.rs` — only one test in the corpus uses it, not basic-startup -- `tooling/generate-correctness-pipeline.sh` — CI pipeline gen, deferred to a follow-up -- `.gitlab/` files — CI integration deferred - ---- - -## Conventions - -- The ADP binary location is discovered via the `ADP_BINARY_PATH` env var, falling back to `target/release/agent-data-plane` relative to the panoramic working directory. -- Per-test process output (stdout + stderr) is captured into the existing `LogBuffer` exactly the same way the Docker path does, so the existing assertions work unchanged. -- The native runner respects the existing `TestContext` cancel token and writes per-test logs into the existing `log_dir` structure. - ---- - -## Task 1: Add the `NativeProcess` abstraction in `airlock` - -**Files:** -- Create: `bin/correctness/airlock/src/native.rs` -- Modify: `bin/correctness/airlock/src/lib.rs:1-3` -- Test: covered by integration end-to-end (no unit test for this initial slice; the structure is mostly straight `tokio::process` wiring that's easier to exercise through panoramic) - -- [ ] **Step 1: Create the `native.rs` module skeleton** - -Create `bin/correctness/airlock/src/native.rs`: - -```rust -//! Native process driver for non-containerized integration tests. -//! -//! This module mirrors the surface of the Docker [`Driver`][crate::driver::Driver] but spawns a -//! local binary instead of a container. It exists so that integration tests can run on macOS -//! hosts where ADP is exercised as a real macOS process rather than inside a Linux container. -//! -//! 
Only the small subset of the Docker driver surface needed by the panoramic -//! `NativeIntegrationRunner` is implemented: spawn, log capture, exit watching, and cleanup. - -use std::{ - collections::HashMap, - path::PathBuf, - process::Stdio, - sync::Arc, - time::Duration, -}; - -use saluki_error::{generic_error, ErrorContext as _, GenericError}; -use tokio::{ - io::{AsyncBufReadExt as _, BufReader}, - process::{Child, Command}, - sync::Mutex, - task::JoinHandle, -}; -use tokio_util::sync::CancellationToken; -use tracing::{debug, warn}; - -/// Configuration for a native process to spawn. -#[derive(Clone)] -pub struct NativeProcessConfig { - /// Display name used for logs and reporting. - pub name: String, - /// Absolute path to the binary to execute. - pub binary_path: PathBuf, - /// Arguments passed to the binary. - pub args: Vec, - /// Environment variables to set for the process. - pub env: HashMap, - /// Working directory for the process. If `None`, inherits panoramic's working directory. - pub working_dir: Option, -} - -impl NativeProcessConfig { - /// Creates a new configuration with the given display name and binary path. - pub fn new(name: impl Into, binary_path: impl Into) -> Self { - Self { - name: name.into(), - binary_path: binary_path.into(), - args: Vec::new(), - env: HashMap::new(), - working_dir: None, - } - } - - /// Sets the arguments for the process. - pub fn with_args(mut self, args: Vec) -> Self { - self.args = args; - self - } - - /// Sets an environment variable for the process. - pub fn with_env(mut self, key: impl Into, value: impl Into) -> Self { - self.env.insert(key.into(), value.into()); - self - } - - /// Sets all environment variables for the process at once. - pub fn with_env_map(mut self, env: HashMap) -> Self { - self.env = env; - self - } - - /// Sets the working directory for the process. 
- pub fn with_working_dir(mut self, dir: PathBuf) -> Self { - self.working_dir = Some(dir); - self - } -} - -/// A spawned native process and its supporting tasks. -/// -/// `NativeProcess` owns the child process plus background tasks that pump stdout/stderr lines -/// into a shared buffer and observe the child's exit. Dropping or explicitly calling -/// [`cleanup`][Self::cleanup] kills the child and joins the background tasks. -pub struct NativeProcess { - name: String, - child: Option, - exit_token: CancellationToken, - log_tasks: Vec>, - exit_task: Option>, -} - -impl NativeProcess { - /// Spawns the process described by `config`. The provided `log_sink` receives each line of - /// captured stdout/stderr; the provided `exit_token` is cancelled when the process exits. - pub async fn spawn( - config: NativeProcessConfig, - log_sink: Arc>, - exit_token: CancellationToken, - ) -> Result { - if !config.binary_path.exists() { - return Err(generic_error!( - "Binary not found at expected path: {}", - config.binary_path.display() - )); - } - - let mut cmd = Command::new(&config.binary_path); - cmd.args(&config.args) - .envs(&config.env) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()) - .kill_on_drop(true); - if let Some(ref wd) = config.working_dir { - cmd.current_dir(wd); - } - - let mut child = cmd - .spawn() - .with_error_context(|| format!("Failed to spawn '{}'.", config.binary_path.display()))?; - - let stdout = child - .stdout - .take() - .ok_or_else(|| generic_error!("Failed to capture stdout."))?; - let stderr = child - .stderr - .take() - .ok_or_else(|| generic_error!("Failed to capture stderr."))?; - - let stdout_task = spawn_log_pump(stdout, log_sink.clone(), false); - let stderr_task = spawn_log_pump(stderr, log_sink, true); - - let exit_token_for_watcher = exit_token.clone(); - let name_for_watcher = config.name.clone(); - let exit_task = tokio::spawn(async move { - // Wait for the child to exit; we cannot move it out of the struct here, so the - // exit 
watching is done in `cleanup`. This task is only used as a placeholder if - // we add SIGCHLD-style observation later. For now, the exit token is fired in - // `cleanup` after `child.wait().await`. - debug!(name = %name_for_watcher, "Native process exit watcher placeholder."); - exit_token_for_watcher.cancelled().await; - }); - - Ok(Self { - name: config.name, - child: Some(child), - exit_token, - log_tasks: vec![stdout_task, stderr_task], - exit_task: Some(exit_task), - }) - } - - /// Returns the display name of the process. - pub fn name(&self) -> &str { - &self.name - } - - /// Returns a handle to the cancellation token that fires when the process exits. - pub fn exit_token(&self) -> CancellationToken { - self.exit_token.clone() - } - - /// Waits for the process to exit. If `timeout` elapses first, the process is killed. - /// - /// Returns the exit code if available, or `None` if the process was killed by signal. - pub async fn wait_with_timeout(&mut self, timeout: Duration) -> Result, GenericError> { - let child = self - .child - .as_mut() - .ok_or_else(|| generic_error!("Process already cleaned up."))?; - match tokio::time::timeout(timeout, child.wait()).await { - Ok(Ok(status)) => Ok(status.code()), - Ok(Err(e)) => Err(generic_error!("Failed to wait for process: {}", e)), - Err(_) => { - let _ = child.kill().await; - let _ = child.wait().await; - Err(generic_error!("Process did not exit within timeout.")) - } - } - } - - /// Kills the child, joins background tasks, and cancels the exit token. - pub async fn cleanup(mut self) { - if let Some(mut child) = self.child.take() { - let _ = child.kill().await; - let _ = child.wait().await; - } - self.exit_token.cancel(); - if let Some(handle) = self.exit_task.take() { - let _ = handle.await; - } - for handle in self.log_tasks.drain(..) 
{ - let _ = handle.await; - } - } -} - -impl Drop for NativeProcess { - fn drop(&mut self) { - if self.child.is_some() { - warn!( - name = %self.name, - "NativeProcess dropped without explicit cleanup; child will be killed via kill_on_drop." - ); - } - } -} - -/// A trait-object-friendly sink for log lines captured from a native process. -/// -/// This is intentionally minimal so panoramic's existing `LogBuffer` can wrap one of these -/// without depending on `airlock`. -pub trait LogSink: Send + Sync { - fn push_line(&mut self, line: String, is_stderr: bool); -} - -fn spawn_log_pump( - reader: R, - sink: Arc>, - is_stderr: bool, -) -> JoinHandle<()> -where - R: tokio::io::AsyncRead + Unpin + Send + 'static, -{ - let mut lines = BufReader::new(reader).lines(); - tokio::spawn(async move { - loop { - match lines.next_line().await { - Ok(Some(line)) => { - let mut sink = sink.lock().await; - sink.push_line(line, is_stderr); - } - Ok(None) => break, - Err(e) => { - debug!(error = %e, "Log pump read error; stopping."); - break; - } - } - } - }); -} -``` - -- [ ] **Step 2: Export the module from `airlock`** - -Modify `bin/correctness/airlock/src/lib.rs`: - -```rust -pub mod config; -pub mod docker; -pub mod driver; -pub mod native; -``` - -- [ ] **Step 3: Compile and verify** - -Run: `cd bin/correctness && cargo check -p airlock` -Expected: clean compile, no errors. - -- [ ] **Step 4: Commit** - -```bash -git add bin/correctness/airlock/src/native.rs bin/correctness/airlock/src/lib.rs -git commit -m "feat(airlock): add native process driver for non-containerized tests" -``` - ---- - -## Task 2: Bridge the existing `LogBuffer` to the `LogSink` trait - -**Files:** -- Modify: `bin/correctness/panoramic/src/assertions/mod.rs` (around `LogBuffer` definition) - -The Docker path populates `LogBuffer` via bollard `LogOutput`. The native path needs to populate the same `LogBuffer` via the `LogSink` trait so the existing assertions work unchanged. 
- -- [ ] **Step 1: Inspect the current `LogBuffer`** - -Run: `grep -n "pub struct LogBuffer\|impl LogBuffer\|push" bin/correctness/panoramic/src/assertions/mod.rs` -Note the existing API so the trait implementation matches. - -- [ ] **Step 2: Implement `LogSink` for `LogBuffer`** - -Add to `bin/correctness/panoramic/src/assertions/mod.rs`, after the existing `impl LogBuffer { ... }` block: - -```rust -impl airlock::native::LogSink for LogBuffer { - fn push_line(&mut self, line: String, is_stderr: bool) { - // Match the existing Docker log capture format: each entry is the raw line. The - // is_stderr flag is currently informational only. - let _ = is_stderr; - self.lines.push(line); - } -} -``` - -(Adjust field name `lines` if the struct uses something different — verify in Step 1.) - -- [ ] **Step 3: Verify it compiles** - -Run: `cd bin/correctness && cargo check -p panoramic` -Expected: clean compile. - -- [ ] **Step 4: Commit** - -```bash -git add bin/correctness/panoramic/src/assertions/mod.rs -git commit -m "feat(panoramic): implement LogSink for LogBuffer" -``` - ---- - -## Task 3: Add `runtimes` field to `IntegrationConfig` and expand at discovery - -**Files:** -- Modify: `bin/correctness/panoramic/src/config.rs` (`IntegrationConfig` struct, deserialization, `Test` impl) -- Modify: `bin/correctness/panoramic/src/test.rs` (`try_load_test` for `integration` type) - -- [ ] **Step 1: Add the `runtimes` field** - -Modify the `IntegrationConfig` struct in `bin/correctness/panoramic/src/config.rs`: - -```rust -#[derive(Clone, Debug, Deserialize)] -pub struct IntegrationConfig { - pub name: String, - - #[serde(default)] - pub description: Option, - - pub timeout: HumanDuration, - - pub container: ContainerConfig, - - pub assertions: Vec, - - /// Runtimes under which this test runs. - /// - /// Each value must be either `"docker"` (the default) or `"native_macos"`. 
When multiple - /// runtimes are declared, the test discovery layer expands the config into one independent - /// test case per runtime, named `{name}/{runtime}`. - #[serde(default = "default_runtimes")] - pub runtimes: Vec, - - /// Resolved runtime for this specific test instance after discovery-time expansion. - /// - /// At parse time, this is always empty. The discovery layer sets it when expanding a - /// multi-runtime config into per-runtime instances. - #[serde(skip)] - pub resolved_runtime: String, - - #[serde(skip)] - pub base_path: PathBuf, -} - -fn default_runtimes() -> Vec { - vec!["docker".to_string()] -} -``` - -- [ ] **Step 2: Surface the per-instance runtime via the `Test` trait impl** - -In the same file, update the `Test` impl for `IntegrationConfig`: - -```rust -#[async_trait] -impl Test for IntegrationConfig { - fn name(&self) -> String { - if self.resolved_runtime.is_empty() || self.runtimes.len() <= 1 { - self.name.clone() - } else { - format!("{}/{}", self.name, self.resolved_runtime) - } - } - - fn suite(&self) -> TestSuite { - TestSuite::Integration - } - - fn description(&self) -> Option { - self.description.clone() - } - - fn timeout(&self) -> Duration { - self.timeout.0 - } - - fn images(&self) -> BTreeMap<&str, String> { - let mut m = BTreeMap::new(); - // The native_macos runtime doesn't require any container image. - if self.resolved_runtime != "native_macos" { - m.insert("container", self.container.image.clone()); - } - m - } - - fn runtime(&self) -> String { - if self.resolved_runtime.is_empty() { - "docker".to_string() - } else { - self.resolved_runtime.clone() - } - } - - async fn run(&self, tctx: TestContext) -> TestResult { - match self.resolved_runtime.as_str() { - "native_macos" => { - let mut runner = crate::native_runner::NativeIntegrationRunner::new(self.clone(), tctx); - runner.run().await - } - // Default to the existing Docker path for "docker" or unset. 
- _ => { - let mut runner = crate::runner::IntegrationRunner::new(self.clone(), tctx); - runner.run().await - } - } - } -} -``` - -- [ ] **Step 3: Expand multi-runtime configs at discovery** - -Modify `try_load_test` in `bin/correctness/panoramic/src/test.rs` for the `"integration"` arm: - -```rust -"integration" => { - let config = IntegrationConfig::from_yaml(config_path)?; - if config.runtimes.is_empty() { - return Err(generic_error!("integration test '{}' has empty runtimes list", config.name)); - } - let mut tests: Vec> = Vec::new(); - for runtime in &config.runtimes { - if runtime != "docker" && runtime != "native_macos" { - return Err(generic_error!( - "integration test '{}' declares unknown runtime '{}' (expected 'docker' or 'native_macos')", - config.name, - runtime - )); - } - let mut variant = config.clone(); - variant.resolved_runtime = runtime.clone(); - tests.push(Box::new(variant)); - } - Ok(tests) -} -``` - -- [ ] **Step 4: Verify compilation (panoramic will fail until Task 4 lands)** - -Run: `cd bin/correctness && cargo check -p panoramic 2>&1 | tail -20` -Expected: FAIL on missing `crate::native_runner` module — that's the next task. - -- [ ] **Step 5: Commit** - -```bash -git add bin/correctness/panoramic/src/config.rs bin/correctness/panoramic/src/test.rs -git commit -m "feat(panoramic): add runtimes field to integration test config" -``` - ---- - -## Task 4: Add the `NativeIntegrationRunner` - -**Files:** -- Create: `bin/correctness/panoramic/src/native_runner.rs` -- Modify: `bin/correctness/panoramic/src/main.rs` (declare the module) - -- [ ] **Step 1: Create the runner module** - -Create `bin/correctness/panoramic/src/native_runner.rs`: - -```rust -//! Native-process integration test runner. -//! -//! This runner is the parallel of [`crate::runner::IntegrationRunner`] but for tests declared -//! with `runtime: native_macos`. Instead of building a Docker container, it spawns a binary -//! 
directly via [`airlock::native::NativeProcess`] and feeds its stdout/stderr into the same -//! [`LogBuffer`][crate::assertions::LogBuffer] used by the Docker path so the assertions work -//! unchanged. -//! -//! Scope (initial): only ADP-standalone tests. The binary is `agent-data-plane`, located via -//! the `ADP_BINARY_PATH` env var (falling back to `target/release/agent-data-plane`). - -use std::{ - collections::HashMap, - path::PathBuf, - sync::Arc, - time::{Duration, Instant}, -}; - -use airlock::native::{NativeProcess, NativeProcessConfig}; -use saluki_error::{generic_error, ErrorContext as _, GenericError}; -use tokio::sync::{Mutex, RwLock}; -use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, warn}; - -use crate::{ - assertions::{create_assertion, AssertionContext, AssertionResult, LogBuffer}, - config::{AssertionStep, IntegrationConfig}, - reporter::{PhaseTiming, TestResult}, - test::TestContext, -}; - -const ADP_BINARY_ENV_VAR: &str = "ADP_BINARY_PATH"; -const DEFAULT_ADP_BINARY_PATH: &str = "target/release/agent-data-plane"; - -/// Runner for a single native-process integration test case. -pub(crate) struct NativeIntegrationRunner { - test_case: IntegrationConfig, - tctx: TestContext, - log_buffer: Arc>, -} - -impl NativeIntegrationRunner { - /// Creates a new runner for the given test case. - pub(crate) fn new(test_case: IntegrationConfig, tctx: TestContext) -> Self { - Self { - test_case, - tctx, - log_buffer: Arc::new(RwLock::new(LogBuffer::default())), - } - } - - /// Runs the test case and returns the result. 
- pub(crate) async fn run(&mut self) -> TestResult { - let started = Instant::now(); - let test_name = self.test_case.name(); - let mut phase_timings = Vec::new(); - - info!(test = %test_name, "Starting native integration test case."); - - // Phase: resolve binary path - let binary_path = match resolve_adp_binary_path() { - Ok(p) => p, - Err(e) => { - return make_error_result(test_name, started, "resolve_binary", e); - } - }; - debug!(test = %test_name, binary = %binary_path.display(), "Resolved ADP binary path."); - - // Phase: spawn process - let spawn_start = Instant::now(); - let exit_token = CancellationToken::new(); - - // Bridge the LogBuffer behind a Mutex. We have to take ownership of the - // buffer via an Arc> compatible shape; the simplest path is to construct a - // separate sink struct that pushes into the shared LogBuffer. - let sink_buf = self.log_buffer.clone(); - let log_sink: Arc> = - Arc::new(Mutex::new(NativeLogSink { buf: sink_buf })); - - let process_config = NativeProcessConfig::new(self.test_case.name.clone(), binary_path) - .with_args(vec!["run".to_string()]) - .with_env_map(self.test_case.container.env.clone()); - - let process = match NativeProcess::spawn(process_config, log_sink, exit_token.clone()).await { - Ok(p) => p, - Err(e) => { - phase_timings.push(PhaseTiming { - phase: "spawn".to_string(), - duration: spawn_start.elapsed(), - }); - return make_error_result(test_name, started, "spawn", e); - } - }; - phase_timings.push(PhaseTiming { - phase: "spawn".to_string(), - duration: spawn_start.elapsed(), - }); - - info!(test = %test_name, "Native process started."); - - // Phase: run assertions - let assertion_start = Instant::now(); - let assertion_results = self - .run_assertions(process.name().to_string(), exit_token.clone()) - .await; - phase_timings.push(PhaseTiming { - phase: "assertions".to_string(), - duration: assertion_start.elapsed(), - }); - - // Phase: cleanup - let cleanup_start = Instant::now(); - process.cleanup().await; 
- phase_timings.push(PhaseTiming { - phase: "cleanup".to_string(), - duration: cleanup_start.elapsed(), - }); - - let passed = assertion_results.iter().all(|r| r.passed); - TestResult { - name: test_name, - passed, - duration: started.elapsed(), - assertion_results: assertion_results.clone(), - error: None, - phase_timings, - assertion_details: assertion_results, - } - } - - async fn run_assertions( - &self, - process_display_name: String, - exit_token: CancellationToken, - ) -> Vec { - let mut results = Vec::new(); - let cancel_token = self.tctx.test_cancel_token(); - - for step in &self.test_case.assertions { - match step { - AssertionStep::Single(cfg) => { - let assertion = create_assertion(cfg.clone(), &self.test_case); - let ctx = AssertionContext { - log_buffer: self.log_buffer.clone(), - container_exit_token: exit_token.clone(), - cancel_token: cancel_token.clone(), - container_name: process_display_name.clone(), - port_mappings: HashMap::new(), - }; - results.push(assertion.check(&ctx).await); - } - AssertionStep::Parallel { parallel } => { - let futures: Vec<_> = parallel - .iter() - .map(|cfg| { - let assertion = create_assertion(cfg.clone(), &self.test_case); - let ctx = AssertionContext { - log_buffer: self.log_buffer.clone(), - container_exit_token: exit_token.clone(), - cancel_token: cancel_token.clone(), - container_name: process_display_name.clone(), - port_mappings: HashMap::new(), - }; - async move { assertion.check(&ctx).await } - }) - .collect(); - let parallel_results = futures::future::join_all(futures).await; - results.extend(parallel_results); - } - } - } - - results - } -} - -fn resolve_adp_binary_path() -> Result { - let explicit = std::env::var(ADP_BINARY_ENV_VAR).ok(); - let path = match explicit { - Some(p) => PathBuf::from(p), - None => PathBuf::from(DEFAULT_ADP_BINARY_PATH), - }; - - let canonical = path.canonicalize().with_error_context(|| { - format!( - "ADP binary not found at '{}'. 
Set {} or build via `cargo build --release --bin agent-data-plane`.", - path.display(), - ADP_BINARY_ENV_VAR - ) - })?; - Ok(canonical) -} - -fn make_error_result(name: String, started: Instant, phase: &str, e: GenericError) -> TestResult { - error!(test = %name, error = %e, phase, "Native integration test setup failed."); - TestResult { - name, - passed: false, - duration: started.elapsed(), - assertion_results: vec![], - error: Some(format!("Failed in phase '{}': {}", phase, e)), - phase_timings: vec![], - assertion_details: vec![], - } -} - -/// Bridge from `airlock::native::LogSink` to the panoramic `LogBuffer`. -struct NativeLogSink { - buf: Arc>, -} - -impl airlock::native::LogSink for NativeLogSink { - fn push_line(&mut self, line: String, is_stderr: bool) { - // Try a non-blocking write. If the lock is contended, do a blocking write — assertions - // hold the read lock briefly so contention is rare. - if let Ok(mut buf) = self.buf.try_write() { - buf.push_line(line, is_stderr); - } else { - // Fall back: spawn a task to do the write so we don't block this caller. We're - // already inside a tokio task here (the log pump), so blocking would stall it. - let buf = self.buf.clone(); - tokio::spawn(async move { - buf.write().await.push_line(line, is_stderr); - }); - } - } -} -``` - -- [ ] **Step 2: Declare the module in `main.rs`** - -Modify `bin/correctness/panoramic/src/main.rs`: - -Find the existing `mod runner;` line and add `mod native_runner;` after it. - -- [ ] **Step 3: Verify compilation** - -Run: `cd bin/correctness && cargo check -p panoramic 2>&1 | tail -30` -Expected: clean compile. - -If compile errors mention `AssertionContext` field names, the field names need to match the existing struct definition — check `bin/correctness/panoramic/src/assertions/mod.rs` for the exact shape and adjust. 
- -- [ ] **Step 4: Commit** - -```bash -git add bin/correctness/panoramic/src/native_runner.rs bin/correctness/panoramic/src/main.rs -git commit -m "feat(panoramic): add NativeIntegrationRunner for native_macos runtime" -``` - ---- - -## Task 5: Wire up `basic-startup` for the new runtime - -**Files:** -- Modify: `test/integration/cases/basic-startup/config.yaml` - -- [ ] **Step 1: Add the runtime opt-in** - -Modify `test/integration/cases/basic-startup/config.yaml` so the top-level keys read: - -```yaml -type: integration -name: "basic-startup" -description: "Verifies ADP starts successfully and remains stable" -timeout: 90s -runtimes: [docker, native_macos] - -container: - image: "saluki-images/datadog-agent:testing-devel" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test" - DD_DATA_PLANE_ENABLED: "true" - DD_DATA_PLANE_STANDALONE_MODE: "true" - -assertions: - - type: log_contains - pattern: "Agent Data Plane starting" - timeout: 5s - - parallel: - - type: process_stable_for - duration: 10s - - type: log_not_contains - pattern: "panic|PANIC" - regex: true - during: 10s -``` - -- [ ] **Step 2: Discovery-only sanity check** - -Run: `cargo run --release --bin panoramic -- list -d test/integration/cases` -Expected output includes: -``` -basic-startup/docker -basic-startup/native_macos -``` -…and all other tests show up exactly once with their original names. - -- [ ] **Step 3: Commit** - -```bash -git add test/integration/cases/basic-startup/config.yaml -git commit -m "test(integration): enable basic-startup on native_macos runtime" -``` - ---- - -## Task 6: Add the `test-integration-macos` make target - -**Files:** -- Modify: `Makefile` - -- [ ] **Step 1: Inspect existing integration test targets** - -Run: `grep -n "test-integration\|test-integration-quick\|build-panoramic" Makefile` -Note the existing pattern. 
- -- [ ] **Step 2: Add the new targets** - -Append to `Makefile`, near the existing `test-integration` rule: - -```makefile -.PHONY: build-adp-macos -build-adp-macos: ## Builds the ADP binary natively for macOS (release profile) - @echo "[*] Building agent-data-plane (release, native macOS target)..." - @cargo build --release --bin agent-data-plane - -.PHONY: test-integration-macos -test-integration-macos: build-panoramic build-adp-macos -test-integration-macos: ## Runs macOS native integration tests (no Docker) - @echo "[*] Running macOS native integration tests..." - @ADP_BINARY_PATH=$(shell pwd)/target/release/agent-data-plane \ - target/release/panoramic run -d $(shell pwd)/test/integration/cases \ - -t basic-startup/native_macos --no-tui \ - $(if $(PANORAMIC_LOG_DIR),-l $(PANORAMIC_LOG_DIR)) -``` - -- [ ] **Step 3: Run the new target end-to-end** - -Run: `make test-integration-macos` - -Expected output: -- Panoramic launches one test (`basic-startup/native_macos`). -- The `log_contains` assertion for `"Agent Data Plane starting"` passes within 5s. -- The `process_stable_for` and `log_not_contains` assertions complete after ~10s. -- Test result: PASS. - -If the test fails, check: -- The `agent-data-plane` binary exists at `target/release/agent-data-plane`. -- No other ADP process is already bound to default ports (`lsof -i :8125 -i :8135`). -- The log buffer is actually receiving lines (look at `PANORAMIC_LOG_DIR` output if set). - -- [ ] **Step 4: Commit** - -```bash -git add Makefile -git commit -m "build: add test-integration-macos make target" -``` - ---- - -## Task 7: Verify the Docker path still works for the same test - -This is the regression check that ensures we didn't break anything on Linux. 
- -- [ ] **Step 1: Confirm the docker variant still shows up in discovery** - -Already covered by Task 5 Step 2, but re-confirm: - -Run: `cargo run --release --bin panoramic -- list -d test/integration/cases | grep basic-startup` -Expected: -``` -basic-startup/docker -basic-startup/native_macos -``` - -- [ ] **Step 2: Run the docker variant locally if Docker is available** - -Skip if Docker isn't available locally. Otherwise: - -Run: `target/release/panoramic run -d test/integration/cases -t basic-startup/docker --no-tui` -Expected: existing Docker path runs unchanged and passes. - -- [ ] **Step 3: Run unit tests for affected crates** - -Run: `cargo test -p airlock -p panoramic 2>&1 | tail -20` -Expected: all tests pass, no regressions. - -- [ ] **Step 4: Run formatter and clippy** - -Run: `make fmt && make check-clippy 2>&1 | tail -30` -Expected: clean. - ---- - -## Self-review checklist - -- **Spec coverage:** Single test on macOS running natively via panoramic — Task 5 + Task 6. Docker path preserved — Task 7. CI and Tart wrapper are explicitly deferred and not in spec for this PR. -- **No placeholders:** All code blocks are concrete. -- **Type consistency:** `NativeProcessConfig` defined in Task 1 is used in Task 4; field names match. `LogSink` defined in Task 1 is implemented in Task 4. `AssertionContext` field names match the existing struct shape (verified in Task 4 Step 3 — adjust if compile fails). -- **One risk to flag in execution:** the `AssertionContext` struct definition lives in `assertions/mod.rs` and may have a different field shape than what's written in Task 4's code. The first thing to verify when implementing Task 4 is the exact `AssertionContext` definition; adapt the `run_assertions` call sites in `native_runner.rs` to match. - ---- - -## What this PR explicitly does NOT do - -- No CI job — that requires building ADP as an artifact and a new `.gitlab/test.yml` entry; deferred. 
-- No Tart wrapper script — deferred to a follow-up so non-macOS developers can run macOS tests locally. -- No conversion of the other 16 standalone integration tests — only `basic-startup` is wired up. -- No converged (Agent + ADP) tests — they need Agent install plumbing and IPC, which is its own scope. -- No refactor of the existing Docker `IntegrationRunner`. The two runners remain parallel for now; merging via a shared trait is a follow-up. From 20b3498709d97047a70c0f287d408f73218b4b32 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Wed, 27 May 2026 11:17:32 -0400 Subject: [PATCH 18/56] refactor(panoramic): drop dead native path on process_exits_with; small cleanups in native_runner Self-review cleanups, simplicity-focused: 1) ProcessExitsWithAssertion's native code path was unused. Zero tests reference 'process_exits_with' in the test corpus; the runtime-aware AdpExitsWithAssertion (added in an earlier commit) is what's used for the native path. Drop the native branch and restore process_exits_with to its original docker-only implementation. AssertionContext::native_exit_code is still used by adp_exits_with, so the field stays. 2) Inline the throwaway 'agent_exit_token' at the spawn call site. The Core Agent's exit token is never observed by assertions or runner logic; the named variable suggested otherwise. 3) Compute the per-test auth_token_path once instead of recomputing inside the requires_core_agent branch on the ADP env side. No behavior change; suite remains 24/24 passing inside the Tart VM in ~3 minutes. 
--- .../panoramic/src/assertions/process_exits.rs | 148 ++++++------------ .../panoramic/src/native_runner.rs | 22 ++- 2 files changed, 56 insertions(+), 114 deletions(-) diff --git a/bin/correctness/panoramic/src/assertions/process_exits.rs b/bin/correctness/panoramic/src/assertions/process_exits.rs index a4e979bf5b1..067d598cff1 100644 --- a/bin/correctness/panoramic/src/assertions/process_exits.rs +++ b/bin/correctness/panoramic/src/assertions/process_exits.rs @@ -3,6 +3,10 @@ use std::time::{Duration, Instant}; use crate::assertions::{Assertion, AssertionContext, AssertionResult}; /// Assertion that checks the container process exits with a specific exit code. +/// +/// Currently implemented only for the docker runtime. The native_macos runtime uses the +/// runtime-aware [`AdpExitsWithAssertion`][crate::assertions::AdpExitsWithAssertion] instead +/// (which delegates to the per-process exit code cell on native). pub struct ProcessExitsWithAssertion { expected_code: i64, timeout: Duration, @@ -31,15 +35,52 @@ impl Assertion for ProcessExitsWithAssertion { let started = Instant::now(); tokio::select! 
{ + // Container exited - check exit code via Docker API _ = ctx.container_exit_token.cancelled() => { - if ctx.is_native { - self.check_native(ctx, started) - } else { - self.check_docker(ctx, started).await + let docker: bollard::Docker = match airlock::docker::connect() { + Ok(d) => d, + Err(e) => { + return AssertionResult { + name: self.name().to_string(), + passed: false, + message: format!("Failed to connect to Docker: {}", e), + duration: started.elapsed(), + }; + } + }; + + match docker.inspect_container(&ctx.container_name, None).await { + Ok(container) => { + let exit_code = container.state.and_then(|s| s.exit_code).unwrap_or(-1); + if exit_code == self.expected_code { + AssertionResult { + name: self.name().to_string(), + passed: true, + message: format!("Process exited with expected code {}.", exit_code), + duration: started.elapsed(), + } + } else { + AssertionResult { + name: self.name().to_string(), + passed: false, + message: format!( + "Process exited with code {}, expected {}.", + exit_code, self.expected_code + ), + duration: started.elapsed(), + } + } + } + Err(e) => AssertionResult { + name: self.name().to_string(), + passed: false, + message: format!("Failed to inspect container: {}", e), + duration: started.elapsed(), + } } } - // Timeout waiting for the process to exit. + // Timeout waiting for the container to exit. 
_ = tokio::time::sleep(self.timeout) => { AssertionResult { name: self.name().to_string(), @@ -51,100 +92,3 @@ impl Assertion for ProcessExitsWithAssertion { } } } - -impl ProcessExitsWithAssertion { - fn check_native(&self, ctx: &AssertionContext, started: Instant) -> AssertionResult { - let cell = match ctx.native_exit_code.as_ref() { - Some(c) => c, - None => { - return AssertionResult { - name: self.name().to_string(), - passed: false, - message: "Native exit code cell not provided in AssertionContext.".to_string(), - duration: started.elapsed(), - }; - } - }; - let exit_code = match cell.get() { - Some(Some(code)) => *code as i64, - Some(None) => { - return AssertionResult { - name: self.name().to_string(), - passed: false, - message: "Process was terminated by signal; no exit code available.".to_string(), - duration: started.elapsed(), - }; - } - None => { - return AssertionResult { - name: self.name().to_string(), - passed: false, - message: "Exit token fired but exit code not yet recorded.".to_string(), - duration: started.elapsed(), - }; - } - }; - if exit_code == self.expected_code { - AssertionResult { - name: self.name().to_string(), - passed: true, - message: format!("Process exited with expected code {}.", exit_code), - duration: started.elapsed(), - } - } else { - AssertionResult { - name: self.name().to_string(), - passed: false, - message: format!( - "Process exited with code {}, expected {}.", - exit_code, self.expected_code - ), - duration: started.elapsed(), - } - } - } - - async fn check_docker(&self, ctx: &AssertionContext, started: Instant) -> AssertionResult { - let docker: bollard::Docker = match airlock::docker::connect() { - Ok(d) => d, - Err(e) => { - return AssertionResult { - name: self.name().to_string(), - passed: false, - message: format!("Failed to connect to Docker: {}", e), - duration: started.elapsed(), - }; - } - }; - - match docker.inspect_container(&ctx.container_name, None).await { - Ok(container) => { - let exit_code = 
container.state.and_then(|s| s.exit_code).unwrap_or(-1); - if exit_code == self.expected_code { - AssertionResult { - name: self.name().to_string(), - passed: true, - message: format!("Process exited with expected code {}.", exit_code), - duration: started.elapsed(), - } - } else { - AssertionResult { - name: self.name().to_string(), - passed: false, - message: format!( - "Process exited with code {}, expected {}.", - exit_code, self.expected_code - ), - duration: started.elapsed(), - } - } - } - Err(e) => AssertionResult { - name: self.name().to_string(), - passed: false, - message: format!("Failed to inspect container: {}", e), - duration: started.elapsed(), - }, - } - } -} diff --git a/bin/correctness/panoramic/src/native_runner.rs b/bin/correctness/panoramic/src/native_runner.rs index fc50f60db94..1789a70a28e 100644 --- a/bin/correctness/panoramic/src/native_runner.rs +++ b/bin/correctness/panoramic/src/native_runner.rs @@ -99,17 +99,19 @@ impl NativeIntegrationRunner { } debug!(test = %test_name, state_dir = %state_dir.display(), "Prepared per-test state directory."); - // ADP and the (optional) Core Agent get independent exit tokens. The token passed to - // assertions is ADP's — we only care about ADP's exit lifecycle from the test's point - // of view; the Core Agent dying separately is an environmental fault, not a test - // signal. The Agent's token is used only by `NativeProcess` internals to fire when the - // Agent process truly exits. + // Only ADP's exit lifecycle is observable to assertions. The Core Agent (when present) + // gets a throwaway token at spawn time — it satisfies `NativeProcess::spawn`'s + // signature but nothing consumes the resulting cancellation. If the Agent dies + // independently it's treated as an environmental fault, not a test signal. 
let adp_exit_token = CancellationToken::new(); - let agent_exit_token = CancellationToken::new(); let log_sink: Arc> = Arc::new(Mutex::new(NativeLogSink { buf: self.log_buffer.clone(), })); + // Path that both the Agent and ADP use for auth_token / ipc_cert.pem. Always computed, + // only inserted into env when the Agent is in the picture (see comments below). + let auth_token_path = state_dir.join("auth_token").to_string_lossy().into_owned(); + // Optional Phase: spawn the Core Agent (converged tests). // // Converged tests need both the Core Agent and ADP running side-by-side, sharing a @@ -134,7 +136,6 @@ impl NativeIntegrationRunner { // follows that advice for its post-config-stream IPC clients, and TLS fails with // UnknownIssuer because the platform default cert does not match what the per-test // Agent is actually serving. - let auth_token_path = state_dir.join("auth_token").to_string_lossy().into_owned(); let mut agent_env = self.test_case.container.env.clone(); agent_env.insert("DD_AUTH_TOKEN_FILE_PATH".to_string(), auth_token_path.clone()); @@ -150,7 +151,7 @@ impl NativeIntegrationRunner { // for trace-agent), blocking subsequent tests. .with_process_group(); - let agent = match NativeProcess::spawn(agent_config, log_sink.clone(), agent_exit_token.clone()).await { + let agent = match NativeProcess::spawn(agent_config, log_sink.clone(), CancellationToken::new()).await { Ok(p) => p, Err(e) => { phase_timings.push(PhaseTiming { @@ -190,10 +191,7 @@ impl NativeIntegrationRunner { if self.test_case.requires_core_agent { // Point ADP's IPC client at the per-test auth token (and by derivation, the // per-test ipc_cert.pem in the same directory). 
- adp_env.insert( - "DD_AUTH_TOKEN_FILE_PATH".to_string(), - state_dir.join("auth_token").to_string_lossy().into_owned(), - ); + adp_env.insert("DD_AUTH_TOKEN_FILE_PATH".to_string(), auth_token_path); } let process_config = NativeProcessConfig::new(self.test_case.name.clone(), binary_path) .with_args(vec!["-c".to_string(), config_path_str, "run".to_string()]) From 4ad4a11bcd6058401c7733c43d41da39c2b2d5fc Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Wed, 27 May 2026 13:51:24 -0400 Subject: [PATCH 19/56] refactor(airlock): always put spawned processes in their own group NativeProcess used to support two cleanup modes via a per-config 'use_process_group: bool' knob: - If set, cleanup signals the entire process group (parent + forked helpers like trace-agent and process-agent). - If unset, cleanup signals just the immediate child PID. The two paths did almost the same thing (SIGTERM, grace, SIGKILL) and the no-group path existed only to spare single-process binaries the overhead of being a process-group leader. That overhead is zero in practice: putting a single-process binary into its own group has no observable effect, and killpg on a one-member group is equivalent to kill. Simplify by always creating a new process group at spawn time and always using killpg on cleanup. The 'use_process_group' field, the 'with_process_group()' builder method, the 'child_pid' fallback field, and the second cleanup branch all go away. Net -58 lines in airlock, no behavior change. Suite remains 24/24 passing inside the Tart VM (~3:15). 
--- bin/correctness/airlock/src/native.rs | 80 +++++-------------- .../panoramic/src/native_runner.rs | 6 +- 2 files changed, 22 insertions(+), 64 deletions(-) diff --git a/bin/correctness/airlock/src/native.rs b/bin/correctness/airlock/src/native.rs index 4d999ee499b..89f09f0906d 100644 --- a/bin/correctness/airlock/src/native.rs +++ b/bin/correctness/airlock/src/native.rs @@ -49,12 +49,6 @@ pub struct NativeProcessConfig { pub env: HashMap, /// Working directory for the process. If `None`, inherits the caller's working directory. pub working_dir: Option, - /// If `true`, the spawned process is placed into a new process group with itself as the - /// group leader, and [`cleanup`][NativeProcess::cleanup] signals the entire group instead of - /// only the immediate child. This is essential when the spawned binary forks helpers that - /// outlive their parent (for example, the Datadog Core Agent spawns `trace-agent` and - /// `process-agent` which orphan onto launchd if only the parent is killed). - pub use_process_group: bool, } impl NativeProcessConfig { @@ -66,19 +60,9 @@ impl NativeProcessConfig { args: Vec::new(), env: HashMap::new(), working_dir: None, - use_process_group: false, } } - /// Places the spawned process in a new process group with itself as the group leader. - /// - /// Use this for binaries that fork long-lived helper processes that would otherwise orphan - /// when the parent is killed. - pub fn with_process_group(mut self) -> Self { - self.use_process_group = true; - self - } - /// Sets the arguments for the process. pub fn with_args(mut self, args: Vec) -> Self { self.args = args; @@ -116,15 +100,16 @@ pub trait LogSink: Send + Sync { /// the child process exits on its own (observed by the background watcher) or when /// [`cleanup`][Self::cleanup] is called. The exit code is recorded in the shared /// [`ExitCodeCell`] returned by [`exit_code_cell`][Self::exit_code_cell]. 
+/// +/// The spawned process is always made the leader of a new process group, so +/// [`cleanup`][Self::cleanup] can signal the entire group (parent plus any forked helpers). +/// This matters for binaries like the Datadog Core Agent that spawn `trace-agent` / +/// `process-agent` which would otherwise orphan onto launchd when only the parent is killed. pub struct NativeProcess { name: String, - /// PGID to signal on cleanup when the spawned process is a process group leader. `None` - /// when [`NativeProcessConfig::use_process_group`] was `false`. + /// PGID of the spawned process. We made the child the group leader at spawn time, so this + /// equals the child's PID. `None` only if spawn failed to return a PID (very rare). process_group: Option, - /// The child process. Owned by the exit watcher; we communicate with it via signals. - /// - /// `None` once `cleanup` has reaped it (or never set if spawn failed before assignment). - child_pid: Option, exit_token: CancellationToken, exit_code: ExitCodeCell, log_tasks: Vec>, @@ -153,25 +138,17 @@ impl NativeProcess { if let Some(ref wd) = config.working_dir { cmd.current_dir(wd); } - if config.use_process_group { - // Place the spawned process in a new process group so we can later signal all of - // its descendants together. - #[cfg(unix)] - cmd.process_group(0); - } + // Always place the spawned process in a new process group so cleanup can signal the + // entire group (parent + any forked helpers) without leaking orphans. + #[cfg(unix)] + cmd.process_group(0); let mut child = cmd .spawn() .with_error_context(|| format!("Failed to spawn '{}'.", config.binary_path.display()))?; - // When using a process group, capture the PGID. We made the child the group leader - // (process_group(0)), so PGID == child PID. 
- let child_pid = child.id(); - let process_group = if config.use_process_group { - child_pid.map(|pid| pid as i32) - } else { - None - }; + // PGID == child PID since we made the child the group leader (process_group(0)). + let process_group = child.id().map(|pid| pid as i32); let stdout = child .stdout @@ -211,7 +188,6 @@ impl NativeProcess { Ok(Self { name: config.name, process_group, - child_pid, exit_token, exit_code, log_tasks: vec![stdout_task, stderr_task], @@ -236,13 +212,13 @@ impl NativeProcess { self.exit_code.clone() } - /// Kills the child (and its process group, if configured), joins background tasks, and - /// cancels the exit token. + /// Kills the spawned process group, joins background tasks, and cancels the exit token. + /// + /// Sends SIGTERM to the whole group, waits a short grace period, then sends SIGKILL to + /// guarantee nothing is left behind. The grace period gives well-behaved descendants + /// (for example, the Core Agent's `trace-agent` / `process-agent` helpers) a chance to + /// shut down cleanly before we hard-kill them. pub async fn cleanup(mut self) { - // If we asked for a process group, first send SIGTERM to the entire group. This gives - // descendants (for example, trace-agent, process-agent spawned by the Datadog Core Agent) a - // chance to shut down cleanly before we hard-kill them. After a brief grace period we - // send SIGKILL to the group to guarantee no orphans remain. #[cfg(unix)] if let Some(pgid) = self.process_group { // SAFETY: killpg with a valid pgid is a safe syscall; we ignore the return value. @@ -253,20 +229,6 @@ impl NativeProcess { unsafe { libc::killpg(pgid, libc::SIGKILL); } - } else if let Some(pid) = self.child_pid { - // Fallback: just signal the direct child. The exit watcher owns the Child handle - // so we can't call kill() through it; use libc directly. 
- #[cfg(unix)] - unsafe { - libc::kill(pid as i32, libc::SIGTERM); - } - tokio::time::sleep(Duration::from_millis(200)).await; - #[cfg(unix)] - unsafe { - libc::kill(pid as i32, libc::SIGKILL); - } - #[cfg(not(unix))] - let _ = pid; } // The exit watcher will have observed the kill and set the exit code + fired the token. @@ -274,8 +236,8 @@ impl NativeProcess { if let Some(handle) = self.exit_task.take() { let _ = handle.await; } - // Defensive: make sure the token is fired even if the watcher never set it (for example, on a - // failed wait). + // Defensive: make sure the token is fired even if the watcher never set it (for example, + // on a failed wait). self.exit_token.cancel(); for handle in self.log_tasks.drain(..) { let _ = handle.await; diff --git a/bin/correctness/panoramic/src/native_runner.rs b/bin/correctness/panoramic/src/native_runner.rs index 1789a70a28e..c48836a396f 100644 --- a/bin/correctness/panoramic/src/native_runner.rs +++ b/bin/correctness/panoramic/src/native_runner.rs @@ -145,11 +145,7 @@ impl NativeIntegrationRunner { "-c".to_string(), state_dir.to_string_lossy().into_owned(), ]) - .with_env_map(agent_env) - // The Core Agent forks `trace-agent` and `process-agent` helpers; without a process - // group they orphan onto launchd on cleanup and continue holding ports (e.g., 8126 - // for trace-agent), blocking subsequent tests. - .with_process_group(); + .with_env_map(agent_env); let agent = match NativeProcess::spawn(agent_config, log_sink.clone(), CancellationToken::new()).await { Ok(p) => p, From 8ba72a3b07558b479cd5cf8ef318f491a6101713 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Wed, 27 May 2026 14:00:37 -0400 Subject: [PATCH 20/56] refactor(panoramic): extract shared run_assertion_steps used by both runners The docker IntegrationRunner and the native_macos NativeIntegrationRunner each had their own ~100-line implementation of the assertion-step loop. 
The loops were structurally identical — single steps run sequentially, parallel blocks via join_all — but the docker version was the better one: it had fail-fast on the first failure, step-index debug logging, and consistent error reporting. The native version cut those corners and also recomputed the AssertionContext inside the parallel block. Lifts the docker version (verbatim) to a free function in the assertions module and has both runners delegate to it after building their own AssertionContext. Net -42 lines and native gets the better behavior for free. The spawn / log capture / cleanup architectures of the two runners stay separate; they are legitimately different (bollard vs tokio::process) and trying to abstract them behind a trait would add overhead without saving meaningful code. Suite remains 24/24 passing inside the Tart VM (~3:13). --- .../panoramic/src/assertions/mod.rs | 126 +++++++++++++++++- .../panoramic/src/native_runner.rs | 77 ++--------- bin/correctness/panoramic/src/runner.rs | 121 +---------------- 3 files changed, 141 insertions(+), 183 deletions(-) diff --git a/bin/correctness/panoramic/src/assertions/mod.rs b/bin/correctness/panoramic/src/assertions/mod.rs index afb582da0aa..f1c0f3bbc76 100644 --- a/bin/correctness/panoramic/src/assertions/mod.rs +++ b/bin/correctness/panoramic/src/assertions/mod.rs @@ -1,10 +1,12 @@ use std::{sync::Arc, time::Duration}; +use futures::future; use saluki_error::GenericError; use tokio::sync::RwLock; use tokio_util::sync::CancellationToken; +use tracing::{debug, error}; -use crate::config::{AssertionConfig, LogStream}; +use crate::config::{AssertionConfig, AssertionStep, IntegrationConfig, LogStream}; mod adp_exits; mod file_contains; @@ -192,3 +194,125 @@ pub fn create_assertion(config: &AssertionConfig) -> Result, ))), } } + +/// Runs the assertion steps from `test_case` against `ctx`, returning the per-assertion results.
+/// +/// Iterates through the test case's assertion list, executing single steps sequentially and +/// parallel blocks concurrently. Stops at the first failure (fail-fast), so the returned vector +/// is truncated past the failing step. +/// +/// Used by both the docker and `native_macos` integration runners; the only thing that differs +/// between runtimes is how `ctx` is constructed (port mappings come from a Docker driver vs. +/// identity-mapped from the test config; `is_native` and `native_exit_code` flip). +pub(crate) async fn run_assertion_steps(test_case: &IntegrationConfig, ctx: &AssertionContext) -> Vec { + let mut results = Vec::new(); + let total_steps = test_case.assertions.len(); + + for (step_index, step) in test_case.assertions.iter().enumerate() { + match step { + AssertionStep::Single(assertion_config) => { + let assertion = match create_assertion(assertion_config) { + Ok(a) => a, + Err(e) => { + error!(error = %e, "Failed to create assertion from configuration."); + results.push(AssertionResult { + name: "config_error".to_string(), + passed: false, + message: format!("Failed to create assertion: {}.", e), + duration: Duration::ZERO, + }); + break; + } + }; + + debug!( + step = step_index + 1, + step_total = total_steps, + assertion_type = assertion.name(), + description = %assertion.description(), + "Running assertion..." + ); + + let result = assertion.check(ctx).await; + + if result.passed { + debug!( + assertion_type = assertion.name(), + duration = ?result.duration, + "Assertion passed." + ); + } else { + debug!( + assertion_type = assertion.name(), + duration = ?result.duration, + message = %result.message, + "Assertion failed." 
+ ); + } + + let failed = !result.passed; + results.push(result); + + if failed { + debug!("Stopping assertion execution due to failure (fail-fast)."); + break; + } + } + + AssertionStep::Parallel { parallel } => { + let mut assertions = Vec::new(); + let mut config_error = false; + + for assertion_config in parallel { + match create_assertion(assertion_config) { + Ok(a) => assertions.push(a), + Err(e) => { + error!(error = %e, "Failed to create assertion from configuration."); + results.push(AssertionResult { + name: "config_error".to_string(), + passed: false, + message: format!("Failed to create assertion: {}.", e), + duration: Duration::ZERO, + }); + config_error = true; + break; + } + } + } + + if config_error { + break; + } + + debug!( + step = step_index + 1, + step_total = total_steps, + assertion_count = assertions.len(), + "Running parallel assertion block..." + ); + + let futures: Vec<_> = assertions.iter().map(|a| a.check(ctx)).collect(); + let parallel_results = future::join_all(futures).await; + + let any_failed = parallel_results.iter().any(|r| !r.passed); + + for result in parallel_results { + debug!( + assertion_type = %result.name, + passed = result.passed, + duration = ?result.duration, + "Parallel assertion completed." 
+ ); + results.push(result); + } + + if any_failed { + debug!("Stopping assertion execution due to failure in parallel block (fail-fast)."); + break; + } + } + } + } + + results +} diff --git a/bin/correctness/panoramic/src/native_runner.rs b/bin/correctness/panoramic/src/native_runner.rs index c48836a396f..a95dca18de3 100644 --- a/bin/correctness/panoramic/src/native_runner.rs +++ b/bin/correctness/panoramic/src/native_runner.rs @@ -27,8 +27,8 @@ use tokio_util::sync::CancellationToken; use tracing::{debug, error, info}; use crate::{ - assertions::{create_assertion, AssertionContext, AssertionResult, LogBuffer}, - config::{parse_port_spec, AssertionStep, IntegrationConfig}, + assertions::{AssertionContext, AssertionResult, LogBuffer}, + config::{parse_port_spec, IntegrationConfig}, reporter::{PhaseTiming, TestResult}, test::{Test, TestContext}, }; @@ -270,69 +270,16 @@ impl NativeIntegrationRunner { &self, process_display_name: String, exit_token: CancellationToken, exit_code_cell: airlock::native::ExitCodeCell, ) -> Vec { - let mut results = Vec::new(); - let cancel_token = self.tctx.test_cancel_token(); - let port_mappings = self.build_port_mappings(); - - for step in &self.test_case.assertions { - match step { - AssertionStep::Single(cfg) => { - let assertion = match create_assertion(cfg) { - Ok(a) => a, - Err(e) => { - results.push(AssertionResult { - name: "create_assertion".to_string(), - passed: false, - message: format!("Failed to create assertion: {}", e), - duration: Duration::ZERO, - }); - continue; - } - }; - let ctx = AssertionContext { - log_buffer: self.log_buffer.clone(), - container_exit_token: exit_token.clone(), - cancel_token: cancel_token.clone(), - container_name: process_display_name.clone(), - is_native: true, - native_exit_code: Some(exit_code_cell.clone()), - port_mappings: port_mappings.clone(), - }; - results.push(assertion.check(&ctx).await); - } - AssertionStep::Parallel { parallel } => { - let mut futures = 
Vec::with_capacity(parallel.len()); - for cfg in parallel { - match create_assertion(cfg) { - Ok(a) => { - let ctx = AssertionContext { - log_buffer: self.log_buffer.clone(), - container_exit_token: exit_token.clone(), - cancel_token: cancel_token.clone(), - container_name: process_display_name.clone(), - is_native: true, - native_exit_code: Some(exit_code_cell.clone()), - port_mappings: port_mappings.clone(), - }; - futures.push(async move { a.check(&ctx).await }); - } - Err(e) => { - results.push(AssertionResult { - name: "create_assertion".to_string(), - passed: false, - message: format!("Failed to create parallel assertion: {}", e), - duration: Duration::ZERO, - }); - } - } - } - let parallel_results = futures::future::join_all(futures).await; - results.extend(parallel_results); - } - } - } - - results + let ctx = AssertionContext { + log_buffer: self.log_buffer.clone(), + container_exit_token: exit_token, + cancel_token: self.tctx.test_cancel_token(), + port_mappings: self.build_port_mappings(), + container_name: process_display_name, + is_native: true, + native_exit_code: Some(exit_code_cell), + }; + crate::assertions::run_assertion_steps(&self.test_case, &ctx).await } } diff --git a/bin/correctness/panoramic/src/runner.rs b/bin/correctness/panoramic/src/runner.rs index 02ac477ef97..ab13602b967 100644 --- a/bin/correctness/panoramic/src/runner.rs +++ b/bin/correctness/panoramic/src/runner.rs @@ -14,10 +14,7 @@ use std::{ use airlock::driver::{Driver, DriverConfig, DriverDetails}; use bollard::container::LogOutput; -use futures::{ - future, - stream::{self, StreamExt as _}, -}; +use futures::stream::{self, StreamExt as _}; use saluki_error::{generic_error, ErrorContext as _, GenericError}; use tokio::sync::{mpsc, RwLock, Semaphore}; use tokio_util::sync::CancellationToken; @@ -25,8 +22,8 @@ use tracing::{debug, error, info, warn}; use crate::test::{Test, TestContext}; use crate::{ - assertions::{create_assertion, AssertionContext, AssertionResult, LogBuffer}, 
- config::{parse_file_spec, parse_port_spec, AssertionStep, IntegrationConfig}, + assertions::{AssertionContext, AssertionResult, LogBuffer}, + config::{parse_file_spec, parse_port_spec, IntegrationConfig}, events::TestEvent, reporter::{PhaseTiming, TestResult}, }; @@ -808,9 +805,6 @@ impl IntegrationRunner { async fn run_assertions( &self, port_mappings: &HashMap, container_name: &str, exit_token: &CancellationToken, ) -> Vec { - let mut results = Vec::new(); - let total_steps = self.test_case.assertions.len(); - let ctx = AssertionContext { log_buffer: self.log_buffer.clone(), container_exit_token: exit_token.clone(), @@ -820,114 +814,7 @@ impl IntegrationRunner { is_native: false, native_exit_code: None, }; - - for (step_index, step) in self.test_case.assertions.iter().enumerate() { - match step { - AssertionStep::Single(assertion_config) => { - let assertion = match create_assertion(assertion_config) { - Ok(a) => a, - Err(e) => { - error!(error = %e, "Failed to create assertion from configuration."); - results.push(AssertionResult { - name: "config_error".to_string(), - passed: false, - message: format!("Failed to create assertion: {}.", e), - duration: Duration::ZERO, - }); - break; - } - }; - - debug!( - step = step_index + 1, - step_total = total_steps, - assertion_type = assertion.name(), - description = %assertion.description(), - "Running assertion..." - ); - - let result = assertion.check(&ctx).await; - - if result.passed { - debug!( - assertion_type = assertion.name(), - duration = ?result.duration, - "Assertion passed." - ); - } else { - debug!( - assertion_type = assertion.name(), - duration = ?result.duration, - message = %result.message, - "Assertion failed." 
- ); - } - - let failed = !result.passed; - results.push(result); - - if failed { - debug!("Stopping assertion execution due to failure (fail-fast)."); - break; - } - } - - AssertionStep::Parallel { parallel } => { - let mut assertions = Vec::new(); - let mut config_error = false; - - for assertion_config in parallel { - match create_assertion(assertion_config) { - Ok(a) => assertions.push(a), - Err(e) => { - error!(error = %e, "Failed to create assertion from configuration."); - results.push(AssertionResult { - name: "config_error".to_string(), - passed: false, - message: format!("Failed to create assertion: {}.", e), - duration: Duration::ZERO, - }); - config_error = true; - break; - } - } - } - - if config_error { - break; - } - - debug!( - step = step_index + 1, - step_total = total_steps, - assertion_count = assertions.len(), - "Running parallel assertion block..." - ); - - let futures: Vec<_> = assertions.iter().map(|a| a.check(&ctx)).collect(); - let parallel_results = future::join_all(futures).await; - - let any_failed = parallel_results.iter().any(|r| !r.passed); - - for result in parallel_results { - debug!( - assertion_type = %result.name, - passed = result.passed, - duration = ?result.duration, - "Parallel assertion completed." 
- ); - results.push(result); - } - - if any_failed { - debug!("Stopping assertion execution due to failure in parallel block (fail-fast)."); - break; - } - } - } - } - - results + crate::assertions::run_assertion_steps(&self.test_case, &ctx).await } async fn cleanup(&self, _driver: &Driver) -> Result<(), GenericError> { From 6e6d0c28d73c0c033d96553224fb510f654908e6 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Wed, 27 May 2026 14:09:21 -0400 Subject: [PATCH 21/56] refactor(airlock): drop unused NativeProcess surface Three pieces of public surface that nothing called: - NativeProcess::exit_token() accessor: spawn callers already hold a clone of the token they passed in; nothing in panoramic or airlock ever fetched it from a NativeProcess. - NativeProcessConfig::with_working_dir() builder + the working_dir field + the corresponding cmd.current_dir() call: zero callers, was carrying an '#[allow(dead_code)]'. No behavior change. Suite remains 24/24 passing inside the Tart VM (~3:13). --- bin/correctness/airlock/src/native.rs | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/bin/correctness/airlock/src/native.rs b/bin/correctness/airlock/src/native.rs index 89f09f0906d..5f21d356ed5 100644 --- a/bin/correctness/airlock/src/native.rs +++ b/bin/correctness/airlock/src/native.rs @@ -47,8 +47,6 @@ pub struct NativeProcessConfig { pub args: Vec, /// Environment variables to set for the process. pub env: HashMap, - /// Working directory for the process. If `None`, inherits the caller's working directory. - pub working_dir: Option, } impl NativeProcessConfig { @@ -59,7 +57,6 @@ impl NativeProcessConfig { binary_path: binary_path.into(), args: Vec::new(), env: HashMap::new(), - working_dir: None, } } @@ -74,13 +71,6 @@ impl NativeProcessConfig { self.env = env; self } - - /// Sets the working directory for the process. 
- #[allow(dead_code)] - pub fn with_working_dir(mut self, dir: PathBuf) -> Self { - self.working_dir = Some(dir); - self - } } /// A trait-object-friendly sink for log lines captured from a native process. @@ -135,9 +125,6 @@ impl NativeProcess { .stdout(Stdio::piped()) .stderr(Stdio::piped()) .kill_on_drop(true); - if let Some(ref wd) = config.working_dir { - cmd.current_dir(wd); - } // Always place the spawned process in a new process group so cleanup can signal the // entire group (parent + any forked helpers) without leaking orphans. #[cfg(unix)] @@ -200,11 +187,6 @@ impl NativeProcess { &self.name } - /// Returns a handle to the cancellation token that fires when the process exits. - pub fn exit_token(&self) -> CancellationToken { - self.exit_token.clone() - } - /// Returns a clone of the shared exit-code cell. The cell is populated once the process /// exits (either on its own or via cleanup). Consumers should wait on [`exit_token`] before /// reading. From 10fef72c93b0b4081f8a2ce6e98b764e5505bef2 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Wed, 27 May 2026 14:14:01 -0400 Subject: [PATCH 22/56] build: drop local-dev convenience target test-integration-macos The intended local-dev path (a Mac dev running ADP integration tests against their own host) doesn't actually work in practice: a Datadog Agent running on the dev machine will hold port 8125 (DSD UDP) and collide with the test ADP that tries to bind it. Removing the convenience target until a Tart-based local flow lands in a follow-up PR (#1721 has a working prototype). The CI chain is untouched: test-integration-macos-ci (CI entry point) -> build-panoramic + build-adp-native + provision-macos-test-env -> test-integration-macos-run The intermediate targets stay (they're factored out for the CI chain and don't pretend to be local-dev affordances). 
--- Makefile | 3 --- 1 file changed, 3 deletions(-) diff --git a/Makefile b/Makefile index d0cfa6cbf4f..272d457bde8 100644 --- a/Makefile +++ b/Makefile @@ -589,9 +589,6 @@ test-integration-macos-run: ## Runs native macOS integration tests using already $(if $(CASE),-t $(CASE),--runtime native_macos) --no-tui -p 1 \ $(if $(PANORAMIC_LOG_DIR),-l $(PANORAMIC_LOG_DIR)) -.PHONY: test-integration-macos -test-integration-macos: build-panoramic build-adp-native test-integration-macos-run ## Builds and runs ADP integration tests natively on macOS (no Docker) - # Version of the Datadog Agent installed by `provision-macos-test-env`. Pinned for # reproducibility; bump when the integration tests need newer Agent behavior. MACOS_TEST_AGENT_VERSION ?= 7.78.0 From 33fa96fb5ec6f55bdc5fe278dbc37606f4bd3faf Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Wed, 27 May 2026 14:26:26 -0400 Subject: [PATCH 23/56] chore(docs): add native_macos to vale vocabulary A doc comment added in 9f4237a821 references 'native_macos' (the new runtime identifier) and the spell checker doesn't recognize the compound token. Append it to the technical vocabulary alongside 'launchd'. --- .vale/styles/config/vocabularies/technical/accept.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/.vale/styles/config/vocabularies/technical/accept.txt b/.vale/styles/config/vocabularies/technical/accept.txt index ad9315178e9..9024116614b 100644 --- a/.vale/styles/config/vocabularies/technical/accept.txt +++ b/.vale/styles/config/vocabularies/technical/accept.txt @@ -229,3 +229,4 @@ mpmc dhat profiler launchd +native_macos From 5b517fe62ea3833ad3b9ef89e60174d904f2850e Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Wed, 27 May 2026 14:56:19 -0400 Subject: [PATCH 24/56] docs: refresh stale comments in native runner / native process module Audited the comments I added on this branch against the code after all the simplification work landed. 
Four bits had drifted: - airlock::native: ExitCodeCell doc named 'process_exits_with' as the consumer. The native code path on process_exits_with was removed in 9f4237a821; the actual consumer on native is 'adp_exits_with'. - airlock::native: exit-watcher comment listed 'process_stable_for / process_exits_with' as the assertions that unblock when the watcher fires the token. Same fix: it's process_stable_for / adp_exits_with. - airlock::native: exit_code_cell() docs referenced an [exit_token] accessor that was removed in 00c58a711d. Point consumers at the token they passed in to spawn(). - panoramic::native_runner: module-level 'Scope' comment said the initial scope was 'ADP-standalone tests: a single binary, no Core Agent, no IPC.' Converged support landed in 0c10a79787; rewrite to describe both shapes and the binary discovery for each. No code change. --- bin/correctness/airlock/src/native.rs | 8 +++---- .../panoramic/src/native_runner.rs | 22 ++++++++++++++----- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/bin/correctness/airlock/src/native.rs b/bin/correctness/airlock/src/native.rs index 5f21d356ed5..867de686124 100644 --- a/bin/correctness/airlock/src/native.rs +++ b/bin/correctness/airlock/src/native.rs @@ -30,7 +30,7 @@ use tracing::{debug, warn}; /// /// The cell is populated by the background exit watcher when the child exits on its own, or by /// [`NativeProcess::cleanup`] when the test tears down. Consumers (for example, the -/// `process_exits_with` assertion in panoramic) read the cell after the exit token fires. +/// `adp_exits_with` assertion in panoramic) read the cell after the exit token fires. /// /// The inner `Option` is `None` if the process was terminated by signal rather than exiting /// normally with a status code. 
@@ -151,7 +151,7 @@ impl NativeProcess { // Real exit watcher: moves the child into the task, calls `wait()`, records the exit // code, and fires the exit token so blocked assertions (process_stable_for / - // process_exits_with) unblock immediately rather than waiting for the test's own + // adp_exits_with) unblock immediately rather than waiting for the test's own // cleanup phase. let exit_code: ExitCodeCell = Arc::new(OnceLock::new()); let exit_code_for_watcher = exit_code.clone(); @@ -188,8 +188,8 @@ impl NativeProcess { } /// Returns a clone of the shared exit-code cell. The cell is populated once the process - /// exits (either on its own or via cleanup). Consumers should wait on [`exit_token`] before - /// reading. + /// exits (either on its own or via cleanup). Consumers should wait on the exit token they + /// passed to [`spawn`][Self::spawn] before reading. pub fn exit_code_cell(&self) -> ExitCodeCell { self.exit_code.clone() } diff --git a/bin/correctness/panoramic/src/native_runner.rs b/bin/correctness/panoramic/src/native_runner.rs index a95dca18de3..4cc98012d58 100644 --- a/bin/correctness/panoramic/src/native_runner.rs +++ b/bin/correctness/panoramic/src/native_runner.rs @@ -1,16 +1,26 @@ //! Native-process integration test runner. //! //! This runner is the parallel of [`crate::runner::IntegrationRunner`] but for tests declared -//! with `runtime: native_macos`. Instead of building a Docker container, it spawns a binary -//! directly via [`airlock::native::NativeProcess`] and feeds its stdout/stderr into the same +//! with `runtime: native_macos`. Instead of building a Docker container, it spawns binaries +//! directly via [`airlock::native::NativeProcess`] and feeds their stdout/stderr into the same //! [`LogBuffer`][crate::assertions::LogBuffer] used by the Docker path so the assertions work //! unchanged. //! -//! # Scope +//! # Supported test shapes //! -//! Initial scope is ADP-standalone tests: a single binary, no Core Agent, no IPC. 
The binary -//! path is discovered via the `ADP_BINARY_PATH` env var, falling back to -//! `target/release/agent-data-plane` (resolved relative to the current working directory). +//! - **Standalone**: only ADP is spawned. The default for tests that don't set +//! `requires_core_agent: true`. +//! - **Converged**: the Datadog Core Agent is spawned alongside ADP (when +//! `requires_core_agent: true`), sharing a per-test config directory so they authenticate +//! over IPC the same way they would in production. See the per-phase comments in +//! [`NativeIntegrationRunner::run`] for the cert/auth_token plumbing. +//! +//! # Binary discovery +//! +//! - ADP: `ADP_BINARY_PATH` env var, default `target/release/agent-data-plane` (resolved +//! relative to the current working directory). +//! - Core Agent (converged only): `CORE_AGENT_BINARY_PATH` env var, default +//! `/opt/datadog-agent/bin/agent/agent`. use std::{ collections::HashMap, From c7f5fe85bf3e7c96ea848ee16b873510226555b7 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Wed, 27 May 2026 15:57:28 -0400 Subject: [PATCH 25/56] refactor(panoramic): scope runtime at the CLI level, drop per-test expansion The previous model expanded a test declaring `runtimes: [docker, native_macos]` into two Test instances (`foo/docker` and `foo/native_macos`) at discovery time, and `--runtime` was an optional post-discovery filter. This had two problems: - test names sprouted a runtime suffix that callers had to know about (e.g. CASE=foo had to become CASE=foo/native_macos) - on a host that only supports one runtime, the other variant was still discovered and would be attempted unless the caller knew to pass --runtime. The Linux CI hit this directly: it tried to spawn the native_macos variants of integration tests inside a Linux container that has neither the macOS Agent install nor a native ADP binary. Move runtime to a CLI-level scope: - `panoramic run` and `panoramic list` both accept --runtime. 
- When omitted, default to the runtime native to the host OS: `native_macos` on macOS, `docker` everywhere else. - Discovery emits at most one Test instance per integration config, using the active runtime as resolved_runtime, only when that runtime is in the config's `runtimes:` list. Otherwise the test is skipped. - Test names are no longer suffixed. - Correctness tests are unaffected; the flag only scopes integration discovery. The Makefile targets simplify: `test-integration` no longer needs to pass --runtime docker (it's the default on Linux), and the macOS target no longer needs --runtime native_macos either. CASE= works without any suffix on either runtime. --- Makefile | 4 +- bin/correctness/panoramic/src/cli.rs | 11 +++++- bin/correctness/panoramic/src/config.rs | 50 +++++++++++++++++-------- bin/correctness/panoramic/src/main.rs | 30 +++++++-------- 4 files changed, 61 insertions(+), 34 deletions(-) diff --git a/Makefile b/Makefile index 272d457bde8..26acbbca1d6 100644 --- a/Makefile +++ b/Makefile @@ -582,11 +582,11 @@ build-adp-native: ## Builds the agent-data-plane binary natively for the current cargo build --release --bin agent-data-plane .PHONY: test-integration-macos-run -test-integration-macos-run: ## Runs native macOS integration tests using already-built binaries (assumes target/release/{panoramic,agent-data-plane} exist). Defaults to all native_macos tests; override with CASE=/native_macos. +test-integration-macos-run: ## Runs native macOS integration tests using already-built binaries (assumes target/release/{panoramic,agent-data-plane} exist). Defaults to all native_macos-eligible tests; narrow with CASE=. @echo "[*] Running native macOS integration tests..." 
@ADP_BINARY_PATH="$(CURDIR)/target/release/agent-data-plane" \ target/release/panoramic run -d "$(CURDIR)/test/integration/cases" \ - $(if $(CASE),-t $(CASE),--runtime native_macos) --no-tui -p 1 \ + $(if $(CASE),-t $(CASE)) --no-tui -p 1 \ $(if $(PANORAMIC_LOG_DIR),-l $(PANORAMIC_LOG_DIR)) # Version of the Datadog Agent installed by `provision-macos-test-env`. Pinned for diff --git a/bin/correctness/panoramic/src/cli.rs b/bin/correctness/panoramic/src/cli.rs index d4689d1fbd2..cccbe4109c8 100644 --- a/bin/correctness/panoramic/src/cli.rs +++ b/bin/correctness/panoramic/src/cli.rs @@ -30,8 +30,10 @@ pub struct RunCommand { #[argh(option, short = 't')] pub tests: Option<String>, - /// run only tests with the given runtime (for example, `docker`, `native_macos`, `kubernetes_in_docker`). - /// Can be combined with `-t` to further restrict by name. + /// integration-test runtime to scope discovery to (for example, `docker` or `native_macos`). + /// Only integration tests whose `runtimes:` list contains this value are eligible to run. + /// Defaults to `native_macos` on macOS hosts and `docker` everywhere else. Correctness tests + /// are unaffected by this flag. #[argh(option)] pub runtime: Option<String>, @@ -105,6 +107,11 @@ pub struct ListCommand { #[argh(option, short = 'd')] pub test_dirs: Vec<PathBuf>, + /// integration-test runtime to scope discovery to. Same semantics as on `run`: defaults to + /// `native_macos` on macOS, `docker` everywhere else. Correctness tests are unaffected. + #[argh(option)] + pub runtime: Option<String>, + /// output the discovered tests as json along with their image dependencies. a `ci` script depends on this for dynamic /// pipeline creation.
#[argh(switch)] diff --git a/bin/correctness/panoramic/src/config.rs b/bin/correctness/panoramic/src/config.rs index b892dfbca35..4e1a9038354 100644 --- a/bin/correctness/panoramic/src/config.rs +++ b/bin/correctness/panoramic/src/config.rs @@ -156,6 +156,19 @@ pub const NATIVE_MACOS_RUNTIME: &str = "native_macos"; /// Runtime identifier for integration tests that run inside a Docker container. pub const DOCKER_RUNTIME: &str = "docker"; +/// Returns the integration-test runtime that is native to the host OS. +/// +/// `native_macos` on macOS hosts, `docker` everywhere else. Used as the default when a panoramic +/// subcommand is invoked without an explicit `--runtime` flag, so that callers on the most common +/// host get the most common runtime without having to spell it out. +pub fn default_host_runtime() -> &'static str { + if cfg!(target_os = "macos") { + NATIVE_MACOS_RUNTIME + } else { + DOCKER_RUNTIME + } +} + /// Container configuration for a test case. #[derive(Clone, Debug, Deserialize)] pub struct ContainerConfig { @@ -406,11 +419,7 @@ impl AssertionStep { #[async_trait] impl Test for IntegrationConfig { fn name(&self) -> String { - if self.resolved_runtime.is_empty() || self.runtimes.len() <= 1 { - self.name.clone() - } else { - format!("{}/{}", self.name, self.resolved_runtime) - } + self.name.clone() } fn suite(&self) -> TestSuite { @@ -706,7 +715,11 @@ fn canonicalize_file_entry(entry: &str, base_path: &Path) -> String { /// Each `config.yaml` found in a direct subdirectory must have a top-level `type` field set to /// `"integration"`, `"correctness"`, or `"correctness_matrix"`. Files with a missing or unknown /// `type` cause a panic. Multiple test types may coexist freely within the same directory. -pub fn discover_tests(dirs: &[PathBuf]) -> Result>, GenericError> { +/// +/// `integration_runtime` scopes integration-test discovery to a single runtime: an integration +/// test is included if and only if its `runtimes:` list contains this value. 
Correctness tests +/// are unaffected; they always discover. +pub fn discover_tests(dirs: &[PathBuf], integration_runtime: &str) -> Result>, GenericError> { let mut tests: Vec> = Vec::new(); for base_path in dirs { @@ -724,7 +737,7 @@ pub fn discover_tests(dirs: &[PathBuf]) -> Result>, GenericErr if path.is_dir() { let config_path = path.join("config.yaml"); if config_path.exists() { - match try_load_test(&config_path, &path) { + match try_load_test(&config_path, &path, integration_runtime) { Ok(loaded) => tests.extend(loaded), Err(e) => { // Previously we had a warning here that cannot be seen in TUI-mode. It is better to fail @@ -747,9 +760,12 @@ pub fn discover_tests(dirs: &[PathBuf]) -> Result>, GenericErr /// Load one or more test cases from a config file, dispatching on the top-level `type` field. /// /// Returns a `Vec` because a `correctness_matrix` config expands into multiple independent test -/// cases—one per variant—while `integration` and `correctness` configs each produce exactly -/// one test case. -fn try_load_test(config_path: &Path, dir_path: &Path) -> Result>, GenericError> { +/// cases—one per variant. `integration` configs produce zero or one test case depending on +/// whether the active `integration_runtime` is in the test's `runtimes:` list. `correctness` +/// configs produce exactly one test case. +fn try_load_test( + config_path: &Path, dir_path: &Path, integration_runtime: &str, +) -> Result>, GenericError> { let content = std::fs::read_to_string(config_path) .error_context(format!("Failed to read config file: {}", config_path.display()))?; @@ -771,7 +787,8 @@ fn try_load_test(config_path: &Path, dir_path: &Path) -> Result> = Vec::new(); + // Validate every declared runtime up front so a typo in any list surfaces at discovery + // time, even on hosts that wouldn't actually run that runtime. 
for runtime in &config.runtimes { if runtime != DOCKER_RUNTIME && runtime != NATIVE_MACOS_RUNTIME { return Err(generic_error!( @@ -782,11 +799,14 @@ fn try_load_test(config_path: &Path, dir_path: &Path) -> Result { let config_path_str = config_path diff --git a/bin/correctness/panoramic/src/main.rs b/bin/correctness/panoramic/src/main.rs index 30c65c7df83..b4586d20972 100644 --- a/bin/correctness/panoramic/src/main.rs +++ b/bin/correctness/panoramic/src/main.rs @@ -24,7 +24,7 @@ use self::cli::{Cli, Command}; mod config; mod dynamic_vars; mod mounts; -use self::config::discover_tests; +use self::config::{default_host_runtime, discover_tests}; mod events; use self::events::{create_event_channel, TestEvent}; @@ -108,7 +108,11 @@ async fn run_tests(cmd: cli::RunCommand, use_tui: bool) -> ExitCode { return ExitCode::from(2); } - let test_cases = match discover_tests(&cmd.test_dirs) { + let integration_runtime = cmd + .runtime + .clone() + .unwrap_or_else(|| default_host_runtime().to_string()); + let test_cases = match discover_tests(&cmd.test_dirs, &integration_runtime) { Ok(tests) => tests, Err(e) => { if use_tui { @@ -188,22 +192,14 @@ async fn run_tests(cmd: cli::RunCommand, use_tui: bool) -> ExitCode { .with_fail_fast(cmd.fail_fast) .with_event_sender(tx); - // Combine the optional --runtime filter and the optional -t name filter into a single - // predicate. A test passes if it matches BOTH constraints (i.e., AND semantics). When neither - // is set, no filter is installed and every discovered test runs. + // The runtime scope is already applied at discovery time. The optional -t name filter + // narrows further. When unset, every discovered test runs. 
let name_filter: Option<Vec<String>> = cmd .tests .as_ref() .map(|s| s.split(',').map(|n| n.trim().to_string()).collect()); - let runtime_filter: Option<String> = cmd.runtime.clone(); - if name_filter.is_some() || runtime_filter.is_some() { - args = args.with_filter(Box::new(move |t: &dyn test::Test| { - let name_ok = name_filter - .as_ref() - .is_none_or(|names| names.iter().any(|n| *n == t.name())); - let runtime_ok = runtime_filter.as_ref().is_none_or(|r| t.runtime() == *r); - name_ok && runtime_ok - })); + if let Some(names) = name_filter { + args = args.with_filter(Box::new(move |t: &dyn test::Test| names.iter().any(|n| *n == t.name()))); } // Spawn the test runner task (same code path for both modes). @@ -388,7 +384,11 @@ async fn list_tests(cmd: cli::ListCommand) -> ExitCode { info!("Discovering test cases from: {}...", dirs_str.join(", ")); } - let test_cases = match discover_tests(&cmd.test_dirs) { + let integration_runtime = cmd + .runtime + .clone() + .unwrap_or_else(|| default_host_runtime().to_string()); + let test_cases = match discover_tests(&cmd.test_dirs, &integration_runtime) { Ok(tests) => tests, Err(e) => { error!("Failed to discover tests: {}", e); From 9d56103c179f915542aab94824b8ba7298d19286 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Wed, 27 May 2026 15:57:59 -0400 Subject: [PATCH 26/56] ci: mark macOS jobs as interruptible macOS runner capacity in the GitLab fleet is constrained. Marking macOS jobs as interruptible lets a newer pipeline on the same ref (for example, after a quick fixup push to an open PR) auto-cancel the in-flight macOS run and free the runner immediately, instead of holding the slot for the duration of the now-superseded run. Apply the flag at the shared .macos-{arm64,amd64}-test-job mixins in .gitlab-ci.yml so every macOS job inherits it (current scope: the four unit-test jobs plus the two integration-test jobs; future macOS jobs pick it up automatically).
See: https://docs.gitlab.com/ci/yaml/#interruptible --- .gitlab-ci.yml | 9 +++++++++ .gitlab/e2e.yml | 2 ++ 2 files changed, 11 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 39c637b46d3..cb9e036786b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -151,8 +151,17 @@ default: KUBERNETES_MEMORY_REQUEST: "8Gi" KUBERNETES_MEMORY_LIMIT: "12Gi" +# Shared mixins for macOS bare-metal runner jobs. +# +# `interruptible: true` is set here so every macOS job inherits it. macOS runner capacity in +# the GitLab fleet is constrained; auto-cancelling superseded pipelines (for example, after a +# quick fixup push to an open PR) frees the runner immediately instead of holding the slot +# for the duration of the now-stale run. See: +# https://docs.gitlab.com/ci/yaml/#interruptible .macos-amd64-test-job: tags: ["macos:sonoma-amd64", "specific:true"] + interruptible: true .macos-arm64-test-job: tags: ["macos:sonoma-arm64", "specific:true"] + interruptible: true diff --git a/.gitlab/e2e.yml b/.gitlab/e2e.yml index f7d05600b10..56eafabf159 100644 --- a/.gitlab/e2e.yml +++ b/.gitlab/e2e.yml @@ -127,6 +127,8 @@ test-integration: needs: [] retry: 2 timeout: 30m + # `interruptible: true` is inherited from the .macos-{arm64,amd64}-test-job mixins; see the + # comment in .gitlab-ci.yml for rationale. artifacts: expire_in: 1 week paths: From d768cfc661fbaecb15499868de1669172c945987 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Wed, 27 May 2026 16:22:28 -0400 Subject: [PATCH 27/56] refactor(panoramic): tighten runtime semantics from review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changes from manual PR review: - airlock Cargo.toml: `tokio`, `tokio-util`, and `tracing` were incorrectly moved under `[target.'cfg(unix)'.dependencies]` when the native driver was added. They're consumed by the Docker driver too, so move them back to unconditional `[dependencies]`. 
Only `libc` is truly Unix-only (used for `killpg` / `SIGTERM` / `SIGKILL` inside `#[cfg(unix)]` blocks in `native.rs`). Added a short comment on the remaining cfg(unix) block. - panoramic config: `default_integration_runtimes` no longer hard-codes `["docker"]`. It now delegates to `default_host_runtime`, so an unspecified `runtimes:` field means 'eligible on this host's native runtime' rather than 'docker, no matter what host you're on'. The four configs that previously relied on the implicit `docker` default now declare `runtimes: [docker]` explicitly so their scope is unchanged across hosts: - adp-rar-disabled (docker-only intentionally, supervisor gap) - dogstatsd-bind-host (Linux-only: hostname -i + /etc/hosts hooks) - dogstatsd-bind-custom-hostname (same) - dogstatsd-forwarding (not yet validated under native_macos) - panoramic config: rename `resolved_runtime` to `active_runtime`. The 'resolved_' prefix made sense when discovery selected from a list of supported runtimes; under the new CLI-scoped model it's just the active runtime that discovery wrote in. Updated the doc to describe the new role. - panoramic config: `runtimes:` doc no longer claims tests are expanded per-runtime — that's a leftover from the pre-CLI-scoped model. - .gitlab/e2e.yml: removed the redundant inline comment about inheriting `interruptible: true`; the rationale lives in .gitlab-ci.yml with the mixin definition. 
--- .gitlab/e2e.yml | 2 -- bin/correctness/airlock/Cargo.toml | 9 ++++-- bin/correctness/panoramic/src/config.rs | 32 +++++++++++-------- .../cases/adp-rar-disabled/config.yaml | 1 + .../config.yaml | 4 +++ .../cases/dogstatsd-bind-host/config.yaml | 4 +++ .../cases/dogstatsd-forwarding/config.yaml | 2 ++ 7 files changed, 35 insertions(+), 19 deletions(-) diff --git a/.gitlab/e2e.yml b/.gitlab/e2e.yml index 56eafabf159..f7d05600b10 100644 --- a/.gitlab/e2e.yml +++ b/.gitlab/e2e.yml @@ -127,8 +127,6 @@ test-integration: needs: [] retry: 2 timeout: 30m - # `interruptible: true` is inherited from the .macos-{arm64,amd64}-test-job mixins; see the - # comment in .gitlab-ci.yml for rationale. artifacts: expire_in: 1 week paths: diff --git a/bin/correctness/airlock/Cargo.toml b/bin/correctness/airlock/Cargo.toml index fe97714f592..97f16969639 100644 --- a/bin/correctness/airlock/Cargo.toml +++ b/bin/correctness/airlock/Cargo.toml @@ -13,9 +13,6 @@ bollard = { workspace = true, features = ["http", "pipe"] } futures = { workspace = true } home = { workspace = true } saluki-error = { workspace = true } - -[target.'cfg(unix)'.dependencies] -libc = { workspace = true } tokio = { workspace = true, features = [ "fs", "io-util", @@ -26,3 +23,9 @@ tokio = { workspace = true, features = [ ] } tokio-util = { workspace = true } tracing = { workspace = true } + +# Native process driver uses libc's killpg/SIGTERM/SIGKILL inside #[cfg(unix)] blocks. The wider +# correctness/integration test suite is only operated on Linux/Docker today, so this gate exists +# more as a forward-looking marker than as something a Windows build actually depends on. 
+[target.'cfg(unix)'.dependencies] +libc = { workspace = true } diff --git a/bin/correctness/panoramic/src/config.rs b/bin/correctness/panoramic/src/config.rs index 4e1a9038354..fc617e8e240 100644 --- a/bin/correctness/panoramic/src/config.rs +++ b/bin/correctness/panoramic/src/config.rs @@ -114,11 +114,13 @@ pub struct IntegrationConfig { /// List of assertion steps to run. pub assertions: Vec, - /// Runtimes under which this test runs. + /// Runtimes under which this test is eligible to run. /// - /// Each value must be either `"docker"` (the default) or `"native_macos"`. When multiple - /// runtimes are declared, the test discovery layer expands the config into one independent - /// test case per runtime, named `{name}/{runtime}`. + /// Each value must be either `"docker"` (the default) or `"native_macos"`. The active + /// runtime for any given panoramic invocation is chosen at the CLI level (`--runtime`, + /// defaulting to the host's native runtime); a test discovers only when this list contains + /// that active runtime. Tests with multiple entries are portable across runtimes, but still + /// execute only once per invocation — in the active runtime. #[serde(default = "default_integration_runtimes")] pub runtimes: Vec, @@ -134,12 +136,14 @@ pub struct IntegrationConfig { #[serde(default)] pub requires_core_agent: bool, - /// Resolved runtime for this specific test instance after discovery-time expansion. + /// Active runtime for this test instance. /// - /// At parse time, this is always empty. The discovery layer sets it when expanding a - /// multi-runtime config into per-runtime instances. + /// Empty at parse time; the discovery layer sets it to whichever runtime the CLI is scoped + /// to (after confirming that runtime is listed in `runtimes`). Used by `Test::run` to + /// dispatch to the right runner and by `Test::runtime` / `Test::images` to report the + /// effective runtime to the CI pipeline generator. 
#[serde(skip)] - pub resolved_runtime: String, + pub active_runtime: String, /// Base path for resolving relative file paths. #[serde(skip)] @@ -147,7 +151,7 @@ pub struct IntegrationConfig { } fn default_integration_runtimes() -> Vec<String> { - vec!["docker".to_string()] + vec![default_host_runtime().to_string()] } /// Runtime identifier for integration tests that run as native (non-containerized) processes. @@ -437,22 +441,22 @@ impl Test for IntegrationConfig { fn images(&self) -> BTreeMap<&str, String> { let mut m = BTreeMap::new(); // The native_macos runtime doesn't require any container image. - if self.resolved_runtime != NATIVE_MACOS_RUNTIME { + if self.active_runtime != NATIVE_MACOS_RUNTIME { m.insert("container", self.container.image.clone()); } m } fn runtime(&self) -> String { - if self.resolved_runtime.is_empty() { + if self.active_runtime.is_empty() { DOCKER_RUNTIME.to_string() } else { - self.resolved_runtime.clone() + self.active_runtime.clone() } } async fn run(&self, tctx: TestContext) -> TestResult { - match self.resolved_runtime.as_str() { + match self.active_runtime.as_str() { NATIVE_MACOS_RUNTIME => { let mut runner = crate::native_runner::NativeIntegrationRunner::new(self.clone(), tctx); runner.run().await @@ -805,7 +809,7 @@ fn try_load_test( return Ok(Vec::new()); } let mut variant = config.clone(); - variant.resolved_runtime = integration_runtime.to_string(); + variant.active_runtime = integration_runtime.to_string(); Ok(vec![Box::new(variant)]) } "correctness" => { diff --git a/test/integration/cases/adp-rar-disabled/config.yaml b/test/integration/cases/adp-rar-disabled/config.yaml index 8d4d0603b84..505c98cbb3d 100644 --- a/test/integration/cases/adp-rar-disabled/config.yaml +++ b/test/integration/cases/adp-rar-disabled/config.yaml @@ -2,6 +2,7 @@ type: integration name: "adp-rar-disabled" description: "Verify ADP gracefully handles RAR being disabled on the Core Agent" timeout: 120s +runtimes: [docker] # Docker-only: the assertion is that ADP
stays up (handles failure gracefully via retry). In # the converged image, s6 restarts ADP on every exit, so 'process stable for 10s' really # means 'container stable for 10s'. There is no equivalent supervisor on the native_macos diff --git a/test/integration/cases/dogstatsd-bind-custom-hostname/config.yaml b/test/integration/cases/dogstatsd-bind-custom-hostname/config.yaml index da16fbe35fc..2e65af84f7e 100644 --- a/test/integration/cases/dogstatsd-bind-custom-hostname/config.yaml +++ b/test/integration/cases/dogstatsd-bind-custom-hostname/config.yaml @@ -24,6 +24,10 @@ type: integration name: "dogstatsd-bind-custom-hostname" description: "Verifies DogStatsD resolves a custom hostname in bind_host via DNS" timeout: 120s +# Docker-only: uses PANORAMIC_DYNAMIC shell hooks that run `hostname -i` and write /etc/hosts +# from inside the container. Neither translates to the native_macos host runtime; re-enable +# once dynamic resolution grows a portable mechanism. +runtimes: [docker] container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/dogstatsd-bind-host/config.yaml b/test/integration/cases/dogstatsd-bind-host/config.yaml index a57bd677789..6d836c7a9f8 100644 --- a/test/integration/cases/dogstatsd-bind-host/config.yaml +++ b/test/integration/cases/dogstatsd-bind-host/config.yaml @@ -15,6 +15,10 @@ type: integration name: "dogstatsd-bind-host" description: "Verifies DogStatsD binds to the address specified by bind_host" timeout: 120s +# Docker-only: uses PANORAMIC_DYNAMIC shell hooks that run `hostname -i` and write /etc/hosts +# from inside the container. Neither translates to the native_macos host runtime; re-enable +# once dynamic resolution grows a portable mechanism. 
+runtimes: [docker] container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/dogstatsd-forwarding/config.yaml b/test/integration/cases/dogstatsd-forwarding/config.yaml index b609196793f..1b08d12d216 100644 --- a/test/integration/cases/dogstatsd-forwarding/config.yaml +++ b/test/integration/cases/dogstatsd-forwarding/config.yaml @@ -2,6 +2,8 @@ type: integration name: "dogstatsd-forwarding" description: "Verifies DogStatsD message forwarding mirrors framed payloads and preserves ingestion" timeout: 90s +# Not yet validated under native_macos; opt in by adding `native_macos` to this list once verified. +runtimes: [docker] container: image: "saluki-images/datadog-agent:testing-devel" From b7765688d0629952c9d4a434d379bee038f7fca5 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Wed, 27 May 2026 16:37:59 -0400 Subject: [PATCH 28/56] build: verify Datadog Agent version in provision-macos-test-env Previously `provision-macos-test-env` skipped reinstallation if any Agent existed at /opt/datadog-agent, regardless of version. That made the target a no-op on a host with a different (potentially incompatible) version, which can produce confusing failures downstream when the integration tests assume specific Agent behavior. Run `/opt/datadog-agent/bin/agent/agent version` if an Agent is present and parse the second whitespace-separated field (e.g. "7.78.0" from "Agent 7.78.0 - Commit: ..."). If it matches MACOS_TEST_AGENT_VERSION, skip reinstall. If it does not, fail with a clear message instructing the user to either remove /opt/datadog-agent or bump MACOS_TEST_AGENT_VERSION. 
--- Makefile | 13 +++++++++++-- bin/correctness/airlock/src/{native.rs => unix.rs} | 0 .../src/{native_runner.rs => unix_runner.rs} | 0 3 files changed, 11 insertions(+), 2 deletions(-) rename bin/correctness/airlock/src/{native.rs => unix.rs} (100%) rename bin/correctness/panoramic/src/{native_runner.rs => unix_runner.rs} (100%) diff --git a/Makefile b/Makefile index 26acbbca1d6..62bcaa18cef 100644 --- a/Makefile +++ b/Makefile @@ -596,12 +596,21 @@ MACOS_TEST_AGENT_DMG_DIR ?= /tmp/saluki-dda-dmg-cache MACOS_TEST_AGENT_DMG_URL ?= https://s3.amazonaws.com/dd-agent/datadog-agent-$(MACOS_TEST_AGENT_VERSION)-1.$(shell uname -m).dmg .PHONY: provision-macos-test-env -provision-macos-test-env: ## Idempotently installs the Datadog Agent at /opt/datadog-agent and bootstraps the IPC cert; required by converged native_macos integration tests. +provision-macos-test-env: ## Idempotently installs the pinned Datadog Agent ($(MACOS_TEST_AGENT_VERSION)) at /opt/datadog-agent and bootstraps the IPC cert. Fails if a different version is already installed. @echo "[*] Provisioning macOS test environment..." @if [ "$(shell uname -s)" != "Darwin" ]; then \ echo "provision-macos-test-env only runs on macOS hosts" >&2; exit 1; \ fi - @if [ ! -x /opt/datadog-agent/bin/agent/agent ]; then \ + @if [ -x /opt/datadog-agent/bin/agent/agent ]; then \ + INSTALLED_VERSION=$$(/opt/datadog-agent/bin/agent/agent version 2>/dev/null | awk '{print $$2}'); \ + if [ "$$INSTALLED_VERSION" = "$(MACOS_TEST_AGENT_VERSION)" ]; then \ + echo "[*] Datadog Agent $$INSTALLED_VERSION already installed (matches expected version)"; \ + else \ + echo "ERROR: installed Datadog Agent version '$$INSTALLED_VERSION' does not match expected '$(MACOS_TEST_AGENT_VERSION)'." >&2; \ + echo " Remove /opt/datadog-agent or update MACOS_TEST_AGENT_VERSION and retry." 
>&2; \ + exit 1; \ + fi; \ + else \ echo "[*] Installing Datadog Agent $(MACOS_TEST_AGENT_VERSION)..."; \ mkdir -p $(MACOS_TEST_AGENT_DMG_DIR); \ DMG_PATH=$(MACOS_TEST_AGENT_DMG_DIR)/datadog-agent-$(MACOS_TEST_AGENT_VERSION).dmg; \ diff --git a/bin/correctness/airlock/src/native.rs b/bin/correctness/airlock/src/unix.rs similarity index 100% rename from bin/correctness/airlock/src/native.rs rename to bin/correctness/airlock/src/unix.rs diff --git a/bin/correctness/panoramic/src/native_runner.rs b/bin/correctness/panoramic/src/unix_runner.rs similarity index 100% rename from bin/correctness/panoramic/src/native_runner.rs rename to bin/correctness/panoramic/src/unix_runner.rs From 6712fcf8ab55e5ba4a8c9bf464f373a4d10e5892 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Wed, 27 May 2026 16:39:42 -0400 Subject: [PATCH 29/56] refactor: rename native/native_macos to mac/unix/host_process "Native" was both confusing (relative to what?) and unspecified (mac? linux? eventually windows?). Replace with names that say what they mean. The split: - The lower-level driver code in airlock and the panoramic runner that uses it are portable across POSIX hosts (Linux + macOS), so they get "unix" naming. Only macOS is exercised today, but the code path works unchanged for any future Unix host opt-in. - The runtime identifier in test configs (the thing users write next to "docker" in `runtimes:` lists) is specifically the macOS-validated variant. It becomes `mac`. A future Linux host-process runtime would be its own identifier (probably `linux`) sharing the same Unix code path. - The boolean on AssertionContext that distinguished the docker runner from "the other one" gets a name that matches what it actually means: `is_host_process` (and `host_process_exit_code` for the exit-code cell).
Concrete renames: - File: airlock/src/native.rs -> airlock/src/unix.rs panoramic/src/native_runner.rs -> panoramic/src/unix_runner.rs - Module: airlock::native -> airlock::unix panoramic::native_runner -> panoramic::unix_runner - Types: NativeProcess -> UnixProcess NativeProcessConfig -> UnixProcessConfig NativeIntegrationRunner -> UnixIntegrationRunner NativeLogSink -> PanoramicLogSink - Fields: AssertionContext::is_native -> is_host_process AssertionContext::native_exit_code -> host_process_exit_code - Const: NATIVE_MACOS_RUNTIME (="native_macos") -> MAC_RUNTIME (="mac") - Make: build-adp-native -> build-adp-host - Test YAML: `runtimes: [docker, native_macos]` -> `runtimes: [docker, mac]` - Vale vocab: removed `native_macos` (no longer referenced). Module docs, inline comments, error messages, and the GitLab job comment all reworded in the same pass. --- .gitlab/e2e.yml | 2 +- .../config/vocabularies/technical/accept.txt | 1 - Makefile | 16 +++-- bin/correctness/airlock/Cargo.toml | 2 +- bin/correctness/airlock/src/lib.rs | 2 +- bin/correctness/airlock/src/unix.rs | 42 +++++++------- .../panoramic/src/assertions/adp_exits.rs | 12 ++-- .../panoramic/src/assertions/file_contains.rs | 5 +- .../panoramic/src/assertions/mod.rs | 10 ++-- .../panoramic/src/assertions/process_exits.rs | 4 +- bin/correctness/panoramic/src/cli.rs | 6 +- bin/correctness/panoramic/src/config.rs | 37 ++++++------ bin/correctness/panoramic/src/main.rs | 2 +- bin/correctness/panoramic/src/runner.rs | 4 +- bin/correctness/panoramic/src/unix_runner.rs | 58 +++++++++---------- .../cases/adp-cmd-port/config.yaml | 2 +- .../cases/adp-config-check-exit/config.yaml | 4 +- .../cases/adp-config-check-warn/config.yaml | 2 +- .../cases/adp-config-stream/config.yaml | 2 +- .../cases/adp-disabled-exit/config.yaml | 2 +- .../adp-logging-default-path/config.yaml | 2 +- .../config.yaml | 2 +- .../config.yaml | 2 +- .../adp-memory-mode-disabled/config.yaml | 2 +- .../config.yaml | 2 +- .../config.yaml | 2 +- 
.../config.yaml | 6 +- .../config.yaml | 2 +- .../cases/adp-no-pipelines-exit/config.yaml | 2 +- .../cases/adp-rar-disabled/config.yaml | 4 +- .../cases/adp-rar-registration/config.yaml | 2 +- .../cases/basic-startup/config.yaml | 2 +- .../cases/dogstatsd-autoscale-udp/config.yaml | 2 +- .../config.yaml | 2 +- .../cases/dogstatsd-bind-host/config.yaml | 2 +- .../cases/dogstatsd-default-bind/config.yaml | 2 +- .../cases/dogstatsd-enabled/config.yaml | 2 +- .../cases/dogstatsd-forwarding/config.yaml | 2 +- .../config.yaml | 2 +- .../cases/otlp-traces-enabled/config.yaml | 2 +- .../privileged-api-endpoints/config.yaml | 2 +- .../cases/telemetry-endpoint/config.yaml | 2 +- .../unprivileged-api-endpoints/config.yaml | 2 +- 43 files changed, 135 insertions(+), 132 deletions(-) diff --git a/.gitlab/e2e.yml b/.gitlab/e2e.yml index f7d05600b10..8d4c61d1425 100644 --- a/.gitlab/e2e.yml +++ b/.gitlab/e2e.yml @@ -117,7 +117,7 @@ test-integration: - docker tag ${SALUKI_IMAGE_REPO_BASE}/bundled-agent-adp:${CI_COMMIT_SHA} saluki-images/datadog-agent:testing-devel - make test-integration-quick -# Runs the subset of integration tests that have opted in to the `native_macos` runtime +# Runs the subset of integration tests that have opted in to the `mac` runtime # directly on a bare-metal macOS runner. No Docker, no virtualization: panoramic spawns ADP # (and the Core Agent for converged tests) as real macOS processes against a per-test temp # state directory. 
The Datadog Agent install at /opt/datadog-agent is provisioned by the diff --git a/.vale/styles/config/vocabularies/technical/accept.txt b/.vale/styles/config/vocabularies/technical/accept.txt index 9024116614b..ad9315178e9 100644 --- a/.vale/styles/config/vocabularies/technical/accept.txt +++ b/.vale/styles/config/vocabularies/technical/accept.txt @@ -229,4 +229,3 @@ mpmc dhat profiler launchd -native_macos diff --git a/Makefile b/Makefile index 62bcaa18cef..e3604760b4a 100644 --- a/Makefile +++ b/Makefile @@ -569,10 +569,10 @@ list-integration-tests: build-panoramic list-integration-tests: ## Lists available ADP integration tests @target/release/panoramic list -d $(shell pwd)/test/integration/cases -.PHONY: build-adp-native -build-adp-native: check-rust-build-tools -build-adp-native: ## Builds the agent-data-plane binary natively for the current host (release profile) - @echo "[*] Building agent-data-plane (release, native host target)..." +.PHONY: build-adp-host +build-adp-host: check-rust-build-tools +build-adp-host: ## Builds the agent-data-plane binary for the current host (release profile) + @echo "[*] Building agent-data-plane (release, host target)..." @APP_FULL_NAME="$(ADP_APP_FULL_NAME)" \ APP_SHORT_NAME="$(ADP_APP_SHORT_NAME)" \ APP_IDENTIFIER="$(ADP_APP_IDENTIFIER)" \ @@ -582,8 +582,8 @@ build-adp-native: ## Builds the agent-data-plane binary natively for the current cargo build --release --bin agent-data-plane .PHONY: test-integration-macos-run -test-integration-macos-run: ## Runs native macOS integration tests using already-built binaries (assumes target/release/{panoramic,agent-data-plane} exist). Defaults to all native_macos-eligible tests; narrow with CASE=. - @echo "[*] Running native macOS integration tests..." +test-integration-macos-run: ## Runs the macOS host-process integration tests using already-built binaries (assumes target/release/{panoramic,agent-data-plane} exist). Defaults to all `mac`-runtime-eligible tests; narrow with CASE=. 
+ @echo "[*] Running macOS host-process integration tests..." @ADP_BINARY_PATH="$(CURDIR)/target/release/agent-data-plane" \ target/release/panoramic run -d "$(CURDIR)/test/integration/cases" \ $(if $(CASE),-t $(CASE)) --no-tui -p 1 \ @@ -624,8 +624,6 @@ provision-macos-test-env: ## Idempotently installs the pinned Datadog Agent ($(M sudo /usr/sbin/installer -pkg "$$PKG" -target / >/dev/null 2>&1 || true; \ sudo hdiutil detach /Volumes/datadog_agent >/dev/null 2>&1; \ test -x /opt/datadog-agent/bin/agent/agent; \ - else \ - echo "[*] Datadog Agent already installed at /opt/datadog-agent"; \ fi @if [ ! -f /opt/datadog-agent/etc/ipc_cert.pem ] || [ ! -f /opt/datadog-agent/etc/auth_token ]; then \ echo "[*] Bootstrapping IPC cert + auth_token by running the Agent briefly..."; \ @@ -651,7 +649,7 @@ provision-macos-test-env: ## Idempotently installs the pinned Datadog Agent ($(M @echo "[*] macOS test environment ready." .PHONY: test-integration-macos-ci -test-integration-macos-ci: build-panoramic build-adp-native provision-macos-test-env test-integration-macos-run ## CI entry point: builds binaries, ensures Agent + cert are provisioned, then runs the native_macos integration tests +test-integration-macos-ci: build-panoramic build-adp-host provision-macos-test-env test-integration-macos-run ## CI entry point: builds binaries, ensures Agent + cert are provisioned, then runs the `mac`-runtime integration tests .PHONY: ensure-rust-miri ensure-rust-miri: diff --git a/bin/correctness/airlock/Cargo.toml b/bin/correctness/airlock/Cargo.toml index 97f16969639..b91d9e46dbf 100644 --- a/bin/correctness/airlock/Cargo.toml +++ b/bin/correctness/airlock/Cargo.toml @@ -24,7 +24,7 @@ tokio = { workspace = true, features = [ tokio-util = { workspace = true } tracing = { workspace = true } -# Native process driver uses libc's killpg/SIGTERM/SIGKILL inside #[cfg(unix)] blocks. The wider +# Unix process driver uses libc's killpg/SIGTERM/SIGKILL inside #[cfg(unix)] blocks. 
The wider # correctness/integration test suite is only operated on Linux/Docker today, so this gate exists # more as a forward-looking marker than as something a Windows build actually depends on. [target.'cfg(unix)'.dependencies] diff --git a/bin/correctness/airlock/src/lib.rs b/bin/correctness/airlock/src/lib.rs index bcf68aa6d38..764bb771690 100644 --- a/bin/correctness/airlock/src/lib.rs +++ b/bin/correctness/airlock/src/lib.rs @@ -1,4 +1,4 @@ pub mod config; pub mod docker; pub mod driver; -pub mod native; +pub mod unix; diff --git a/bin/correctness/airlock/src/unix.rs b/bin/correctness/airlock/src/unix.rs index 867de686124..4d59f96cf58 100644 --- a/bin/correctness/airlock/src/unix.rs +++ b/bin/correctness/airlock/src/unix.rs @@ -1,11 +1,12 @@ -//! Native process driver for non-containerized integration tests. +//! Unix process driver for non-containerized integration tests. //! //! This module mirrors the relevant surface of the Docker [`Driver`][crate::driver::Driver] but //! spawns a local binary instead of a container. It exists so that integration tests can run on -//! macOS hosts where ADP is exercised as a real macOS process rather than inside a Linux -//! container. +//! Unix hosts where ADP is exercised as a real host process rather than inside a container. The +//! code path is portable across POSIX hosts (Linux + macOS); only macOS is exercised today, but +//! the same module is used unchanged when we opt other Unix hosts into the suite. //! -//! Only the small subset of the Docker driver surface needed by the panoramic native runner is +//! Only the small subset of the Docker driver surface needed by the panoramic Unix runner is //! implemented: spawn, log capture, exit watching, and cleanup. use std::{ @@ -26,19 +27,19 @@ use tokio::{ use tokio_util::sync::CancellationToken; use tracing::{debug, warn}; -/// Shared cell that receives the exit code of a spawned [`NativeProcess`]. 
+/// Shared cell that receives the exit code of a spawned [`UnixProcess`]. /// /// The cell is populated by the background exit watcher when the child exits on its own, or by -/// [`NativeProcess::cleanup`] when the test tears down. Consumers (for example, the +/// [`UnixProcess::cleanup`] when the test tears down. Consumers (for example, the /// `adp_exits_with` assertion in panoramic) read the cell after the exit token fires. /// /// The inner `Option` is `None` if the process was terminated by signal rather than exiting /// normally with a status code. pub type ExitCodeCell = Arc>>; -/// Configuration for a native process to spawn. +/// Configuration for a Unix process to spawn. #[derive(Clone)] -pub struct NativeProcessConfig { +pub struct UnixProcessConfig { /// Display name used for logs and reporting. pub name: String, /// Absolute path to the binary to execute. @@ -49,7 +50,7 @@ pub struct NativeProcessConfig { pub env: HashMap, } -impl NativeProcessConfig { +impl UnixProcessConfig { /// Creates a new configuration with the given display name and binary path. pub fn new(name: impl Into, binary_path: impl Into) -> Self { Self { @@ -73,7 +74,7 @@ impl NativeProcessConfig { } } -/// A trait-object-friendly sink for log lines captured from a native process. +/// A trait-object-friendly sink for log lines captured from a Unix process. /// /// This is intentionally minimal so consumers can implement it on their own log buffer type /// without depending on `airlock`. @@ -83,9 +84,9 @@ pub trait LogSink: Send + Sync { fn push_line(&mut self, line: String, is_stderr: bool); } -/// A spawned native process and its supporting tasks. +/// A spawned Unix process and its supporting tasks. /// -/// `NativeProcess` owns the child process plus background tasks that pump stdout/stderr lines +/// `UnixProcess` owns the child process plus background tasks that pump stdout/stderr lines /// into a shared sink and observe the child's exit. 
The provided exit token is cancelled when /// the child process exits on its own (observed by the background watcher) or when /// [`cleanup`][Self::cleanup] is called. The exit code is recorded in the shared @@ -94,8 +95,9 @@ pub trait LogSink: Send + Sync { /// The spawned process is always made the leader of a new process group, so /// [`cleanup`][Self::cleanup] can signal the entire group (parent plus any forked helpers). /// This matters for binaries like the Datadog Core Agent that spawn `trace-agent` / -/// `process-agent` which would otherwise orphan onto launchd when only the parent is killed. -pub struct NativeProcess { +/// `process-agent` which would otherwise orphan onto the init/launchd system supervisor when +/// only the parent is killed. +pub struct UnixProcess { name: String, /// PGID of the spawned process. We made the child the group leader at spawn time, so this /// equals the child's PID. `None` only if spawn failed to return a PID (very rare). @@ -106,11 +108,11 @@ pub struct NativeProcess { exit_task: Option>, } -impl NativeProcess { +impl UnixProcess { /// Spawns the process described by `config`. The provided `log_sink` receives each line of /// captured stdout/stderr; the provided `exit_token` is cancelled when the process exits. 
pub async fn spawn( - config: NativeProcessConfig, log_sink: Arc>, exit_token: CancellationToken, + config: UnixProcessConfig, log_sink: Arc>, exit_token: CancellationToken, ) -> Result { if !config.binary_path.exists() { return Err(generic_error!( @@ -161,11 +163,11 @@ impl NativeProcess { match child.wait().await { Ok(status) => { let code = status.code(); - debug!(name = %name_for_watcher, ?code, "Native process exited."); + debug!(name = %name_for_watcher, ?code, "Unix process exited."); let _ = exit_code_for_watcher.set(code); } Err(e) => { - warn!(name = %name_for_watcher, error = %e, "Failed to wait on native process; treating as exited."); + warn!(name = %name_for_watcher, error = %e, "Failed to wait on Unix process; treating as exited."); let _ = exit_code_for_watcher.set(None); } } @@ -227,12 +229,12 @@ impl NativeProcess { } } -impl Drop for NativeProcess { +impl Drop for UnixProcess { fn drop(&mut self) { if self.exit_task.is_some() { warn!( name = %self.name, - "NativeProcess dropped without explicit cleanup; child may have been killed via kill_on_drop." + "UnixProcess dropped without explicit cleanup; child may have been killed via kill_on_drop." ); } } diff --git a/bin/correctness/panoramic/src/assertions/adp_exits.rs b/bin/correctness/panoramic/src/assertions/adp_exits.rs index ccb5de614a6..ca793a4dde4 100644 --- a/bin/correctness/panoramic/src/assertions/adp_exits.rs +++ b/bin/correctness/panoramic/src/assertions/adp_exits.rs @@ -12,9 +12,9 @@ use crate::{ /// `docker/s6-services/agent-data-plane/finish` when ADP exits. We grep the captured log buffer /// for that line. /// -/// On the `native_macos` runtime there is no supervisor. The native runner observes ADP's child -/// process exit directly and records the exit code in the shared cell on -/// [`AssertionContext::native_exit_code`]. +/// On the `mac` runtime there is no supervisor. 
The Unix runner observes ADP's child process +/// exit directly and records the exit code in the shared cell on +/// [`AssertionContext::host_process_exit_code`]. pub struct AdpExitsWithAssertion { expected_code: i64, timeout: Duration, @@ -38,7 +38,7 @@ impl Assertion for AdpExitsWithAssertion { async fn check(&self, ctx: &AssertionContext) -> AssertionResult { let started = Instant::now(); - if ctx.is_native { + if ctx.is_host_process { self.check_native(ctx, started).await } else { self.check_docker_via_supervisor_log(ctx, started).await @@ -48,13 +48,13 @@ impl Assertion for AdpExitsWithAssertion { impl AdpExitsWithAssertion { async fn check_native(&self, ctx: &AssertionContext, started: Instant) -> AssertionResult { - let cell = match ctx.native_exit_code.as_ref() { + let cell = match ctx.host_process_exit_code.as_ref() { Some(c) => c.clone(), None => { return AssertionResult { name: self.name().to_string(), passed: false, - message: "Native exit code cell not provided in AssertionContext.".to_string(), + message: "Host-process exit code cell not provided in AssertionContext.".to_string(), duration: started.elapsed(), }; } diff --git a/bin/correctness/panoramic/src/assertions/file_contains.rs b/bin/correctness/panoramic/src/assertions/file_contains.rs index 64d4df37377..39a051d5975 100644 --- a/bin/correctness/panoramic/src/assertions/file_contains.rs +++ b/bin/correctness/panoramic/src/assertions/file_contains.rs @@ -95,7 +95,7 @@ impl Assertion for FileContainsAssertion { }; } - let read_result = if ctx.is_native { + let read_result = if ctx.is_host_process { read_file_local(&self.path).await } else { read_file_in_container(&ctx.container_name, &self.path).await @@ -138,7 +138,8 @@ impl Assertion for FileContainsAssertion { /// Reads a file from the host filesystem. 
/// -/// Used by the `native_macos` runtime where ADP runs as a local process and writes log files to +/// Used by the `mac` runtime (and any future host-process runtime) where ADP runs as a local +/// process and writes log files to /// real host paths. Returns the same shape as [`read_file_in_container`]: `Ok(Some(contents))` /// when readable, `Ok(None)` when missing, `Err` for unexpected I/O failures. async fn read_file_local(path: &str) -> Result, String> { diff --git a/bin/correctness/panoramic/src/assertions/mod.rs b/bin/correctness/panoramic/src/assertions/mod.rs index f1c0f3bbc76..2667008ec78 100644 --- a/bin/correctness/panoramic/src/assertions/mod.rs +++ b/bin/correctness/panoramic/src/assertions/mod.rs @@ -109,11 +109,11 @@ pub struct AssertionContext { /// Whether the test is running natively (no container). When `true`, assertions that would /// otherwise reach into a container (for example, reading a file via `docker exec`) should operate /// against the host filesystem / local process instead. - pub is_native: bool, - /// Exit code of the native target process, populated once it exits. `None` on the docker + pub is_host_process: bool, + /// Exit code of the host target process, populated once it exits. `None` on the docker /// path or while the process is still running; `Some(None)` if the process was killed by /// signal; `Some(Some(code))` if it exited normally. - pub native_exit_code: Option, + pub host_process_exit_code: Option, } /// Trait for assertion implementations. @@ -201,9 +201,9 @@ pub fn create_assertion(config: &AssertionConfig) -> Result, /// parallel blocks concurrently. Stops at the first failure (fail-fast), so the returned vector /// is truncated past the failing step. 
/// -/// Used by both the docker and `native_macos` integration runners; the only thing that differs +/// Used by both the docker and `mac` integration runners; the only thing that differs /// between runtimes is how `ctx` is constructed (port mappings come from a Docker driver vs. -/// identity-mapped from the test config; `is_native` and `native_exit_code` flip). +/// identity-mapped from the test config; `is_host_process` and `host_process_exit_code` flip). pub(crate) async fn run_assertion_steps(test_case: &IntegrationConfig, ctx: &AssertionContext) -> Vec { let mut results = Vec::new(); let total_steps = test_case.assertions.len(); diff --git a/bin/correctness/panoramic/src/assertions/process_exits.rs b/bin/correctness/panoramic/src/assertions/process_exits.rs index 067d598cff1..8906599d848 100644 --- a/bin/correctness/panoramic/src/assertions/process_exits.rs +++ b/bin/correctness/panoramic/src/assertions/process_exits.rs @@ -4,9 +4,9 @@ use crate::assertions::{Assertion, AssertionContext, AssertionResult}; /// Assertion that checks the container process exits with a specific exit code. /// -/// Currently implemented only for the docker runtime. The native_macos runtime uses the +/// Currently implemented only for the docker runtime. The `mac` runtime uses the /// runtime-aware [`AdpExitsWithAssertion`][crate::assertions::AdpExitsWithAssertion] instead -/// (which delegates to the per-process exit code cell on native). +/// (which delegates to the per-process exit code cell on host-process runtimes). 
pub struct ProcessExitsWithAssertion { expected_code: i64, timeout: Duration, diff --git a/bin/correctness/panoramic/src/cli.rs b/bin/correctness/panoramic/src/cli.rs index cccbe4109c8..b2afa0ef092 100644 --- a/bin/correctness/panoramic/src/cli.rs +++ b/bin/correctness/panoramic/src/cli.rs @@ -30,9 +30,9 @@ pub struct RunCommand { #[argh(option, short = 't')] pub tests: Option, - /// integration-test runtime to scope discovery to (for example, `docker` or `native_macos`). + /// integration-test runtime to scope discovery to (for example, `docker` or `mac`). /// Only integration tests whose `runtimes:` list contains this value are eligible to run. - /// Defaults to `native_macos` on macOS hosts and `docker` everywhere else. Correctness tests + /// Defaults to `mac` on macOS hosts and `docker` everywhere else. Correctness tests /// are unaffected by this flag. #[argh(option)] pub runtime: Option, @@ -108,7 +108,7 @@ pub struct ListCommand { pub test_dirs: Vec, /// integration-test runtime to scope discovery to. Same semantics as on `run`: defaults to - /// `native_macos` on macOS, `docker` everywhere else. Correctness tests are unaffected. + /// `mac` on macOS, `docker` everywhere else. Correctness tests are unaffected. #[argh(option)] pub runtime: Option, diff --git a/bin/correctness/panoramic/src/config.rs b/bin/correctness/panoramic/src/config.rs index fc617e8e240..b4d37102001 100644 --- a/bin/correctness/panoramic/src/config.rs +++ b/bin/correctness/panoramic/src/config.rs @@ -116,17 +116,17 @@ pub struct IntegrationConfig { /// Runtimes under which this test is eligible to run. /// - /// Each value must be either `"docker"` (the default) or `"native_macos"`. The active + /// Each value must be either `"docker"` (the default) or `"mac"`. The active /// runtime for any given panoramic invocation is chosen at the CLI level (`--runtime`, /// defaulting to the host's native runtime); a test discovers only when this list contains /// that active runtime. 
Tests with multiple entries are portable across runtimes, but still - /// execute only once per invocation — in the active runtime. + /// execute only once per invocation, in the active runtime. #[serde(default = "default_integration_runtimes")] pub runtimes: Vec, /// Whether this test requires a Core Agent process to be running alongside ADP. /// - /// When `true`, the native runtime spawns the Datadog Core Agent as a side process before + /// When `true`, host-process runtimes (such as `mac`) spawn the Datadog Core Agent as a side process before /// starting ADP, sharing a per-test config directory so they communicate over IPC the same /// way they would in production. When `false` (the default), only ADP is spawned (standalone /// mode). @@ -154,20 +154,22 @@ fn default_integration_runtimes() -> Vec { vec![default_host_runtime().to_string()] } -/// Runtime identifier for integration tests that run as native (non-containerized) processes. -pub const NATIVE_MACOS_RUNTIME: &str = "native_macos"; +/// Runtime identifier for integration tests that run as host processes on macOS (no Docker, no +/// virtualization). Validated on macOS only today; future host-process runtimes for other Unix +/// platforms will get their own identifiers (for example, `linux`). +pub const MAC_RUNTIME: &str = "mac"; /// Runtime identifier for integration tests that run inside a Docker container. pub const DOCKER_RUNTIME: &str = "docker"; /// Returns the integration-test runtime that is native to the host OS. /// -/// `native_macos` on macOS hosts, `docker` everywhere else. Used as the default when a panoramic -/// subcommand is invoked without an explicit `--runtime` flag, so that callers on the most common -/// host get the most common runtime without having to spell it out. +/// `mac` on macOS hosts, `docker` everywhere else. 
Used as the default when a panoramic +/// subcommand is invoked without an explicit `--runtime` flag, so that callers on the most +/// common host get the most common runtime without having to spell it out. pub fn default_host_runtime() -> &'static str { if cfg!(target_os = "macos") { - NATIVE_MACOS_RUNTIME + MAC_RUNTIME } else { DOCKER_RUNTIME } @@ -240,8 +242,8 @@ pub enum AssertionConfig { /// On the `docker` runtime the converged image wraps ADP under s6, which keeps the /// container alive across ADP restarts and logs `agent-data-plane exited with code N` from /// `docker/s6-services/agent-data-plane/finish`. This assertion greps the log buffer for - /// that line. On the `native_macos` runtime ADP is spawned directly; the assertion reads - /// the exit code recorded by the native runner when ADP's child process exited. + /// that line. On the `mac` runtime ADP is spawned directly; the assertion reads + /// the exit code recorded by the Unix runner when ADP's child process exited. AdpExitsWith { /// The expected exit code. expected_code: i64, @@ -440,8 +442,9 @@ impl Test for IntegrationConfig { fn images(&self) -> BTreeMap<&str, String> { let mut m = BTreeMap::new(); - // The native_macos runtime doesn't require any container image. - if self.active_runtime != NATIVE_MACOS_RUNTIME { + // Host-process runtimes (such as `mac`) don't need a container image; the test's + // `container.image` field is informational for them. 
+ if self.active_runtime != MAC_RUNTIME { m.insert("container", self.container.image.clone()); } m @@ -457,8 +460,8 @@ impl Test for IntegrationConfig { async fn run(&self, tctx: TestContext) -> TestResult { match self.active_runtime.as_str() { - NATIVE_MACOS_RUNTIME => { - let mut runner = crate::native_runner::NativeIntegrationRunner::new(self.clone(), tctx); + MAC_RUNTIME => { + let mut runner = crate::unix_runner::UnixIntegrationRunner::new(self.clone(), tctx); runner.run().await } // Default to the existing Docker path for "docker" or unset. @@ -794,13 +797,13 @@ fn try_load_test( // Validate every declared runtime up front so a typo in any list surfaces at discovery // time, even on hosts that wouldn't actually run that runtime. for runtime in &config.runtimes { - if runtime != DOCKER_RUNTIME && runtime != NATIVE_MACOS_RUNTIME { + if runtime != DOCKER_RUNTIME && runtime != MAC_RUNTIME { return Err(generic_error!( "integration test '{}' declares unknown runtime '{}' (expected '{}' or '{}')", config.name, runtime, DOCKER_RUNTIME, - NATIVE_MACOS_RUNTIME + MAC_RUNTIME )); } } diff --git a/bin/correctness/panoramic/src/main.rs b/bin/correctness/panoramic/src/main.rs index b4586d20972..f44640cae45 100644 --- a/bin/correctness/panoramic/src/main.rs +++ b/bin/correctness/panoramic/src/main.rs @@ -32,10 +32,10 @@ use self::events::{create_event_channel, TestEvent}; mod reporter; use self::reporter::{OutputFormat, Reporter, TestResult, TestSuiteResult}; -mod native_runner; mod runner; mod test; mod tui; +mod unix_runner; mod utils; #[tokio::main] diff --git a/bin/correctness/panoramic/src/runner.rs b/bin/correctness/panoramic/src/runner.rs index ab13602b967..0cb2a768a1c 100644 --- a/bin/correctness/panoramic/src/runner.rs +++ b/bin/correctness/panoramic/src/runner.rs @@ -811,8 +811,8 @@ impl IntegrationRunner { cancel_token: self.tctx.test_cancel_token(), port_mappings: port_mappings.clone(), container_name: container_name.to_string(), - is_native: false, - 
native_exit_code: None, + is_host_process: false, + host_process_exit_code: None, }; crate::assertions::run_assertion_steps(&self.test_case, &ctx).await } diff --git a/bin/correctness/panoramic/src/unix_runner.rs b/bin/correctness/panoramic/src/unix_runner.rs index 4cc98012d58..2262a961e8d 100644 --- a/bin/correctness/panoramic/src/unix_runner.rs +++ b/bin/correctness/panoramic/src/unix_runner.rs @@ -1,8 +1,9 @@ -//! Native-process integration test runner. +//! Unix-process integration test runner. //! //! This runner is the parallel of [`crate::runner::IntegrationRunner`] but for tests declared -//! with `runtime: native_macos`. Instead of building a Docker container, it spawns binaries -//! directly via [`airlock::native::NativeProcess`] and feeds their stdout/stderr into the same +//! with `runtime: mac` (and, in the future, any other Unix host runtime opted in). Instead of +//! building a Docker container, it spawns binaries directly via +//! [`airlock::unix::UnixProcess`] and feeds their stdout/stderr into the same //! [`LogBuffer`][crate::assertions::LogBuffer] used by the Docker path so the assertions work //! unchanged. //! @@ -13,7 +14,7 @@ //! - **Converged**: the Datadog Core Agent is spawned alongside ADP (when //! `requires_core_agent: true`), sharing a per-test config directory so they authenticate //! over IPC the same way they would in production. See the per-phase comments in -//! [`NativeIntegrationRunner::run`] for the cert/auth_token plumbing. +//! [`UnixIntegrationRunner::run`] for the cert/auth_token plumbing. //! //! # Binary discovery //! 
@@ -29,7 +30,7 @@ use std::{ time::{Duration, Instant}, }; -use airlock::native::{LogSink, NativeProcess, NativeProcessConfig}; +use airlock::unix::{LogSink, UnixProcess, UnixProcessConfig}; use rand::distr::SampleString as _; use saluki_error::{ErrorContext as _, GenericError}; use tokio::sync::{Mutex, RwLock}; @@ -54,14 +55,14 @@ const DEFAULT_CORE_AGENT_BINARY_PATH: &str = "/opt/datadog-agent/bin/agent/agent const CORE_AGENT_IPC_READY_TIMEOUT: Duration = Duration::from_secs(60); const CORE_AGENT_IPC_READY_POLL: Duration = Duration::from_millis(200); -/// Runner for a single native-process integration test case. -pub(crate) struct NativeIntegrationRunner { +/// Runner for a single Unix-process integration test case. +pub(crate) struct UnixIntegrationRunner { test_case: IntegrationConfig, tctx: TestContext, log_buffer: Arc>, } -impl NativeIntegrationRunner { +impl UnixIntegrationRunner { /// Creates a new runner for the given test case. pub(crate) fn new(test_case: IntegrationConfig, tctx: TestContext) -> Self { Self { @@ -77,7 +78,7 @@ impl NativeIntegrationRunner { let test_name = self.test_case.name(); let mut phase_timings = Vec::new(); - info!(test = %test_name, "Starting native integration test case."); + info!(test = %test_name, "Starting Unix integration test case."); // Phase: resolve binary path. let binary_path = match resolve_adp_binary_path() { @@ -110,11 +111,11 @@ impl NativeIntegrationRunner { debug!(test = %test_name, state_dir = %state_dir.display(), "Prepared per-test state directory."); // Only ADP's exit lifecycle is observable to assertions. The Core Agent (when present) - // gets a throwaway token at spawn time — it satisfies `NativeProcess::spawn`'s + // gets a throwaway token at spawn time — it satisfies `UnixProcess::spawn`'s // signature but nothing consumes the resulting cancellation. If the Agent dies // independently it's treated as an environmental fault, not a test signal. 
let adp_exit_token = CancellationToken::new(); - let log_sink: Arc> = Arc::new(Mutex::new(NativeLogSink { + let log_sink: Arc> = Arc::new(Mutex::new(PanoramicLogSink { buf: self.log_buffer.clone(), })); @@ -130,7 +131,7 @@ impl NativeIntegrationRunner { // then spawn ADP with `DD_AUTH_TOKEN_FILE_PATH` pointing at the per-test auth token so // ADP's IPC client uses the same per-test credentials (and ADP's own API server uses // the matching cert). - let mut core_agent: Option = None; + let mut core_agent: Option = None; if self.test_case.requires_core_agent { let agent_spawn_start = Instant::now(); let agent_binary = match resolve_core_agent_binary_path() { @@ -149,7 +150,7 @@ impl NativeIntegrationRunner { let mut agent_env = self.test_case.container.env.clone(); agent_env.insert("DD_AUTH_TOKEN_FILE_PATH".to_string(), auth_token_path.clone()); - let agent_config = NativeProcessConfig::new(format!("{}-core-agent", self.test_case.name), agent_binary) + let agent_config = UnixProcessConfig::new(format!("{}-core-agent", self.test_case.name), agent_binary) .with_args(vec![ "run".to_string(), "-c".to_string(), @@ -157,7 +158,7 @@ impl NativeIntegrationRunner { ]) .with_env_map(agent_env); - let agent = match NativeProcess::spawn(agent_config, log_sink.clone(), CancellationToken::new()).await { + let agent = match UnixProcess::spawn(agent_config, log_sink.clone(), CancellationToken::new()).await { Ok(p) => p, Err(e) => { phase_timings.push(PhaseTiming { @@ -199,11 +200,11 @@ impl NativeIntegrationRunner { // per-test ipc_cert.pem in the same directory). 
adp_env.insert("DD_AUTH_TOKEN_FILE_PATH".to_string(), auth_token_path); } - let process_config = NativeProcessConfig::new(self.test_case.name.clone(), binary_path) + let process_config = UnixProcessConfig::new(self.test_case.name.clone(), binary_path) .with_args(vec!["-c".to_string(), config_path_str, "run".to_string()]) .with_env_map(adp_env); - let process = match NativeProcess::spawn(process_config, log_sink, adp_exit_token.clone()).await { + let process = match UnixProcess::spawn(process_config, log_sink, adp_exit_token.clone()).await { Ok(p) => p, Err(e) => { if let Some(agent) = core_agent.take() { @@ -262,10 +263,10 @@ impl NativeIntegrationRunner { } /// Builds the port mappings for assertions. In the Docker runner this maps container ports - /// to host ports allocated by Docker. On native there is no remapping: a port declared in - /// `exposed_ports` is reachable on the host at the same number. We populate identity entries - /// so the existing `port_listening` assertion (which expects every probed port to appear in - /// the mapping) works unchanged. + /// to host ports allocated by Docker. As a host process there is no remapping: a port + /// declared in `exposed_ports` is reachable on the host at the same number. We populate + /// identity entries so the existing `port_listening` assertion (which expects every probed + /// port to appear in the mapping) works unchanged. 
fn build_port_mappings(&self) -> HashMap { let mut mappings = HashMap::new(); for spec in &self.test_case.container.exposed_ports { @@ -277,8 +278,7 @@ impl NativeIntegrationRunner { } async fn run_assertions( - &self, process_display_name: String, exit_token: CancellationToken, - exit_code_cell: airlock::native::ExitCodeCell, + &self, process_display_name: String, exit_token: CancellationToken, exit_code_cell: airlock::unix::ExitCodeCell, ) -> Vec { let ctx = AssertionContext { log_buffer: self.log_buffer.clone(), @@ -286,8 +286,8 @@ impl NativeIntegrationRunner { cancel_token: self.tctx.test_cancel_token(), port_mappings: self.build_port_mappings(), container_name: process_display_name, - is_native: true, - native_exit_code: Some(exit_code_cell), + is_host_process: true, + host_process_exit_code: Some(exit_code_cell), }; crate::assertions::run_assertion_steps(&self.test_case, &ctx).await } @@ -344,7 +344,7 @@ fn create_test_state_dir() -> Result { let suffix = rand::distr::Alphanumeric .sample_string(&mut rand::rng(), 8) .to_lowercase(); - let dir = std::env::temp_dir().join(format!("panoramic-native-{}", suffix)); + let dir = std::env::temp_dir().join(format!("panoramic-unix-{}", suffix)); std::fs::create_dir_all(&dir) .with_error_context(|| format!("Failed to create state directory '{}'.", dir.display()))?; Ok(dir) @@ -353,7 +353,7 @@ fn create_test_state_dir() -> Result { fn make_error_result( name: String, started: Instant, phase: &str, e: GenericError, phase_timings: Vec, ) -> TestResult { - error!(test = %name, error = %e, phase, "Native integration test setup failed."); + error!(test = %name, error = %e, phase, "Unix integration test setup failed."); TestResult { name, passed: false, @@ -365,12 +365,12 @@ fn make_error_result( } } -/// Bridges [`airlock::native::LogSink`] to the panoramic [`LogBuffer`]. -struct NativeLogSink { +/// Bridges [`airlock::unix::LogSink`] to the panoramic [`LogBuffer`]. 
+struct PanoramicLogSink { buf: Arc>, } -impl LogSink for NativeLogSink { +impl LogSink for PanoramicLogSink { fn push_line(&mut self, line: String, is_stderr: bool) { // Try a non-blocking write first. If contended, spawn a task to defer the write so we // don't stall the log pump (which is itself a tokio task). diff --git a/test/integration/cases/adp-cmd-port/config.yaml b/test/integration/cases/adp-cmd-port/config.yaml index a65236c56ba..e97c3bba76e 100644 --- a/test/integration/cases/adp-cmd-port/config.yaml +++ b/test/integration/cases/adp-cmd-port/config.yaml @@ -13,7 +13,7 @@ type: integration name: "adp-cmd-port" description: "Verifies ADP connects to the correct port when cmd_port is set" timeout: 120s -runtimes: [docker, native_macos] +runtimes: [docker, mac] requires_core_agent: true container: diff --git a/test/integration/cases/adp-config-check-exit/config.yaml b/test/integration/cases/adp-config-check-exit/config.yaml index 7fb4667775d..143c84fe362 100644 --- a/test/integration/cases/adp-config-check-exit/config.yaml +++ b/test/integration/cases/adp-config-check-exit/config.yaml @@ -10,7 +10,7 @@ type: integration name: "adp-config-check-exit" description: "Verify config check exits ADP on high-severity incompatible key" timeout: 120s -runtimes: [docker, native_macos] +runtimes: [docker, mac] requires_core_agent: true container: @@ -32,7 +32,7 @@ assertions: pattern: "incompatible configuration detected" timeout: 30s # Verify ADP actually exited (not just logged the error and continued). On docker this - # observes the s6 supervisor's exit log; on native_macos it observes the process exit code + # observes the s6 supervisor's exit log; on mac it observes the process exit code # directly. Expected code is 1 because the high-severity check returns a non-zero status. 
- type: adp_exits_with expected_code: 1 diff --git a/test/integration/cases/adp-config-check-warn/config.yaml b/test/integration/cases/adp-config-check-warn/config.yaml index 9e1b5cd8a4c..72f8f32ce49 100644 --- a/test/integration/cases/adp-config-check-warn/config.yaml +++ b/test/integration/cases/adp-config-check-warn/config.yaml @@ -7,7 +7,7 @@ type: integration name: "adp-config-check-warn" description: "Verify config check warns on medium-severity incompatible keys without exiting" timeout: 120s -runtimes: [docker, native_macos] +runtimes: [docker, mac] requires_core_agent: true container: diff --git a/test/integration/cases/adp-config-stream/config.yaml b/test/integration/cases/adp-config-stream/config.yaml index ba587c28066..cd747f4e273 100644 --- a/test/integration/cases/adp-config-stream/config.yaml +++ b/test/integration/cases/adp-config-stream/config.yaml @@ -2,7 +2,7 @@ type: integration name: "adp-config-stream" description: "Verify ADP receives configuration from Core Agent via config stream" timeout: 120s -runtimes: [docker, native_macos] +runtimes: [docker, mac] requires_core_agent: true container: diff --git a/test/integration/cases/adp-disabled-exit/config.yaml b/test/integration/cases/adp-disabled-exit/config.yaml index 7f651eac373..f0fdca080ff 100644 --- a/test/integration/cases/adp-disabled-exit/config.yaml +++ b/test/integration/cases/adp-disabled-exit/config.yaml @@ -2,7 +2,7 @@ type: integration name: "adp-disabled-exit" description: "Verify ADP exits cleanly when data plane is not enabled" timeout: 120s -runtimes: [docker, native_macos] +runtimes: [docker, mac] requires_core_agent: true container: diff --git a/test/integration/cases/adp-logging-default-path/config.yaml b/test/integration/cases/adp-logging-default-path/config.yaml index 45b21e7b0e6..ca87955c94d 100644 --- a/test/integration/cases/adp-logging-default-path/config.yaml +++ b/test/integration/cases/adp-logging-default-path/config.yaml @@ -2,7 +2,7 @@ type: integration name: 
"adp-logging-default-path" description: "Verifies ADP writes to the platform-default log file path (/var/log/datadog/agent-data-plane.log) when no override is provided" timeout: 120s -runtimes: [docker, native_macos] +runtimes: [docker, mac] requires_core_agent: true container: diff --git a/test/integration/cases/adp-logging-ignores-core-agent-log-file/config.yaml b/test/integration/cases/adp-logging-ignores-core-agent-log-file/config.yaml index 6dda23c78eb..f65c3d3d45c 100644 --- a/test/integration/cases/adp-logging-ignores-core-agent-log-file/config.yaml +++ b/test/integration/cases/adp-logging-ignores-core-agent-log-file/config.yaml @@ -2,7 +2,7 @@ type: integration name: "adp-logging-ignores-core-agent-log-file" description: "Verifies ADP ignores the Core Agent's `log_file` setting and continues to use its own per-subagent log file path" timeout: 120s -runtimes: [docker, native_macos] +runtimes: [docker, mac] requires_core_agent: true container: diff --git a/test/integration/cases/adp-logging-respects-data-plane-log-file/config.yaml b/test/integration/cases/adp-logging-respects-data-plane-log-file/config.yaml index 7d4abd43414..46c4e6ffc2a 100644 --- a/test/integration/cases/adp-logging-respects-data-plane-log-file/config.yaml +++ b/test/integration/cases/adp-logging-respects-data-plane-log-file/config.yaml @@ -2,7 +2,7 @@ type: integration name: "adp-logging-respects-data-plane-log-file" description: "Verifies ADP honors the per-subagent `data_plane.log_file` setting when explicitly configured" timeout: 120s -runtimes: [docker, native_macos] +runtimes: [docker, mac] requires_core_agent: true container: diff --git a/test/integration/cases/adp-memory-mode-disabled/config.yaml b/test/integration/cases/adp-memory-mode-disabled/config.yaml index 187ca95a3d3..136390cbadd 100644 --- a/test/integration/cases/adp-memory-mode-disabled/config.yaml +++ b/test/integration/cases/adp-memory-mode-disabled/config.yaml @@ -2,7 +2,7 @@ type: integration name: 
"adp-memory-mode-disabled" description: "Verifies that memory limiting is disabled by default and bounds verification is skipped" timeout: 120s -runtimes: [docker, native_macos] +runtimes: [docker, mac] container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/adp-memory-mode-permissive-exceeds-limit/config.yaml b/test/integration/cases/adp-memory-mode-permissive-exceeds-limit/config.yaml index 0682c8a4e57..26c83dd4e1d 100644 --- a/test/integration/cases/adp-memory-mode-permissive-exceeds-limit/config.yaml +++ b/test/integration/cases/adp-memory-mode-permissive-exceeds-limit/config.yaml @@ -2,7 +2,7 @@ type: integration name: "adp-memory-mode-permissive-exceeds-limit" description: "Verifies that permissive mode emits a best-effort warning when the calculated bounds exceed the configured limit, but the process still starts" timeout: 120s -runtimes: [docker, native_macos] +runtimes: [docker, mac] container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/adp-memory-mode-permissive-within-limit/config.yaml b/test/integration/cases/adp-memory-mode-permissive-within-limit/config.yaml index 9ab1d0dfd85..b61d873cc1c 100644 --- a/test/integration/cases/adp-memory-mode-permissive-within-limit/config.yaml +++ b/test/integration/cases/adp-memory-mode-permissive-within-limit/config.yaml @@ -2,7 +2,7 @@ type: integration name: "adp-memory-mode-permissive-within-limit" description: "Verifies that permissive mode succeeds and verifies bounds when the calculated bounds fit within the configured limit" timeout: 120s -runtimes: [docker, native_macos] +runtimes: [docker, mac] container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/adp-memory-mode-strict-exceeds-limit/config.yaml b/test/integration/cases/adp-memory-mode-strict-exceeds-limit/config.yaml index 4faf6831a0d..92efa40f346 100644 --- a/test/integration/cases/adp-memory-mode-strict-exceeds-limit/config.yaml 
+++ b/test/integration/cases/adp-memory-mode-strict-exceeds-limit/config.yaml @@ -2,7 +2,7 @@ type: integration name: "adp-memory-mode-strict-exceeds-limit" description: "Verifies that strict mode causes ADP to exit with code 1 when the calculated bounds exceed the configured limit" timeout: 120s -runtimes: [docker, native_macos] +runtimes: [docker, mac] container: image: "saluki-images/datadog-agent:testing-devel" @@ -19,8 +19,8 @@ container: assertions: # Observe ADP's actual exit code, regardless of runtime. On docker (s6) this greps the - # supervisor's exit log line; on native_macos this reads the exit code recorded by the - # native runner. + # supervisor's exit log line; on mac this reads the exit code recorded by the + # Unix runner. - type: adp_exits_with expected_code: 1 timeout: 30s diff --git a/test/integration/cases/adp-memory-mode-strict-within-limit/config.yaml b/test/integration/cases/adp-memory-mode-strict-within-limit/config.yaml index f40d2ac6e3f..31a8734b016 100644 --- a/test/integration/cases/adp-memory-mode-strict-within-limit/config.yaml +++ b/test/integration/cases/adp-memory-mode-strict-within-limit/config.yaml @@ -2,7 +2,7 @@ type: integration name: "adp-memory-mode-strict-within-limit" description: "Verifies that strict mode succeeds and verifies bounds when the calculated bounds fit within the configured limit" timeout: 120s -runtimes: [docker, native_macos] +runtimes: [docker, mac] container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/adp-no-pipelines-exit/config.yaml b/test/integration/cases/adp-no-pipelines-exit/config.yaml index d876d5680f8..949e8533531 100644 --- a/test/integration/cases/adp-no-pipelines-exit/config.yaml +++ b/test/integration/cases/adp-no-pipelines-exit/config.yaml @@ -2,7 +2,7 @@ type: integration name: "adp-no-pipelines-exit" description: "Verify ADP exits with error when no data pipelines enabled" timeout: 120s -runtimes: [docker, native_macos] +runtimes: [docker, mac] 
container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/adp-rar-disabled/config.yaml b/test/integration/cases/adp-rar-disabled/config.yaml index 505c98cbb3d..3c3987cc7b9 100644 --- a/test/integration/cases/adp-rar-disabled/config.yaml +++ b/test/integration/cases/adp-rar-disabled/config.yaml @@ -5,8 +5,8 @@ timeout: 120s runtimes: [docker] # Docker-only: the assertion is that ADP stays up (handles failure gracefully via retry). In # the converged image, s6 restarts ADP on every exit, so 'process stable for 10s' really -# means 'container stable for 10s'. There is no equivalent supervisor on the native_macos -# runtime, so this assertion does not translate; re-enable once the native runner grows a +# means 'container stable for 10s'. There is no equivalent supervisor on the mac +# runtime, so this assertion does not translate; re-enable once the Unix runner grows a # supervisor or the test is rewritten to assert on retry behavior directly. container: diff --git a/test/integration/cases/adp-rar-registration/config.yaml b/test/integration/cases/adp-rar-registration/config.yaml index 2d8580504c4..3e7a8abe83b 100644 --- a/test/integration/cases/adp-rar-registration/config.yaml +++ b/test/integration/cases/adp-rar-registration/config.yaml @@ -2,7 +2,7 @@ type: integration name: "adp-rar-registration" description: "Verify ADP successfully registers with Remote Agent Registry" timeout: 120s -runtimes: [docker, native_macos] +runtimes: [docker, mac] requires_core_agent: true container: diff --git a/test/integration/cases/basic-startup/config.yaml b/test/integration/cases/basic-startup/config.yaml index cd1e1adc7c6..999b4f3ef80 100644 --- a/test/integration/cases/basic-startup/config.yaml +++ b/test/integration/cases/basic-startup/config.yaml @@ -2,7 +2,7 @@ type: integration name: "basic-startup" description: "Verifies ADP starts successfully and remains stable" timeout: 120s -runtimes: [docker, native_macos] +runtimes: [docker, 
mac] container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/dogstatsd-autoscale-udp/config.yaml b/test/integration/cases/dogstatsd-autoscale-udp/config.yaml index 915f50aa6fb..5465b6d7e13 100644 --- a/test/integration/cases/dogstatsd-autoscale-udp/config.yaml +++ b/test/integration/cases/dogstatsd-autoscale-udp/config.yaml @@ -2,7 +2,7 @@ type: integration name: "dogstatsd-autoscale-udp" description: "Verifies DogStatsD UDP listener autoscaling (SO_REUSEPORT) starts cleanly on Linux" timeout: 120s -runtimes: [docker, native_macos] +runtimes: [docker, mac] container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/dogstatsd-bind-custom-hostname/config.yaml b/test/integration/cases/dogstatsd-bind-custom-hostname/config.yaml index 2e65af84f7e..0868807a8cf 100644 --- a/test/integration/cases/dogstatsd-bind-custom-hostname/config.yaml +++ b/test/integration/cases/dogstatsd-bind-custom-hostname/config.yaml @@ -25,7 +25,7 @@ name: "dogstatsd-bind-custom-hostname" description: "Verifies DogStatsD resolves a custom hostname in bind_host via DNS" timeout: 120s # Docker-only: uses PANORAMIC_DYNAMIC shell hooks that run `hostname -i` and write /etc/hosts -# from inside the container. Neither translates to the native_macos host runtime; re-enable +# from inside the container. Neither translates to the mac host runtime; re-enable # once dynamic resolution grows a portable mechanism. 
runtimes: [docker] diff --git a/test/integration/cases/dogstatsd-bind-host/config.yaml b/test/integration/cases/dogstatsd-bind-host/config.yaml index 6d836c7a9f8..169a63fae3d 100644 --- a/test/integration/cases/dogstatsd-bind-host/config.yaml +++ b/test/integration/cases/dogstatsd-bind-host/config.yaml @@ -16,7 +16,7 @@ name: "dogstatsd-bind-host" description: "Verifies DogStatsD binds to the address specified by bind_host" timeout: 120s # Docker-only: uses PANORAMIC_DYNAMIC shell hooks that run `hostname -i` and write /etc/hosts -# from inside the container. Neither translates to the native_macos host runtime; re-enable +# from inside the container. Neither translates to the mac host runtime; re-enable # once dynamic resolution grows a portable mechanism. runtimes: [docker] diff --git a/test/integration/cases/dogstatsd-default-bind/config.yaml b/test/integration/cases/dogstatsd-default-bind/config.yaml index 03a4151066f..4b85d7ce231 100644 --- a/test/integration/cases/dogstatsd-default-bind/config.yaml +++ b/test/integration/cases/dogstatsd-default-bind/config.yaml @@ -12,7 +12,7 @@ type: integration name: "dogstatsd-default-bind" description: "Verifies DogStatsD binds to 127.0.0.1 by default when bind_host is not configured" timeout: 120s -runtimes: [docker, native_macos] +runtimes: [docker, mac] container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/dogstatsd-enabled/config.yaml b/test/integration/cases/dogstatsd-enabled/config.yaml index 6feecc21088..b7e61b58f1d 100644 --- a/test/integration/cases/dogstatsd-enabled/config.yaml +++ b/test/integration/cases/dogstatsd-enabled/config.yaml @@ -2,7 +2,7 @@ type: integration name: "dogstatsd-enabled" description: "Verifies DogStatsD pipeline starts and listens on UDP port" timeout: 120s -runtimes: [docker, native_macos] +runtimes: [docker, mac] container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/dogstatsd-forwarding/config.yaml 
b/test/integration/cases/dogstatsd-forwarding/config.yaml index 1b08d12d216..b2f457d9476 100644 --- a/test/integration/cases/dogstatsd-forwarding/config.yaml +++ b/test/integration/cases/dogstatsd-forwarding/config.yaml @@ -2,7 +2,7 @@ type: integration name: "dogstatsd-forwarding" description: "Verifies DogStatsD message forwarding mirrors framed payloads and preserves ingestion" timeout: 90s -# Not yet validated under native_macos; opt in by adding `native_macos` to this list once verified. +# Not yet validated under mac; opt in by adding `mac` to this list once verified. runtimes: [docker] container: diff --git a/test/integration/cases/dogstatsd-non-local-overrides-bind-host/config.yaml b/test/integration/cases/dogstatsd-non-local-overrides-bind-host/config.yaml index a3b6286e8ea..ed58215c481 100644 --- a/test/integration/cases/dogstatsd-non-local-overrides-bind-host/config.yaml +++ b/test/integration/cases/dogstatsd-non-local-overrides-bind-host/config.yaml @@ -14,7 +14,7 @@ type: integration name: "dogstatsd-non-local-overrides-bind-host" description: "Verifies dogstatsd_non_local_traffic takes precedence over bind_host" timeout: 120s -runtimes: [docker, native_macos] +runtimes: [docker, mac] container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/otlp-traces-enabled/config.yaml b/test/integration/cases/otlp-traces-enabled/config.yaml index f3cff643ae0..8a482db57c8 100644 --- a/test/integration/cases/otlp-traces-enabled/config.yaml +++ b/test/integration/cases/otlp-traces-enabled/config.yaml @@ -2,7 +2,7 @@ type: integration name: "otlp-traces-enabled" description: "Verifies OTLP pipeline starts with native trace handling and proxying for metrics/logs" timeout: 120s -runtimes: [docker, native_macos] +runtimes: [docker, mac] container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/privileged-api-endpoints/config.yaml b/test/integration/cases/privileged-api-endpoints/config.yaml 
index d8e1e4ac80b..c72e146f419 100644 --- a/test/integration/cases/privileged-api-endpoints/config.yaml +++ b/test/integration/cases/privileged-api-endpoints/config.yaml @@ -2,7 +2,7 @@ type: integration name: "privileged-api-endpoints" description: "Verifies the logging and metrics override routes are exposed on the privileged API after the workers assert them dynamically." timeout: 120s -runtimes: [docker, native_macos] +runtimes: [docker, mac] container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/telemetry-endpoint/config.yaml b/test/integration/cases/telemetry-endpoint/config.yaml index ac13b2c420c..22f5b38c588 100644 --- a/test/integration/cases/telemetry-endpoint/config.yaml +++ b/test/integration/cases/telemetry-endpoint/config.yaml @@ -2,7 +2,7 @@ type: integration name: "telemetry-endpoint" description: "Verifies the internal telemetry routes are exposed on the unprivileged API endpoint" timeout: 120s -runtimes: [docker, native_macos] +runtimes: [docker, mac] container: image: "saluki-images/datadog-agent:testing-devel" diff --git a/test/integration/cases/unprivileged-api-endpoints/config.yaml b/test/integration/cases/unprivileged-api-endpoints/config.yaml index 3889b408349..d5014bc3edb 100644 --- a/test/integration/cases/unprivileged-api-endpoints/config.yaml +++ b/test/integration/cases/unprivileged-api-endpoints/config.yaml @@ -2,7 +2,7 @@ type: integration name: "unprivileged-api-endpoints" description: "Verifies the /ready, /live, and /memory/status endpoints are accessible on the unprivileged API" timeout: 120s -runtimes: [docker, native_macos] +runtimes: [docker, mac] container: image: "saluki-images/datadog-agent:testing-devel" From b56532f4d64fa7ed944259bd3242903fc17649c2 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Thu, 28 May 2026 11:00:43 -0400 Subject: [PATCH 30/56] build: install Datadog Agent into /tmp sandbox; centralize test port isolation Two coordinated changes so the integration tests 
can run on a CI runner that already has a Datadog Agent installed and possibly running, without us mutating /opt/datadog-agent or fighting it for ports. provision-macos-test-env no longer touches /opt/datadog-agent at all. Instead it extracts the pinned-version Agent pkg into a sandbox under /tmp/saluki-dda/datadog-agent via 'hdiutil attach' + 'pkgutil --expand-full' (neither needs sudo). The Agent's Python home and IPC paths are dynamically resolved relative to the binary's location, so the relocated install works unchanged. Plumbing: - New Make var MACOS_TEST_AGENT_INSTALL_DIR (default /tmp/saluki-dda/datadog-agent). The provision target idempotently reuses an existing sandbox install if it matches the pinned version. - test-integration-macos-run sets CORE_AGENT_BINARY_PATH to the sandbox path; panoramic's default also moves here for parity. - unix_runner sets DD_RUN_PATH on the Core Agent env so the relocated install writes its remote-config db / pid file / sockets into the per-test state dir instead of the install prefix's run/ dir. - .gitlab/e2e.yml before_script pkill now matches both /opt and /tmp install paths so a leftover from either flavor gets cleaned up. Every default port the test target binds is now shifted off its canonical value via env vars set by the framework on both runtimes (unix_runner for mac, docker IntegrationRunner for docker). The shape of the shift is: take the default 4-digit port and prepend a '5': Agent: 5001->55001, 5002->-1 (GUI disabled), 5000->55000, 5004->55004, 6062->56062, 8125->58125, 8126->58126 ADP: 5100->55100, 5101->55101, 5102->55102 OTLP: 4317->54317, 4318->54318 Test container.env still wins over the framework defaults, so tests like adp-cmd-port that validate non-default port handling supply their own value. The Python harness in dogstatsd-forwarding follows the new DSD port. All assertions / exposed_ports / endpoint URLs across the test suite updated to the shifted port table. 
--- .gitlab/e2e.yml | 15 ++- Makefile | 75 ++++++----- bin/correctness/panoramic/src/runner.rs | 11 +- bin/correctness/panoramic/src/unix_runner.rs | 116 ++++++++++++++++-- .../cases/adp-cmd-port/config.yaml | 10 +- .../cases/adp-config-stream/config.yaml | 3 +- .../adp-logging-default-path/config.yaml | 3 +- .../config.yaml | 3 +- .../config.yaml | 3 +- .../cases/adp-rar-disabled/config.yaml | 3 +- .../cases/adp-rar-registration/config.yaml | 3 +- .../cases/dogstatsd-autoscale-udp/config.yaml | 5 +- .../config.yaml | 2 +- .../cases/dogstatsd-bind-host/config.yaml | 2 +- .../cases/dogstatsd-default-bind/config.yaml | 2 +- .../cases/dogstatsd-enabled/config.yaml | 5 +- .../cases/dogstatsd-forwarding/config.yaml | 5 +- .../run_forwarding_test.py | 8 +- .../config.yaml | 2 +- .../cases/otlp-traces-enabled/config.yaml | 8 +- .../privileged-api-endpoints/config.yaml | 14 +-- .../cases/telemetry-endpoint/config.yaml | 8 +- .../unprivileged-api-endpoints/config.yaml | 8 +- 23 files changed, 217 insertions(+), 97 deletions(-) diff --git a/.gitlab/e2e.yml b/.gitlab/e2e.yml index 8d4c61d1425..c46d3c6d2d9 100644 --- a/.gitlab/e2e.yml +++ b/.gitlab/e2e.yml @@ -120,8 +120,10 @@ test-integration: # Runs the subset of integration tests that have opted in to the `mac` runtime # directly on a bare-metal macOS runner. No Docker, no virtualization: panoramic spawns ADP # (and the Core Agent for converged tests) as real macOS processes against a per-test temp -# state directory. The Datadog Agent install at /opt/datadog-agent is provisioned by the -# Makefile target (idempotent: re-uses an existing install if the runner has one). +# state directory. The Datadog Agent is installed into a sandbox under /tmp/saluki-dda by +# the Makefile target (idempotent: re-uses the install across runs if the pinned version is +# already present). The sandbox install never touches /opt/datadog-agent, so any system +# install on the runner is left alone. 
.test-integration-macos-base: stage: e2e needs: [] @@ -137,9 +139,12 @@ test-integration: before_script: # Defensive: clean up any leftover Datadog Agent processes from prior runs on this shared # runner before we begin. Otherwise a stranded trace-agent/process-agent can hold ports - # (e.g., 8126) and break the first converged test. - - sudo pkill -9 -f /opt/datadog-agent/bin/agent/agent || true - - sudo pkill -9 -f /opt/datadog-agent/embedded/bin/ || true + # (e.g., 8126) and break the first converged test. Catch both: + # - any pre-existing system install at /opt/datadog-agent (we don't use it, but a + # conflicting one could still be running) + # - our own sandbox install under /tmp/saluki-dda from a prior run on this shared runner + - sudo pkill -9 -f /opt/datadog-agent/ || true + - sudo pkill -9 -f /tmp/saluki-dda/ || true script: - make test-integration-macos-ci diff --git a/Makefile b/Makefile index e3604760b4a..1c934a50c0f 100644 --- a/Makefile +++ b/Makefile @@ -585,6 +585,7 @@ build-adp-host: ## Builds the agent-data-plane binary for the current host (rele test-integration-macos-run: ## Runs the macOS host-process integration tests using already-built binaries (assumes target/release/{panoramic,agent-data-plane} exist). Defaults to all `mac`-runtime-eligible tests; narrow with CASE=. @echo "[*] Running macOS host-process integration tests..." 
@ADP_BINARY_PATH="$(CURDIR)/target/release/agent-data-plane" \ + CORE_AGENT_BINARY_PATH="$(MACOS_TEST_AGENT_INSTALL_DIR)/bin/agent/agent" \ target/release/panoramic run -d "$(CURDIR)/test/integration/cases" \ $(if $(CASE),-t $(CASE)) --no-tui -p 1 \ $(if $(PANORAMIC_LOG_DIR),-l $(PANORAMIC_LOG_DIR)) @@ -594,59 +595,71 @@ test-integration-macos-run: ## Runs the macOS host-process integration tests usi MACOS_TEST_AGENT_VERSION ?= 7.78.0 MACOS_TEST_AGENT_DMG_DIR ?= /tmp/saluki-dda-dmg-cache MACOS_TEST_AGENT_DMG_URL ?= https://s3.amazonaws.com/dd-agent/datadog-agent-$(MACOS_TEST_AGENT_VERSION)-1.$(shell uname -m).dmg +# Sandbox directory the Agent is installed into. Deliberately *not* /opt/datadog-agent: keeping +# our install isolated from any pre-existing system install (which a CI runner or developer host +# may already have at a different, conflicting version) avoids surprises in both directions. +MACOS_TEST_AGENT_INSTALL_DIR ?= /tmp/saluki-dda/datadog-agent .PHONY: provision-macos-test-env -provision-macos-test-env: ## Idempotently installs the pinned Datadog Agent ($(MACOS_TEST_AGENT_VERSION)) at /opt/datadog-agent and bootstraps the IPC cert. Fails if a different version is already installed. +provision-macos-test-env: ## Installs the pinned Datadog Agent ($(MACOS_TEST_AGENT_VERSION)) into $(MACOS_TEST_AGENT_INSTALL_DIR) (a sandbox under /tmp) and bootstraps the IPC cert. Idempotent: re-uses the install if it already matches the pinned version. @echo "[*] Provisioning macOS test environment..." 
@if [ "$(shell uname -s)" != "Darwin" ]; then \ echo "provision-macos-test-env only runs on macOS hosts" >&2; exit 1; \ fi - @if [ -x /opt/datadog-agent/bin/agent/agent ]; then \ - INSTALLED_VERSION=$$(/opt/datadog-agent/bin/agent/agent version 2>/dev/null | awk '{print $$2}'); \ - if [ "$$INSTALLED_VERSION" = "$(MACOS_TEST_AGENT_VERSION)" ]; then \ - echo "[*] Datadog Agent $$INSTALLED_VERSION already installed (matches expected version)"; \ - else \ - echo "ERROR: installed Datadog Agent version '$$INSTALLED_VERSION' does not match expected '$(MACOS_TEST_AGENT_VERSION)'." >&2; \ - echo " Remove /opt/datadog-agent or update MACOS_TEST_AGENT_VERSION and retry." >&2; \ - exit 1; \ - fi; \ + @if [ -x $(MACOS_TEST_AGENT_INSTALL_DIR)/bin/agent/agent ] && \ + [ "$$($(MACOS_TEST_AGENT_INSTALL_DIR)/bin/agent/agent version 2>/dev/null | awk '{print $$2}')" = "$(MACOS_TEST_AGENT_VERSION)" ]; then \ + echo "[*] Datadog Agent $(MACOS_TEST_AGENT_VERSION) already extracted to $(MACOS_TEST_AGENT_INSTALL_DIR)"; \ else \ - echo "[*] Installing Datadog Agent $(MACOS_TEST_AGENT_VERSION)..."; \ + echo "[*] Installing Datadog Agent $(MACOS_TEST_AGENT_VERSION) into $(MACOS_TEST_AGENT_INSTALL_DIR)..."; \ mkdir -p $(MACOS_TEST_AGENT_DMG_DIR); \ DMG_PATH=$(MACOS_TEST_AGENT_DMG_DIR)/datadog-agent-$(MACOS_TEST_AGENT_VERSION).dmg; \ if [ ! 
-f "$$DMG_PATH" ]; then \ curl -fL "$(MACOS_TEST_AGENT_DMG_URL)" -o "$$DMG_PATH"; \ fi; \ - sudo hdiutil detach /Volumes/datadog_agent 2>/dev/null || true; \ - sudo hdiutil attach "$$DMG_PATH" -mountpoint /Volumes/datadog_agent -nobrowse >/dev/null; \ - PKG=$$(find /Volumes/datadog_agent -name '*.pkg' | head -1); \ - echo "[*] Running installer (postinstall may fail; the binaries we need are written before postinstall runs)"; \ - sudo /usr/sbin/installer -pkg "$$PKG" -target / >/dev/null 2>&1 || true; \ - sudo hdiutil detach /Volumes/datadog_agent >/dev/null 2>&1; \ - test -x /opt/datadog-agent/bin/agent/agent; \ + MOUNT_DIR=$$(mktemp -d /tmp/saluki-dda-mount-XXXXXX); \ + hdiutil detach "$$MOUNT_DIR" 2>/dev/null || true; \ + hdiutil attach "$$DMG_PATH" -mountpoint "$$MOUNT_DIR" -nobrowse >/dev/null; \ + PKG=$$(find "$$MOUNT_DIR" -name '*.pkg' | head -1); \ + EXPAND_DIR=$$(mktemp -d /tmp/saluki-dda-expand-XXXXXX); \ + rm -rf "$$EXPAND_DIR"; \ + pkgutil --expand-full "$$PKG" "$$EXPAND_DIR" >/dev/null; \ + hdiutil detach "$$MOUNT_DIR" >/dev/null; \ + rmdir "$$MOUNT_DIR" 2>/dev/null || true; \ + PAYLOAD_DIR=$$(find "$$EXPAND_DIR" -type d -name Payload | head -1); \ + if [ -z "$$PAYLOAD_DIR" ] || [ ! -x "$$PAYLOAD_DIR/bin/agent/agent" ]; then \ + echo "ERROR: pkg payload did not contain bin/agent/agent. Expanded layout:" >&2; \ + find "$$EXPAND_DIR" -maxdepth 3 -type d >&2; \ + exit 1; \ + fi; \ + rm -rf $(MACOS_TEST_AGENT_INSTALL_DIR); \ + mkdir -p $$(dirname $(MACOS_TEST_AGENT_INSTALL_DIR)); \ + mv "$$PAYLOAD_DIR" $(MACOS_TEST_AGENT_INSTALL_DIR); \ + rm -rf "$$EXPAND_DIR"; \ + test -x $(MACOS_TEST_AGENT_INSTALL_DIR)/bin/agent/agent; \ fi - @if [ ! -f /opt/datadog-agent/etc/ipc_cert.pem ] || [ ! -f /opt/datadog-agent/etc/auth_token ]; then \ + @if [ ! -f $(MACOS_TEST_AGENT_INSTALL_DIR)/etc/ipc_cert.pem ] || [ ! 
-f $(MACOS_TEST_AGENT_INSTALL_DIR)/etc/auth_token ]; then \ echo "[*] Bootstrapping IPC cert + auth_token by running the Agent briefly..."; \ - sudo mkdir -p /opt/datadog-agent/run /opt/datadog-agent/etc; \ - sudo DD_API_KEY=bootstrap DD_HOSTNAME=bootstrap /opt/datadog-agent/bin/agent/agent run -c /opt/datadog-agent/etc >/tmp/saluki-agent-bootstrap.log 2>&1 & \ + mkdir -p $(MACOS_TEST_AGENT_INSTALL_DIR)/etc $(MACOS_TEST_AGENT_INSTALL_DIR)/run; \ + DD_API_KEY=bootstrap DD_HOSTNAME=bootstrap \ + DD_RUN_PATH=$(MACOS_TEST_AGENT_INSTALL_DIR)/run \ + DD_CMD_PORT=55001 DD_GUI_PORT=-1 \ + DD_EXPVAR_PORT=55000 DD_APM_RECEIVER_PORT=58126 \ + DD_PROCESS_CONFIG_CMD_PORT=56062 DD_AGENT_IPC_PORT=55004 \ + DD_DOGSTATSD_PORT=58125 \ + $(MACOS_TEST_AGENT_INSTALL_DIR)/bin/agent/agent run -c $(MACOS_TEST_AGENT_INSTALL_DIR)/etc >/tmp/saluki-agent-bootstrap.log 2>&1 & \ AGENT_PID=$$!; \ for i in $$(seq 1 30); do \ sleep 1; \ - if [ -f /opt/datadog-agent/etc/ipc_cert.pem ] && [ -f /opt/datadog-agent/etc/auth_token ]; then break; fi; \ + if [ -f $(MACOS_TEST_AGENT_INSTALL_DIR)/etc/ipc_cert.pem ] && [ -f $(MACOS_TEST_AGENT_INSTALL_DIR)/etc/auth_token ]; then break; fi; \ done; \ - sudo kill $$AGENT_PID 2>/dev/null || true; \ + kill $$AGENT_PID 2>/dev/null || true; \ wait $$AGENT_PID 2>/dev/null || true; \ - test -f /opt/datadog-agent/etc/ipc_cert.pem; \ - else \ - echo "[*] IPC cert already present at /opt/datadog-agent/etc/ipc_cert.pem"; \ - fi - @echo "[*] Ensuring cert/auth_token readable by current user..." - @if ! cat /opt/datadog-agent/etc/ipc_cert.pem >/dev/null 2>&1 || ! 
cat /opt/datadog-agent/etc/auth_token >/dev/null 2>&1; then \ - sudo chown $$(whoami) /opt/datadog-agent/etc/ipc_cert.pem /opt/datadog-agent/etc/auth_token; \ + test -f $(MACOS_TEST_AGENT_INSTALL_DIR)/etc/ipc_cert.pem; \ else \ - echo "[*] Files already readable by $$(whoami)."; \ + echo "[*] IPC cert already present at $(MACOS_TEST_AGENT_INSTALL_DIR)/etc/ipc_cert.pem"; \ fi @echo "[*] macOS test environment ready." + @echo "[*] Agent binary: $(MACOS_TEST_AGENT_INSTALL_DIR)/bin/agent/agent" .PHONY: test-integration-macos-ci test-integration-macos-ci: build-panoramic build-adp-host provision-macos-test-env test-integration-macos-run ## CI entry point: builds binaries, ensures Agent + cert are provisioned, then runs the `mac`-runtime integration tests diff --git a/bin/correctness/panoramic/src/runner.rs b/bin/correctness/panoramic/src/runner.rs index 0cb2a768a1c..07cb06ebd75 100644 --- a/bin/correctness/panoramic/src/runner.rs +++ b/bin/correctness/panoramic/src/runner.rs @@ -693,8 +693,15 @@ impl IntegrationRunner { async fn build_driver_config(&self) -> Result { let container = &self.test_case.container; - // Convert env vars to the format expected by airlock. - let env_vars: Vec = container.env.iter().map(|(k, v)| format!("{}={}", k, v)).collect(); + // Merge framework-level port-isolation env vars with the test's own env. Framework + // defaults are applied first so the test's `container.env` (and any explicit override) + // takes precedence. Keeps the test surface consistent across the docker and `mac` + // runtimes — both see the same shifted port table. + let mut merged_env = crate::unix_runner::test_port_isolation_env(); + for (k, v) in &container.env { + merged_env.insert(k.clone(), v.clone()); + } + let env_vars: Vec = merged_env.iter().map(|(k, v)| format!("{}={}", k, v)).collect(); // Build the target config. 
let target_config = airlock::config::TargetConfig { diff --git a/bin/correctness/panoramic/src/unix_runner.rs b/bin/correctness/panoramic/src/unix_runner.rs index 2262a961e8d..34dc2e558db 100644 --- a/bin/correctness/panoramic/src/unix_runner.rs +++ b/bin/correctness/panoramic/src/unix_runner.rs @@ -21,7 +21,9 @@ //! - ADP: `ADP_BINARY_PATH` env var, default `target/release/agent-data-plane` (resolved //! relative to the current working directory). //! - Core Agent (converged only): `CORE_AGENT_BINARY_PATH` env var, default -//! `/opt/datadog-agent/bin/agent/agent`. +//! `/tmp/saluki-dda/datadog-agent/bin/agent/agent` (the sandbox install written by +//! `make provision-macos-test-env`). Set the env var explicitly to point at a different +//! install (for example, a system-wide `/opt/datadog-agent` on a developer host). use std::{ collections::HashMap, @@ -48,13 +50,92 @@ const ADP_BINARY_ENV_VAR: &str = "ADP_BINARY_PATH"; const DEFAULT_ADP_BINARY_PATH: &str = "target/release/agent-data-plane"; const CORE_AGENT_BINARY_ENV_VAR: &str = "CORE_AGENT_BINARY_PATH"; -const DEFAULT_CORE_AGENT_BINARY_PATH: &str = "/opt/datadog-agent/bin/agent/agent"; +const DEFAULT_CORE_AGENT_BINARY_PATH: &str = "/tmp/saluki-dda/datadog-agent/bin/agent/agent"; /// How long to wait for the Core Agent to write its `auth_token` and `ipc_cert.pem` before /// giving up and failing the test. const CORE_AGENT_IPC_READY_TIMEOUT: Duration = Duration::from_secs(60); const CORE_AGENT_IPC_READY_POLL: Duration = Duration::from_millis(200); +/// Framework-level env overrides that move every default-port the test target binds off its +/// canonical value so the test Core Agent + ADP can coexist with anything else listening on +/// those ports (e.g., a running system Datadog Agent on a shared CI runner). Tests can +/// override any of these via `container.env`; tests that test specific port behavior +/// (`adp-cmd-port`) supply their own values. 
+/// +/// Naming convention: every default port that's 4 digits gets a `5` prepended (8125 -> 58125, +/// 5001 -> 55001, etc.). The GUI is disabled outright since we don't exercise it. +/// +/// Note on scope: this also covers ADP-side ports (the `data_plane.*_listen_*` listen +/// addresses and the OTLP receiver endpoints). Those don't conflict with the system Agent +/// today — the system Agent doesn't bind them — but we shift them anyway so the test surface +/// has a single consistent port table, and so future port additions on either side don't +/// silently regress. +pub fn test_port_isolation_env() -> HashMap { + HashMap::from([ + // ----- Core Agent ports ----- + // CMD/IPC API. Shared key with ADP (used as the IPC client's destination port). + ("DD_CMD_PORT".to_string(), "55001".to_string()), + // GUI — disabled outright. No integration test exercises it. + ("DD_GUI_PORT".to_string(), "-1".to_string()), + // expvar / APM / process / secondary IPC — not assertion targets, but the Agent will + // still try to bind them on startup, so shift them out of the way. + ("DD_EXPVAR_PORT".to_string(), "55000".to_string()), + ("DD_APM_RECEIVER_PORT".to_string(), "58126".to_string()), + ("DD_PROCESS_CONFIG_CMD_PORT".to_string(), "56062".to_string()), + ("DD_AGENT_IPC_PORT".to_string(), "55004".to_string()), + // DogStatsD UDP. In converged tests the Core Agent's DSD is disabled by + // DD_DATA_PLANE_ENABLED so this mainly affects ADP (the actual listener) and the + // bootstrap-mode Agent. + ("DD_DOGSTATSD_PORT".to_string(), "58125".to_string()), + // ----- ADP ports ----- + // API listen addresses are URI-style; ListenAddress accepts `tcp://host:port`. 
+ ( + "DD_DATA_PLANE_API_LISTEN_ADDRESS".to_string(), + "tcp://0.0.0.0:55100".to_string(), + ), + ( + "DD_DATA_PLANE_SECURE_API_LISTEN_ADDRESS".to_string(), + "tcp://0.0.0.0:55101".to_string(), + ), + ( + "DD_DATA_PLANE_TELEMETRY_LISTEN_ADDR".to_string(), + "tcp://0.0.0.0:55102".to_string(), + ), + // ----- OTLP receiver endpoints ----- + // Matches the Datadog Agent's OTLP env var shape (DD_OTLP_CONFIG_*). + ( + "DD_OTLP_CONFIG_RECEIVER_PROTOCOLS_GRPC_ENDPOINT".to_string(), + "0.0.0.0:54317".to_string(), + ), + ( + "DD_OTLP_CONFIG_RECEIVER_PROTOCOLS_HTTP_ENDPOINT".to_string(), + "0.0.0.0:54318".to_string(), + ), + ]) +} + +/// Builds the env for a target process (Core Agent or ADP) under the Unix runner. +/// +/// Precedence (lowest to highest): +/// 1. framework port-isolation defaults (`test_port_isolation_env`) +/// 2. the test's declared `container.env` +/// 3. forced overrides supplied by the caller (auth token path, run path, …) +/// +/// Forced overrides are bottom-of-stack from the framework's perspective but top-of-stack here +/// because they're path-bindings tests must not be able to override (they identify per-test +/// state directories that the runner owns). +fn build_process_env(test_env: &HashMap, forced: &[(&str, String)]) -> HashMap { + let mut env = test_port_isolation_env(); + for (k, v) in test_env { + env.insert(k.clone(), v.clone()); + } + for (k, v) in forced { + env.insert((*k).to_string(), v.clone()); + } + env +} + /// Runner for a single Unix-process integration test case. pub(crate) struct UnixIntegrationRunner { test_case: IntegrationConfig, @@ -147,8 +228,25 @@ impl UnixIntegrationRunner { // follows that advice for its post-config-stream IPC clients, and TLS fails with // UnknownIssuer because the platform default cert does not match what the per-test // Agent is actually serving. 
- let mut agent_env = self.test_case.container.env.clone(); - agent_env.insert("DD_AUTH_TOKEN_FILE_PATH".to_string(), auth_token_path.clone()); + // Forced runner-owned bindings: + // DD_AUTH_TOKEN_FILE_PATH: pin to the per-test path. The Agent's authoritative + // config (sent to ADP via the config stream) would otherwise advertise the + // platform default, ADP would follow that advice for its post-config-stream IPC + // clients, and TLS would fail with UnknownIssuer because the platform default + // cert does not match what the per-test Agent is actually serving. + // DD_RUN_PATH: the Agent's default `run_path` is the install prefix's `run/` + // directory (e.g., /opt/datadog-agent/run). Without overriding it, a relocated + // Agent install would try to write its runtime state (remote-config db, + // sockets, pid file) back to the canonical /opt path — typically not writable + // in CI. Scope it to the per-test state directory so each test gets a clean + // slate and nothing leaks across runs. + let agent_env = build_process_env( + &self.test_case.container.env, + &[ + ("DD_AUTH_TOKEN_FILE_PATH", auth_token_path.clone()), + ("DD_RUN_PATH", state_dir.to_string_lossy().into_owned()), + ], + ); let agent_config = UnixProcessConfig::new(format!("{}-core-agent", self.test_case.name), agent_binary) .with_args(vec![ @@ -194,12 +292,14 @@ impl UnixIntegrationRunner { // Phase: spawn ADP. let spawn_start = Instant::now(); let config_path_str = config_path.to_string_lossy().into_owned(); - let mut adp_env = self.test_case.container.env.clone(); - if self.test_case.requires_core_agent { + let adp_forced: Vec<(&str, String)> = if self.test_case.requires_core_agent { // Point ADP's IPC client at the per-test auth token (and by derivation, the // per-test ipc_cert.pem in the same directory). 
- adp_env.insert("DD_AUTH_TOKEN_FILE_PATH".to_string(), auth_token_path); - } + vec![("DD_AUTH_TOKEN_FILE_PATH", auth_token_path)] + } else { + Vec::new() + }; + let adp_env = build_process_env(&self.test_case.container.env, &adp_forced); let process_config = UnixProcessConfig::new(self.test_case.name.clone(), binary_path) .with_args(vec!["-c".to_string(), config_path_str, "run".to_string()]) .with_env_map(adp_env); diff --git a/test/integration/cases/adp-cmd-port/config.yaml b/test/integration/cases/adp-cmd-port/config.yaml index e97c3bba76e..77f21060ade 100644 --- a/test/integration/cases/adp-cmd-port/config.yaml +++ b/test/integration/cases/adp-cmd-port/config.yaml @@ -4,10 +4,12 @@ # the IPC/CMD API server listens on. ADP connects to this port for # remote agent registration and config streaming. # -# When cmd_port is set to a non-default value (5101 instead of 5001), -# ADP must read cmd_port and connect to the correct port. Without this -# fix, ADP tries to connect to the hardcoded default (5001) and -# crash-loops. +# When cmd_port is set to a non-default value (7777 in this test), ADP +# must read cmd_port and connect to the correct port. Without this fix, +# ADP tries to connect to its hardcoded default and crash-loops. +# This test explicitly overrides the framework-level cmd_port default +# (see panoramic::unix_runner::test_port_isolation_env) to validate the +# non-default path end-to-end. 
type: integration name: "adp-cmd-port" diff --git a/test/integration/cases/adp-config-stream/config.yaml b/test/integration/cases/adp-config-stream/config.yaml index cd747f4e273..8e292eb314b 100644 --- a/test/integration/cases/adp-config-stream/config.yaml +++ b/test/integration/cases/adp-config-stream/config.yaml @@ -14,9 +14,8 @@ container: DD_DATA_PLANE_STANDALONE_MODE: "false" DD_DATA_PLANE_USE_NEW_CONFIG_STREAM_ENDPOINT: "true" DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" - DD_DOGSTATSD_PORT: "8125" exposed_ports: - - "8125/udp" + - "58125/udp" assertions: # Make sure we initially see ADP try to reach out to the Core Agent for its configuration. diff --git a/test/integration/cases/adp-logging-default-path/config.yaml b/test/integration/cases/adp-logging-default-path/config.yaml index ca87955c94d..596f0cec00d 100644 --- a/test/integration/cases/adp-logging-default-path/config.yaml +++ b/test/integration/cases/adp-logging-default-path/config.yaml @@ -13,9 +13,8 @@ container: DD_DATA_PLANE_ENABLED: "true" DD_DATA_PLANE_USE_NEW_CONFIG_STREAM_ENDPOINT: "true" DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" - DD_DOGSTATSD_PORT: "8125" exposed_ports: - - "8125/udp" + - "58125/udp" assertions: # Wait for ADP to receive its initial configuration from the Core Agent and reload logging. diff --git a/test/integration/cases/adp-logging-ignores-core-agent-log-file/config.yaml b/test/integration/cases/adp-logging-ignores-core-agent-log-file/config.yaml index f65c3d3d45c..746e0bc0cab 100644 --- a/test/integration/cases/adp-logging-ignores-core-agent-log-file/config.yaml +++ b/test/integration/cases/adp-logging-ignores-core-agent-log-file/config.yaml @@ -14,9 +14,8 @@ container: DD_DATA_PLANE_USE_NEW_CONFIG_STREAM_ENDPOINT: "true" DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" DD_LOG_FILE: "/tmp/coreagent-only.log" - DD_DOGSTATSD_PORT: "8125" exposed_ports: - - "8125/udp" + - "58125/udp" assertions: # Wait for ADP to receive its initial configuration from the Core Agent and reload logging. 
diff --git a/test/integration/cases/adp-logging-respects-data-plane-log-file/config.yaml b/test/integration/cases/adp-logging-respects-data-plane-log-file/config.yaml index 46c4e6ffc2a..f528356f88c 100644 --- a/test/integration/cases/adp-logging-respects-data-plane-log-file/config.yaml +++ b/test/integration/cases/adp-logging-respects-data-plane-log-file/config.yaml @@ -14,9 +14,8 @@ container: DD_DATA_PLANE_USE_NEW_CONFIG_STREAM_ENDPOINT: "true" DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" DD_DATA_PLANE_LOG_FILE: "/tmp/adp-custom.log" - DD_DOGSTATSD_PORT: "8125" exposed_ports: - - "8125/udp" + - "58125/udp" assertions: # Wait for ADP to receive its initial configuration from the Core Agent and reload logging. diff --git a/test/integration/cases/adp-rar-disabled/config.yaml b/test/integration/cases/adp-rar-disabled/config.yaml index 3c3987cc7b9..28a5ccc9b3d 100644 --- a/test/integration/cases/adp-rar-disabled/config.yaml +++ b/test/integration/cases/adp-rar-disabled/config.yaml @@ -21,9 +21,8 @@ container: DD_DATA_PLANE_STANDALONE_MODE: "false" DD_DATA_PLANE_REMOTE_AGENT_ENABLED: "true" DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" - DD_DOGSTATSD_PORT: "8125" exposed_ports: - - "8125/udp" + - "58125/udp" assertions: # ADP should report registration failure (Core Agent returns error when RAR disabled). diff --git a/test/integration/cases/adp-rar-registration/config.yaml b/test/integration/cases/adp-rar-registration/config.yaml index 3e7a8abe83b..97958b982a1 100644 --- a/test/integration/cases/adp-rar-registration/config.yaml +++ b/test/integration/cases/adp-rar-registration/config.yaml @@ -14,9 +14,8 @@ container: DD_DATA_PLANE_STANDALONE_MODE: "false" DD_DATA_PLANE_REMOTE_AGENT_ENABLED: "true" DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" - DD_DOGSTATSD_PORT: "8125" exposed_ports: - - "8125/udp" + - "58125/udp" assertions: # ADP should report registration success. 
diff --git a/test/integration/cases/dogstatsd-autoscale-udp/config.yaml b/test/integration/cases/dogstatsd-autoscale-udp/config.yaml index 5465b6d7e13..58e06c3084d 100644 --- a/test/integration/cases/dogstatsd-autoscale-udp/config.yaml +++ b/test/integration/cases/dogstatsd-autoscale-udp/config.yaml @@ -12,11 +12,10 @@ container: DD_DATA_PLANE_ENABLED: "true" DD_DATA_PLANE_STANDALONE_MODE: "true" DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" - DD_DOGSTATSD_PORT: "8125" DD_DOGSTATSD_NON_LOCAL_TRAFFIC: "true" DD_DOGSTATSD_AUTOSCALE_UDP_LISTENERS: "true" exposed_ports: - - "8125/udp" + - "58125/udp" assertions: # Make sure the process becomes healthy and stays up without errors when autoscale is enabled, @@ -25,7 +24,7 @@ assertions: - type: process_stable_for duration: 10s - type: port_listening - port: 8125 + port: 58125 protocol: udp timeout: 10s - type: log_not_contains diff --git a/test/integration/cases/dogstatsd-bind-custom-hostname/config.yaml b/test/integration/cases/dogstatsd-bind-custom-hostname/config.yaml index 0868807a8cf..578ab45b0b7 100644 --- a/test/integration/cases/dogstatsd-bind-custom-hostname/config.yaml +++ b/test/integration/cases/dogstatsd-bind-custom-hostname/config.yaml @@ -45,7 +45,7 @@ container: assertions: - type: log_contains - pattern: 'listen_addr:"udp://{{PANORAMIC_DYNAMIC_CONTAINER_IP}}:8125"' + pattern: 'listen_addr:"udp://{{PANORAMIC_DYNAMIC_CONTAINER_IP}}:58125"' timeout: 15s - parallel: - type: process_stable_for diff --git a/test/integration/cases/dogstatsd-bind-host/config.yaml b/test/integration/cases/dogstatsd-bind-host/config.yaml index 169a63fae3d..af9d702ba3b 100644 --- a/test/integration/cases/dogstatsd-bind-host/config.yaml +++ b/test/integration/cases/dogstatsd-bind-host/config.yaml @@ -33,7 +33,7 @@ container: assertions: - type: log_contains - pattern: 'listen_addr:"udp://{{PANORAMIC_DYNAMIC_CONTAINER_IP}}:8125"' + pattern: 'listen_addr:"udp://{{PANORAMIC_DYNAMIC_CONTAINER_IP}}:58125"' timeout: 15s - parallel: - type: 
process_stable_for diff --git a/test/integration/cases/dogstatsd-default-bind/config.yaml b/test/integration/cases/dogstatsd-default-bind/config.yaml index 4b85d7ce231..edc6d9ba53f 100644 --- a/test/integration/cases/dogstatsd-default-bind/config.yaml +++ b/test/integration/cases/dogstatsd-default-bind/config.yaml @@ -25,7 +25,7 @@ container: assertions: - type: log_contains - pattern: 'listen_addr:"udp://127.0.0.1:8125"' + pattern: 'listen_addr:"udp://127.0.0.1:58125"' timeout: 15s - parallel: - type: process_stable_for diff --git a/test/integration/cases/dogstatsd-enabled/config.yaml b/test/integration/cases/dogstatsd-enabled/config.yaml index b7e61b58f1d..1efea0014fd 100644 --- a/test/integration/cases/dogstatsd-enabled/config.yaml +++ b/test/integration/cases/dogstatsd-enabled/config.yaml @@ -12,10 +12,9 @@ container: DD_DATA_PLANE_ENABLED: "true" DD_DATA_PLANE_STANDALONE_MODE: "true" DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" - DD_DOGSTATSD_PORT: "8125" DD_DOGSTATSD_NON_LOCAL_TRAFFIC: "true" exposed_ports: - - "8125/udp" + - "58125/udp" assertions: # Make sure the process becomes healthy, and stays up without errors, listening for DSD (UDP), @@ -24,7 +23,7 @@ assertions: - type: process_stable_for duration: 10s - type: port_listening - port: 8125 + port: 58125 protocol: udp timeout: 10s - type: log_not_contains diff --git a/test/integration/cases/dogstatsd-forwarding/config.yaml b/test/integration/cases/dogstatsd-forwarding/config.yaml index b2f457d9476..259d044412e 100644 --- a/test/integration/cases/dogstatsd-forwarding/config.yaml +++ b/test/integration/cases/dogstatsd-forwarding/config.yaml @@ -16,7 +16,6 @@ container: DD_DATA_PLANE_ENABLED: "true" DD_DATA_PLANE_STANDALONE_MODE: "true" DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" - DD_DOGSTATSD_PORT: "8125" DD_DOGSTATSD_TCP_PORT: "9126" DD_DOGSTATSD_SOCKET: "/tmp/dsd-forwarding.sock" DD_DOGSTATSD_STREAM_SOCKET: "/tmp/dsd-forwarding-stream.sock" @@ -26,7 +25,7 @@ container: files: - 
"run_forwarding_test.py:/forwarding-test/run_forwarding_test.py" exposed_ports: - - "8125/udp" + - "58125/udp" - "9126/tcp" assertions: @@ -65,7 +64,7 @@ assertions: - type: process_stable_for duration: 10s - type: port_listening - port: 8125 + port: 58125 protocol: udp timeout: 10s - type: port_listening diff --git a/test/integration/cases/dogstatsd-forwarding/run_forwarding_test.py b/test/integration/cases/dogstatsd-forwarding/run_forwarding_test.py index c5584b419f2..bee91d3b34d 100644 --- a/test/integration/cases/dogstatsd-forwarding/run_forwarding_test.py +++ b/test/integration/cases/dogstatsd-forwarding/run_forwarding_test.py @@ -23,13 +23,15 @@ FORWARDED_UDS_STREAM_PATH = "/tmp/dsd-forwarded-uds-stream-packet" PARSED_METRIC_PATH = "/tmp/dsd-metric-parsed" FORWARD_ADDR = ("127.0.0.1", 9125) -DOGSTATSD_ADDR = ("127.0.0.1", 8125) +# Matches the framework-level shifted default in panoramic::unix_runner::test_port_isolation_env. +# When the framework provides DD_DOGSTATSD_PORT=58125, ADP binds DSD on 58125. 
+DOGSTATSD_ADDR = ("127.0.0.1", 58125) DOGSTATSD_STREAM_ADDR = ("127.0.0.1", 9126) DOGSTATSD_UDS_PATH = "/tmp/dsd-forwarding.sock" DOGSTATSD_UDS_STREAM_PATH = "/tmp/dsd-forwarding-stream.sock" TELEMETRY_URLS = ( - "http://127.0.0.1:5100/metrics", - "http://127.0.0.1:5100/compat/metrics", + "http://127.0.0.1:55100/metrics", + "http://127.0.0.1:55100/compat/metrics", ) PROBE_TIMEOUT_SECS = 60 PROBE_INTERVAL_SECS = 0.25 diff --git a/test/integration/cases/dogstatsd-non-local-overrides-bind-host/config.yaml b/test/integration/cases/dogstatsd-non-local-overrides-bind-host/config.yaml index ed58215c481..047ddf94e7f 100644 --- a/test/integration/cases/dogstatsd-non-local-overrides-bind-host/config.yaml +++ b/test/integration/cases/dogstatsd-non-local-overrides-bind-host/config.yaml @@ -32,7 +32,7 @@ container: assertions: - type: log_contains - pattern: 'listen_addr:"udp://0.0.0.0:8125"' + pattern: 'listen_addr:"udp://0.0.0.0:58125"' timeout: 15s - parallel: - type: process_stable_for diff --git a/test/integration/cases/otlp-traces-enabled/config.yaml b/test/integration/cases/otlp-traces-enabled/config.yaml index 8a482db57c8..aa0c2b57472 100644 --- a/test/integration/cases/otlp-traces-enabled/config.yaml +++ b/test/integration/cases/otlp-traces-enabled/config.yaml @@ -15,8 +15,8 @@ container: DD_DATA_PLANE_OTLP_PROXY_ENABLED: "true" DD_DATA_PLANE_OTLP_PROXY_TRACES_ENABLED: "false" exposed_ports: - - "4317/tcp" - - "4318/tcp" + - "54317/tcp" + - "54318/tcp" assertions: # Make sure the process becomes healthy, and stays up without errors, listening for OTLP (HTTP and gRPC), @@ -25,11 +25,11 @@ assertions: - type: process_stable_for duration: 10s - type: port_listening - port: 4317 + port: 54317 protocol: tcp timeout: 10s - type: port_listening - port: 4318 + port: 54318 protocol: tcp timeout: 10s - type: log_not_contains diff --git a/test/integration/cases/privileged-api-endpoints/config.yaml b/test/integration/cases/privileged-api-endpoints/config.yaml index 
c72e146f419..7960df03195 100644 --- a/test/integration/cases/privileged-api-endpoints/config.yaml +++ b/test/integration/cases/privileged-api-endpoints/config.yaml @@ -13,14 +13,14 @@ container: DD_DATA_PLANE_STANDALONE_MODE: "true" DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" exposed_ports: - - "5101/tcp" + - "55101/tcp" assertions: - parallel: - type: process_stable_for duration: 10s - type: port_listening - port: 5101 + port: 55101 protocol: tcp timeout: 20s # Each of the four routes below is registered as POST-only by the corresponding override @@ -28,31 +28,31 @@ assertions: # if the worker failed to assert its DynamicRoute, the request would return 404 instead. # Asserting "status != 404" lets us verify route registration without exercising the routes. - type: http_check - endpoint: "https://localhost:5101/logging/override" + endpoint: "https://localhost:55101/logging/override" status: not_equal: 404 insecure_skip_verify: true timeout: 20s - type: http_check - endpoint: "https://localhost:5101/logging/reset" + endpoint: "https://localhost:55101/logging/reset" status: not_equal: 404 insecure_skip_verify: true timeout: 20s - type: http_check - endpoint: "https://localhost:5101/metrics/override" + endpoint: "https://localhost:55101/metrics/override" status: not_equal: 404 insecure_skip_verify: true timeout: 20s - type: http_check - endpoint: "https://localhost:5101/metrics/reset" + endpoint: "https://localhost:55101/metrics/reset" status: not_equal: 404 insecure_skip_verify: true timeout: 20s - type: http_check - endpoint: "https://localhost:5101/config" + endpoint: "https://localhost:55101/config" status: not_equal: 404 insecure_skip_verify: true diff --git a/test/integration/cases/telemetry-endpoint/config.yaml b/test/integration/cases/telemetry-endpoint/config.yaml index 22f5b38c588..1017c1a97ab 100644 --- a/test/integration/cases/telemetry-endpoint/config.yaml +++ b/test/integration/cases/telemetry-endpoint/config.yaml @@ -13,7 +13,7 @@ container: 
DD_DATA_PLANE_STANDALONE_MODE: "true" DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" exposed_ports: - - "5100/tcp" + - "55100/tcp" assertions: # Make sure the process becomes healthy, and stays up without errors, with the telemetry routes @@ -22,16 +22,16 @@ assertions: - type: process_stable_for duration: 10s - type: port_listening - port: 5100 + port: 55100 protocol: tcp timeout: 10s - type: http_check - endpoint: "http://localhost:5100/metrics" + endpoint: "http://localhost:55100/metrics" status: equal: 200 timeout: 10s - type: http_check - endpoint: "http://localhost:5100/compat/metrics" + endpoint: "http://localhost:55100/compat/metrics" status: equal: 200 timeout: 10s diff --git a/test/integration/cases/unprivileged-api-endpoints/config.yaml b/test/integration/cases/unprivileged-api-endpoints/config.yaml index d5014bc3edb..7acea33d6aa 100644 --- a/test/integration/cases/unprivileged-api-endpoints/config.yaml +++ b/test/integration/cases/unprivileged-api-endpoints/config.yaml @@ -13,24 +13,24 @@ container: DD_DATA_PLANE_STANDALONE_MODE: "true" DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" exposed_ports: - - "5100/tcp" + - "55100/tcp" assertions: - parallel: - type: process_stable_for duration: 10s - type: http_check - endpoint: "http://localhost:5100/ready" + endpoint: "http://localhost:55100/ready" status: equal: 200 timeout: 10s - type: http_check - endpoint: "http://localhost:5100/live" + endpoint: "http://localhost:55100/live" status: equal: 200 timeout: 10s - type: http_check - endpoint: "http://localhost:5100/memory/status" + endpoint: "http://localhost:55100/memory/status" status: equal: 200 timeout: 10s From 686ac82c27295160a1e5209d5832bbb2da0407ca Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Thu, 28 May 2026 11:06:23 -0400 Subject: [PATCH 31/56] chore(docs): fix vale lint errors in unix_runner.rs - Replace 'e.g.' 
with 'for example' (Google.Latin) - Replace em-dashes with parentheses (Google.EmDash) --- bin/correctness/panoramic/src/unix_runner.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/correctness/panoramic/src/unix_runner.rs b/bin/correctness/panoramic/src/unix_runner.rs index 34dc2e558db..1b8dcc1b976 100644 --- a/bin/correctness/panoramic/src/unix_runner.rs +++ b/bin/correctness/panoramic/src/unix_runner.rs @@ -59,7 +59,7 @@ const CORE_AGENT_IPC_READY_POLL: Duration = Duration::from_millis(200); /// Framework-level env overrides that move every default-port the test target binds off its /// canonical value so the test Core Agent + ADP can coexist with anything else listening on -/// those ports (e.g., a running system Datadog Agent on a shared CI runner). Tests can +/// those ports (for example, a running system Datadog Agent on a shared CI runner). Tests can /// override any of these via `container.env`; tests that test specific port behavior /// (`adp-cmd-port`) supply their own values. /// @@ -68,7 +68,7 @@ const CORE_AGENT_IPC_READY_POLL: Duration = Duration::from_millis(200); /// /// Note on scope: this also covers ADP-side ports (the `data_plane.*_listen_*` listen /// addresses and the OTLP receiver endpoints). Those don't conflict with the system Agent -/// today — the system Agent doesn't bind them — but we shift them anyway so the test surface +/// today (the system Agent doesn't bind them), but we shift them anyway so the test surface /// has a single consistent port table, and so future port additions on either side don't /// silently regress. 
pub fn test_port_isolation_env() -> HashMap { From 68fe1e4ebc584e0da1063ec9c04180a4314a35d0 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Thu, 28 May 2026 11:08:38 -0400 Subject: [PATCH 32/56] ci: stop pkilling /opt/datadog-agent in macOS e2e before_script We no longer install to /opt/datadog-agent (sandbox install lives in /tmp/saluki-dda), and every default port the test Agent + ADP bind is shifted out of the canonical range (test_port_isolation_env in panoramic::unix_runner). A system Datadog Agent on the runner can no longer conflict with us, so the /opt pkill is just dead weight; worse, it would interrupt an actual running Agent if anyone runs this target on a customer machine. Drop it; keep the /tmp/saluki-dda sweep to clean up our own strays between runs. --- .gitlab/e2e.yml | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/.gitlab/e2e.yml b/.gitlab/e2e.yml index c46d3c6d2d9..1d3e0264520 100644 --- a/.gitlab/e2e.yml +++ b/.gitlab/e2e.yml @@ -137,13 +137,12 @@ test-integration: variables: PANORAMIC_LOG_DIR: integration-logs before_script: - # Defensive: clean up any leftover Datadog Agent processes from prior runs on this shared - # runner before we begin. Otherwise a stranded trace-agent/process-agent can hold ports - # (e.g., 8126) and break the first converged test. Catch both: - # - any pre-existing system install at /opt/datadog-agent (we don't use it, but a - # conflicting one could still be running) - # - our own sandbox install under /tmp/saluki-dda from a prior run on this shared runner - - sudo pkill -9 -f /opt/datadog-agent/ || true + # Defensive: clean up any leftover Agent processes from prior runs on this shared runner. 
+ # All test-Agent ports are shifted out of the canonical range (see + # panoramic::unix_runner::test_port_isolation_env), so we don't need to touch a system + # install at /opt/datadog-agent; we only sweep our own sandbox under /tmp/saluki-dda where + # a stranded trace-agent / process-agent from a prior run could still hold our shifted + # ports and break the first converged test. - sudo pkill -9 -f /tmp/saluki-dda/ || true script: - make test-integration-macos-ci From 3466d686019221cda86cdd97a6de801648edb0b4 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Thu, 28 May 2026 11:20:46 -0400 Subject: [PATCH 33/56] build: strip xattrs after pkg extraction in provision-macos-test-env MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apple Silicon CI runners on macOS 14+ reject the relocated Agent binary with a silent 'Killed: 9' — no log output, codesign verifies fine on disk. Root cause: pkg-extraction (vs running the .pkg through installer) leaves filesystem xattrs on the extracted binaries (notably com.apple.provenance), and amfid refuses to launch a binary with provenance tagging that did not go through the normal install trust path. `installer` clears these xattrs as part of its trust-establishment flow. When we bypass it (which we do deliberately to install into a /tmp sandbox), we need to do the equivalent ourselves. `xattr -cr` strips every xattr from the install dir; codesign signatures live inside the Mach-O itself so they survive untouched. Failure tolerant (|| true) for the rare case where a single file is unreadable from a prior run — the binaries we actually care about will have been stripped before that case can trigger. 
--- Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Makefile b/Makefile index 1c934a50c0f..4acf2687cb9 100644 --- a/Makefile +++ b/Makefile @@ -635,6 +635,12 @@ provision-macos-test-env: ## Installs the pinned Datadog Agent ($(MACOS_TEST_AGE mkdir -p $$(dirname $(MACOS_TEST_AGENT_INSTALL_DIR)); \ mv "$$PAYLOAD_DIR" $(MACOS_TEST_AGENT_INSTALL_DIR); \ rm -rf "$$EXPAND_DIR"; \ + # Strip macOS provenance / quarantine xattrs from the entire sandbox. Out-of-band pkg \ + # extraction (vs. running the .pkg through `installer`) leaves attributes (notably \ + # com.apple.provenance on macOS 14+) that cause amfid to refuse to launch the binary \ + # on Apple Silicon with a silent "Killed: 9" — no log output, codesign still verifies. \ + # Clearing all xattrs after extraction is what `installer` effectively does for us. \ + xattr -cr $(MACOS_TEST_AGENT_INSTALL_DIR) 2>/dev/null || true; \ test -x $(MACOS_TEST_AGENT_INSTALL_DIR)/bin/agent/agent; \ fi @if [ ! -f $(MACOS_TEST_AGENT_INSTALL_DIR)/etc/ipc_cert.pem ] || [ ! -f $(MACOS_TEST_AGENT_INSTALL_DIR)/etc/auth_token ]; then \ From b6b153ee89a33ade3413556a297ed8ad6d3e5606 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Thu, 28 May 2026 11:25:32 -0400 Subject: [PATCH 34/56] ci: opt the pipeline into interruptible auto-cancel on new commits The macOS jobs are marked `interruptible: true` (added in bcdff89928), but in practice they never get auto-cancelled when a new commit supersedes the in-flight pipeline. Root cause: no explicit `workflow.auto_cancel.on_new_commit` setting means GitLab uses the default 'conservative' mode, which only cancels the pipeline if no non-interruptible job has started running. Since our Linux jobs aren't marked interruptible (and shouldn't be, since they exercise dedicated CI infrastructure with fast turnaround), they start immediately and pin the entire pipeline in place; the macOS jobs ride along, holding their bare-metal runner slots.
Switch to 'interruptible' mode: cancel just the jobs marked interruptible, regardless of what else is running. Net effect is narrow: the macOS unit + integration jobs are the only ones currently opted into interruptible, and they're the only ones whose behavior changes. Non-interruptible Linux jobs continue running uninterrupted as before. --- .gitlab-ci.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index cb9e036786b..c0c80d9c774 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -14,6 +14,15 @@ stages: # # This mostly controls how we tag our ADP container images and set various bits of metadata. workflow: + # Auto-cancel any interruptible jobs from a previous pipeline when a new commit lands on the + # same ref. The default ('conservative') only cancels the pipeline if no non-interruptible + # job has started yet — since our Linux jobs are non-interruptible and start immediately, + # that mode effectively never cancels anything. 'interruptible' cancels just the jobs marked + # interruptible: true (currently the macOS unit + integration jobs, which run on scarce + # bare-metal runner capacity); non-interruptible jobs continue to completion as before. + # https://docs.gitlab.com/ci/yaml/#workflowauto_cancelon_new_commit + auto_cancel: + on_new_commit: interruptible rules: - if: $CI_COMMIT_TAG == null variables: From 2dd590d16af8e435a0a0660ee5547072432d43d1 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Thu, 28 May 2026 11:49:20 -0400 Subject: [PATCH 35/56] build: fix shell syntax error in provision-macos-test-env MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The inline '# Strip macOS provenance / quarantine xattrs ...' comment block I added in the previous commit was inside a backslash-continued make recipe.
Make joins all backslash-continued lines into a single logical shell line before passing to sh; once joined, the first '#' starts a shell comment that consumes everything to end-of-line — which is end-of-string for the joined logical line. That swallowed the closing 'fi' of the surrounding if/else, leaving shell with an unterminated control block and emitting: /bin/sh: -c: line 1: syntax error: unexpected end of file Move the explanation to a Make-level comment block above the target (safe — comments outside recipes are stripped by make before any shell ever sees them) and leave only the actual xattr -cr command inside the recipe. --- Makefile | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 4acf2687cb9..5be7178d014 100644 --- a/Makefile +++ b/Makefile @@ -600,6 +600,13 @@ MACOS_TEST_AGENT_DMG_URL ?= https://s3.amazonaws.com/dd-agent/datadog-agent-$(MA # may already have at a different, conflicting version) avoids surprises in both directions. MACOS_TEST_AGENT_INSTALL_DIR ?= /tmp/saluki-dda/datadog-agent +# Note on `xattr -cr` in the install step: +# +# Out-of-band pkg extraction (vs. running the .pkg through `installer`) leaves filesystem +# attributes (notably com.apple.provenance on macOS 14+) that cause amfid to refuse to launch +# the binary on Apple Silicon with a silent "Killed: 9" — no log output, codesign still +# verifies fine on disk. Clearing all xattrs after extraction is what `installer` effectively +# does for us as part of its trust-establishment flow. .PHONY: provision-macos-test-env provision-macos-test-env: ## Installs the pinned Datadog Agent ($(MACOS_TEST_AGENT_VERSION)) into $(MACOS_TEST_AGENT_INSTALL_DIR) (a sandbox under /tmp) and bootstraps the IPC cert. Idempotent: re-uses the install if it already matches the pinned version. @echo "[*] Provisioning macOS test environment..." 
@@ -635,11 +642,6 @@ provision-macos-test-env: ## Installs the pinned Datadog Agent ($(MACOS_TEST_AGE mkdir -p $$(dirname $(MACOS_TEST_AGENT_INSTALL_DIR)); \ mv "$$PAYLOAD_DIR" $(MACOS_TEST_AGENT_INSTALL_DIR); \ rm -rf "$$EXPAND_DIR"; \ - # Strip macOS provenance / quarantine xattrs from the entire sandbox. Out-of-band pkg \ - # extraction (vs. running the .pkg through `installer`) leaves attributes (notably \ - # com.apple.provenance on macOS 14+) that cause amfid to refuse to launch the binary \ - # on Apple Silicon with a silent "Killed: 9" — no log output, codesign still verifies. \ - # Clearing all xattrs after extraction is what `installer` effectively does for us. \ xattr -cr $(MACOS_TEST_AGENT_INSTALL_DIR) 2>/dev/null || true; \ test -x $(MACOS_TEST_AGENT_INSTALL_DIR)/bin/agent/agent; \ fi From 637d8ca1c9b2c188865079e88b7c8875c03016aa Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Thu, 28 May 2026 12:10:24 -0400 Subject: [PATCH 36/56] build: add diagnostics to bootstrap step on macOS provision The Apple Silicon CI runner kills the relocated Datadog Agent with SIGKILL during the IPC bootstrap ("Killed: 9") and we have no signal on what amfid / Gatekeeper / mount-options / runtime crash is at play because the bootstrap log file is captured to /tmp/saluki-agent-bootstrap.log and then thrown away when the make target exits. 
Add a one-time diagnostic dump that prints (on failure only): - sw_vers, uname - mount options of the partition the install lives on - perms, mach-o arch of the agent binary - xattr listing (provenance, quarantine) - codesign --verify --verbose - spctl --assess (what Gatekeeper says) - Result of 'agent version' as a cheap exec sanity check - The actual bootstrap log if the agent never wrote the IPC cert This is debug noise we'll want to remove once the underlying issue is diagnosed; for now we need it to know whether the kill is from amfid (codesign / signature issue), the kernel (noexec mount), or the agent itself (runtime crash on this environment). --- Makefile | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5be7178d014..f192ec20e40 100644 --- a/Makefile +++ b/Makefile @@ -647,6 +647,21 @@ provision-macos-test-env: ## Installs the pinned Datadog Agent ($(MACOS_TEST_AGE fi @if [ ! -f $(MACOS_TEST_AGENT_INSTALL_DIR)/etc/ipc_cert.pem ] || [ ! 
-f $(MACOS_TEST_AGENT_INSTALL_DIR)/etc/auth_token ]; then \ echo "[*] Bootstrapping IPC cert + auth_token by running the Agent briefly..."; \ + echo "--- diag: host ---"; \ + sw_vers; uname -a; \ + echo "--- diag: install dir mount + perms ---"; \ + mount | grep -E "$$(df $(MACOS_TEST_AGENT_INSTALL_DIR) | tail -1 | awk '{print $$NF}')" || true; \ + ls -la $(MACOS_TEST_AGENT_INSTALL_DIR)/bin/agent/agent; \ + file $(MACOS_TEST_AGENT_INSTALL_DIR)/bin/agent/agent; \ + echo "--- diag: xattr ---"; \ + xattr -l $(MACOS_TEST_AGENT_INSTALL_DIR)/bin/agent/agent || true; \ + echo "--- diag: codesign ---"; \ + codesign --verify --verbose=2 $(MACOS_TEST_AGENT_INSTALL_DIR)/bin/agent/agent 2>&1 || true; \ + echo "--- diag: spctl ---"; \ + spctl --assess --type execute --verbose $(MACOS_TEST_AGENT_INSTALL_DIR)/bin/agent/agent 2>&1 || true; \ + echo "--- diag: try 'agent version' (cheap exec sanity check) ---"; \ + $(MACOS_TEST_AGENT_INSTALL_DIR)/bin/agent/agent version || echo " version subcommand failed with exit $$?"; \ + echo "--- diag: end ---"; \ mkdir -p $(MACOS_TEST_AGENT_INSTALL_DIR)/etc $(MACOS_TEST_AGENT_INSTALL_DIR)/run; \ DD_API_KEY=bootstrap DD_HOSTNAME=bootstrap \ DD_RUN_PATH=$(MACOS_TEST_AGENT_INSTALL_DIR)/run \ @@ -662,7 +677,12 @@ provision-macos-test-env: ## Installs the pinned Datadog Agent ($(MACOS_TEST_AGE done; \ kill $$AGENT_PID 2>/dev/null || true; \ wait $$AGENT_PID 2>/dev/null || true; \ - test -f $(MACOS_TEST_AGENT_INSTALL_DIR)/etc/ipc_cert.pem; \ + if [ ! 
-f $(MACOS_TEST_AGENT_INSTALL_DIR)/etc/ipc_cert.pem ]; then \ + echo "--- diag: bootstrap log (agent never wrote IPC cert) ---"; \ + cat /tmp/saluki-agent-bootstrap.log 2>&1 || echo " (log file empty or missing)"; \ + echo "--- diag: end ---"; \ + exit 1; \ + fi; \ else \ echo "[*] IPC cert already present at $(MACOS_TEST_AGENT_INSTALL_DIR)/etc/ipc_cert.pem"; \ fi From a478503e203e9757fb31f1380cda469e5f11b450 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Thu, 28 May 2026 12:48:49 -0400 Subject: [PATCH 37/56] build: set DD_AUTH_TOKEN_FILE_PATH explicitly in macOS bootstrap The Datadog Agent's default auth_token path on macOS is hardcoded to /opt/datadog-agent/etc/auth_token, regardless of the -c flag (which only redirects datadog.yaml lookup, not the IPC artifacts). With our sandbox install at /tmp/saluki-dda, the bootstrap agent was either silently failing to write to /opt/datadog-agent/etc (no permission) or writing somewhere else, while our wait loop polled the sandbox etc/ that never got populated. Set DD_AUTH_TOKEN_FILE_PATH (and DD_IPC_CERT_FILE_PATH for symmetry) to the sandbox etc/ paths. We already do this on the per-test runner in panoramic::unix_runner; bootstrap needs the same treatment. Also expand the post-failure diagnostic block to (a) list sandbox etc/ and run/ contents, (b) hunt the filesystem for any auth_token / ipc_cert.pem files that landed elsewhere, in case the env var doesn't do what we expect either.
--- Makefile | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Makefile b/Makefile index f192ec20e40..af0cc064d33 100644 --- a/Makefile +++ b/Makefile @@ -665,6 +665,8 @@ provision-macos-test-env: ## Installs the pinned Datadog Agent ($(MACOS_TEST_AGE mkdir -p $(MACOS_TEST_AGENT_INSTALL_DIR)/etc $(MACOS_TEST_AGENT_INSTALL_DIR)/run; \ DD_API_KEY=bootstrap DD_HOSTNAME=bootstrap \ DD_RUN_PATH=$(MACOS_TEST_AGENT_INSTALL_DIR)/run \ + DD_AUTH_TOKEN_FILE_PATH=$(MACOS_TEST_AGENT_INSTALL_DIR)/etc/auth_token \ + DD_IPC_CERT_FILE_PATH=$(MACOS_TEST_AGENT_INSTALL_DIR)/etc/ipc_cert.pem \ DD_CMD_PORT=55001 DD_GUI_PORT=-1 \ DD_EXPVAR_PORT=55000 DD_APM_RECEIVER_PORT=58126 \ DD_PROCESS_CONFIG_CMD_PORT=56062 DD_AGENT_IPC_PORT=55004 \ @@ -680,6 +682,11 @@ provision-macos-test-env: ## Installs the pinned Datadog Agent ($(MACOS_TEST_AGE if [ ! -f $(MACOS_TEST_AGENT_INSTALL_DIR)/etc/ipc_cert.pem ]; then \ echo "--- diag: bootstrap log (agent never wrote IPC cert) ---"; \ cat /tmp/saluki-agent-bootstrap.log 2>&1 || echo " (log file empty or missing)"; \ + echo "--- diag: post-failure file listings ---"; \ + echo "sandbox etc/:"; ls -la $(MACOS_TEST_AGENT_INSTALL_DIR)/etc/ 2>&1 || true; \ + echo "sandbox run/:"; ls -la $(MACOS_TEST_AGENT_INSTALL_DIR)/run/ 2>&1 || true; \ + echo "any auth_token / ipc_cert.pem on the filesystem:"; \ + find /tmp/saluki-dda /opt/datadog-agent /var/run /private/var $$HOME -name auth_token -o -name ipc_cert.pem 2>/dev/null || true; \ echo "--- diag: end ---"; \ exit 1; \ fi; \ From b57a22ad3ecd882c4b95bc0f5564a95bc2a862de Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Thu, 28 May 2026 13:10:32 -0400 Subject: [PATCH 38/56] build: simplify after CI confirmed which mitigations were necessary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous CI run on this branch hit 22/24 passing once DD_AUTH_TOKEN_FILE_PATH was set correctly in the bootstrap. 
Two tests still failed (otlp-traces-enabled, adp-config-stream) and the diag dump confirmed several earlier guesses weren't actually needed. Cleaning up the now-known-unnecessary scaffolding: * xattr -cr in provision-macos-test-env: CI's extracted binary has no xattrs (com.apple.provenance was a local-mac-only artifact from curl-downloaded files on my host). Drop the strip and its preamble comment. * The hefty diagnostic block before the bootstrap agent launch: served its purpose, drop. On bootstrap failure now just cat the bootstrap log so a future regression isn't blind. * Framework port shift for ADP listen addresses (5100/5101/5102) and OTLP receiver (4317/4318): the system Datadog Agent doesn't bind these, so they didn't need shifting in the first place. The env vars also wouldn't have worked because saluki-config uses double- underscore as the nesting separator (Env::split("__")), so the single-underscore env vars I'd written (DD_DATA_PLANE_API_LISTEN_ADDRESS, DD_OTLP_CONFIG_RECEIVER_PROTOCOLS_GRPC_ENDPOINT, etc.) were silently ignored. Removing them simplifies the scope to 'ports a stock Datadog Agent actually binds', which is what the user asked for originally. * Revert the test-config edits that flipped 5100/5101/5102 → 55100/etc and 4317/4318 → 54317/etc. Those tests now assert against canonical port numbers again with no shift needed. 
--- Makefile | 33 +------------ bin/correctness/panoramic/src/unix_runner.rs | 46 ++++--------------- .../run_forwarding_test.py | 4 +- .../cases/otlp-traces-enabled/config.yaml | 8 ++-- .../privileged-api-endpoints/config.yaml | 14 +++--- .../cases/telemetry-endpoint/config.yaml | 8 ++-- .../unprivileged-api-endpoints/config.yaml | 8 ++-- 7 files changed, 33 insertions(+), 88 deletions(-) diff --git a/Makefile b/Makefile index af0cc064d33..e6b4ba8cc0d 100644 --- a/Makefile +++ b/Makefile @@ -600,13 +600,6 @@ MACOS_TEST_AGENT_DMG_URL ?= https://s3.amazonaws.com/dd-agent/datadog-agent-$(MA # may already have at a different, conflicting version) avoids surprises in both directions. MACOS_TEST_AGENT_INSTALL_DIR ?= /tmp/saluki-dda/datadog-agent -# Note on `xattr -cr` in the install step: -# -# Out-of-band pkg extraction (vs. running the .pkg through `installer`) leaves filesystem -# attributes (notably com.apple.provenance on macOS 14+) that cause amfid to refuse to launch -# the binary on Apple Silicon with a silent "Killed: 9" — no log output, codesign still -# verifies fine on disk. Clearing all xattrs after extraction is what `installer` effectively -# does for us as part of its trust-establishment flow. .PHONY: provision-macos-test-env provision-macos-test-env: ## Installs the pinned Datadog Agent ($(MACOS_TEST_AGENT_VERSION)) into $(MACOS_TEST_AGENT_INSTALL_DIR) (a sandbox under /tmp) and bootstraps the IPC cert. Idempotent: re-uses the install if it already matches the pinned version. @echo "[*] Provisioning macOS test environment..." @@ -642,26 +635,10 @@ provision-macos-test-env: ## Installs the pinned Datadog Agent ($(MACOS_TEST_AGE mkdir -p $$(dirname $(MACOS_TEST_AGENT_INSTALL_DIR)); \ mv "$$PAYLOAD_DIR" $(MACOS_TEST_AGENT_INSTALL_DIR); \ rm -rf "$$EXPAND_DIR"; \ - xattr -cr $(MACOS_TEST_AGENT_INSTALL_DIR) 2>/dev/null || true; \ test -x $(MACOS_TEST_AGENT_INSTALL_DIR)/bin/agent/agent; \ fi @if [ ! -f $(MACOS_TEST_AGENT_INSTALL_DIR)/etc/ipc_cert.pem ] || [ ! 
-f $(MACOS_TEST_AGENT_INSTALL_DIR)/etc/auth_token ]; then \ echo "[*] Bootstrapping IPC cert + auth_token by running the Agent briefly..."; \ - echo "--- diag: host ---"; \ - sw_vers; uname -a; \ - echo "--- diag: install dir mount + perms ---"; \ - mount | grep -E "$$(df $(MACOS_TEST_AGENT_INSTALL_DIR) | tail -1 | awk '{print $$NF}')" || true; \ - ls -la $(MACOS_TEST_AGENT_INSTALL_DIR)/bin/agent/agent; \ - file $(MACOS_TEST_AGENT_INSTALL_DIR)/bin/agent/agent; \ - echo "--- diag: xattr ---"; \ - xattr -l $(MACOS_TEST_AGENT_INSTALL_DIR)/bin/agent/agent || true; \ - echo "--- diag: codesign ---"; \ - codesign --verify --verbose=2 $(MACOS_TEST_AGENT_INSTALL_DIR)/bin/agent/agent 2>&1 || true; \ - echo "--- diag: spctl ---"; \ - spctl --assess --type execute --verbose $(MACOS_TEST_AGENT_INSTALL_DIR)/bin/agent/agent 2>&1 || true; \ - echo "--- diag: try 'agent version' (cheap exec sanity check) ---"; \ - $(MACOS_TEST_AGENT_INSTALL_DIR)/bin/agent/agent version || echo " version subcommand failed with exit $$?"; \ - echo "--- diag: end ---"; \ mkdir -p $(MACOS_TEST_AGENT_INSTALL_DIR)/etc $(MACOS_TEST_AGENT_INSTALL_DIR)/run; \ DD_API_KEY=bootstrap DD_HOSTNAME=bootstrap \ DD_RUN_PATH=$(MACOS_TEST_AGENT_INSTALL_DIR)/run \ @@ -680,14 +657,8 @@ provision-macos-test-env: ## Installs the pinned Datadog Agent ($(MACOS_TEST_AGE kill $$AGENT_PID 2>/dev/null || true; \ wait $$AGENT_PID 2>/dev/null || true; \ if [ ! 
-f $(MACOS_TEST_AGENT_INSTALL_DIR)/etc/ipc_cert.pem ]; then \ - echo "--- diag: bootstrap log (agent never wrote IPC cert) ---"; \ - cat /tmp/saluki-agent-bootstrap.log 2>&1 || echo " (log file empty or missing)"; \ - echo "--- diag: post-failure file listings ---"; \ - echo "sandbox etc/:"; ls -la $(MACOS_TEST_AGENT_INSTALL_DIR)/etc/ 2>&1 || true; \ - echo "sandbox run/:"; ls -la $(MACOS_TEST_AGENT_INSTALL_DIR)/run/ 2>&1 || true; \ - echo "any auth_token / ipc_cert.pem on the filesystem:"; \ - find /tmp/saluki-dda /opt/datadog-agent /var/run /private/var $$HOME -name auth_token -o -name ipc_cert.pem 2>/dev/null || true; \ - echo "--- diag: end ---"; \ + echo "ERROR: bootstrap Agent did not write the IPC cert. Bootstrap log:" >&2; \ + cat /tmp/saluki-agent-bootstrap.log >&2 2>/dev/null || true; \ exit 1; \ fi; \ else \ diff --git a/bin/correctness/panoramic/src/unix_runner.rs b/bin/correctness/panoramic/src/unix_runner.rs index 1b8dcc1b976..3f1bb0c50fb 100644 --- a/bin/correctness/panoramic/src/unix_runner.rs +++ b/bin/correctness/panoramic/src/unix_runner.rs @@ -57,24 +57,22 @@ const DEFAULT_CORE_AGENT_BINARY_PATH: &str = "/tmp/saluki-dda/datadog-agent/bin/ const CORE_AGENT_IPC_READY_TIMEOUT: Duration = Duration::from_secs(60); const CORE_AGENT_IPC_READY_POLL: Duration = Duration::from_millis(200); -/// Framework-level env overrides that move every default-port the test target binds off its -/// canonical value so the test Core Agent + ADP can coexist with anything else listening on -/// those ports (for example, a running system Datadog Agent on a shared CI runner). Tests can -/// override any of these via `container.env`; tests that test specific port behavior -/// (`adp-cmd-port`) supply their own values. +/// Framework-level env overrides that move every conflict-prone default port off its canonical +/// value so the test Core Agent + ADP can coexist with a system Datadog Agent already running +/// on a shared CI runner. 
Tests can override any of these via `container.env`; tests that test +/// specific port behavior (`adp-cmd-port`) supply their own values. /// /// Naming convention: every default port that's 4 digits gets a `5` prepended (8125 -> 58125, /// 5001 -> 55001, etc.). The GUI is disabled outright since we don't exercise it. /// -/// Note on scope: this also covers ADP-side ports (the `data_plane.*_listen_*` listen -/// addresses and the OTLP receiver endpoints). Those don't conflict with the system Agent -/// today (the system Agent doesn't bind them), but we shift them anyway so the test surface -/// has a single consistent port table, and so future port additions on either side don't -/// silently regress. +/// Scope is intentionally narrow: only ports a stock Datadog Agent binds by default. ADP's own +/// listen addresses (5100/5101/5102) and the OTLP receiver (4317/4318) are not bound by the +/// system Agent and don't need to be shifted; keeping them on defaults means tests can assert +/// against canonical port numbers with no extra plumbing. pub fn test_port_isolation_env() -> HashMap { HashMap::from([ - // ----- Core Agent ports ----- - // CMD/IPC API. Shared key with ADP (used as the IPC client's destination port). + // CMD/IPC API. Shared key between the Core Agent (listener) and ADP (IPC client). + // `adp-cmd-port` overrides this via container.env to validate the non-default path. ("DD_CMD_PORT".to_string(), "55001".to_string()), // GUI — disabled outright. No integration test exercises it. ("DD_GUI_PORT".to_string(), "-1".to_string()), @@ -88,30 +86,6 @@ pub fn test_port_isolation_env() -> HashMap { // DD_DATA_PLANE_ENABLED so this mainly affects ADP (the actual listener) and the // bootstrap-mode Agent. ("DD_DOGSTATSD_PORT".to_string(), "58125".to_string()), - // ----- ADP ports ----- - // API listen addresses are URI-style; ListenAddress accepts `tcp://host:port`. 
- ( - "DD_DATA_PLANE_API_LISTEN_ADDRESS".to_string(), - "tcp://0.0.0.0:55100".to_string(), - ), - ( - "DD_DATA_PLANE_SECURE_API_LISTEN_ADDRESS".to_string(), - "tcp://0.0.0.0:55101".to_string(), - ), - ( - "DD_DATA_PLANE_TELEMETRY_LISTEN_ADDR".to_string(), - "tcp://0.0.0.0:55102".to_string(), - ), - // ----- OTLP receiver endpoints ----- - // Matches the Datadog Agent's OTLP env var shape (DD_OTLP_CONFIG_*). - ( - "DD_OTLP_CONFIG_RECEIVER_PROTOCOLS_GRPC_ENDPOINT".to_string(), - "0.0.0.0:54317".to_string(), - ), - ( - "DD_OTLP_CONFIG_RECEIVER_PROTOCOLS_HTTP_ENDPOINT".to_string(), - "0.0.0.0:54318".to_string(), - ), ]) } diff --git a/test/integration/cases/dogstatsd-forwarding/run_forwarding_test.py b/test/integration/cases/dogstatsd-forwarding/run_forwarding_test.py index bee91d3b34d..da6121ae4dc 100644 --- a/test/integration/cases/dogstatsd-forwarding/run_forwarding_test.py +++ b/test/integration/cases/dogstatsd-forwarding/run_forwarding_test.py @@ -30,8 +30,8 @@ DOGSTATSD_UDS_PATH = "/tmp/dsd-forwarding.sock" DOGSTATSD_UDS_STREAM_PATH = "/tmp/dsd-forwarding-stream.sock" TELEMETRY_URLS = ( - "http://127.0.0.1:55100/metrics", - "http://127.0.0.1:55100/compat/metrics", + "http://127.0.0.1:5100/metrics", + "http://127.0.0.1:5100/compat/metrics", ) PROBE_TIMEOUT_SECS = 60 PROBE_INTERVAL_SECS = 0.25 diff --git a/test/integration/cases/otlp-traces-enabled/config.yaml b/test/integration/cases/otlp-traces-enabled/config.yaml index aa0c2b57472..8a482db57c8 100644 --- a/test/integration/cases/otlp-traces-enabled/config.yaml +++ b/test/integration/cases/otlp-traces-enabled/config.yaml @@ -15,8 +15,8 @@ container: DD_DATA_PLANE_OTLP_PROXY_ENABLED: "true" DD_DATA_PLANE_OTLP_PROXY_TRACES_ENABLED: "false" exposed_ports: - - "54317/tcp" - - "54318/tcp" + - "4317/tcp" + - "4318/tcp" assertions: # Make sure the process becomes healthy, and stays up without errors, listening for OTLP (HTTP and gRPC), @@ -25,11 +25,11 @@ assertions: - type: process_stable_for duration: 10s - type: 
port_listening - port: 54317 + port: 4317 protocol: tcp timeout: 10s - type: port_listening - port: 54318 + port: 4318 protocol: tcp timeout: 10s - type: log_not_contains diff --git a/test/integration/cases/privileged-api-endpoints/config.yaml b/test/integration/cases/privileged-api-endpoints/config.yaml index 7960df03195..c72e146f419 100644 --- a/test/integration/cases/privileged-api-endpoints/config.yaml +++ b/test/integration/cases/privileged-api-endpoints/config.yaml @@ -13,14 +13,14 @@ container: DD_DATA_PLANE_STANDALONE_MODE: "true" DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" exposed_ports: - - "55101/tcp" + - "5101/tcp" assertions: - parallel: - type: process_stable_for duration: 10s - type: port_listening - port: 55101 + port: 5101 protocol: tcp timeout: 20s # Each of the four routes below is registered as POST-only by the corresponding override @@ -28,31 +28,31 @@ assertions: # if the worker failed to assert its DynamicRoute, the request would return 404 instead. # Asserting "status != 404" lets us verify route registration without exercising the routes. 
- type: http_check - endpoint: "https://localhost:55101/logging/override" + endpoint: "https://localhost:5101/logging/override" status: not_equal: 404 insecure_skip_verify: true timeout: 20s - type: http_check - endpoint: "https://localhost:55101/logging/reset" + endpoint: "https://localhost:5101/logging/reset" status: not_equal: 404 insecure_skip_verify: true timeout: 20s - type: http_check - endpoint: "https://localhost:55101/metrics/override" + endpoint: "https://localhost:5101/metrics/override" status: not_equal: 404 insecure_skip_verify: true timeout: 20s - type: http_check - endpoint: "https://localhost:55101/metrics/reset" + endpoint: "https://localhost:5101/metrics/reset" status: not_equal: 404 insecure_skip_verify: true timeout: 20s - type: http_check - endpoint: "https://localhost:55101/config" + endpoint: "https://localhost:5101/config" status: not_equal: 404 insecure_skip_verify: true diff --git a/test/integration/cases/telemetry-endpoint/config.yaml b/test/integration/cases/telemetry-endpoint/config.yaml index 1017c1a97ab..22f5b38c588 100644 --- a/test/integration/cases/telemetry-endpoint/config.yaml +++ b/test/integration/cases/telemetry-endpoint/config.yaml @@ -13,7 +13,7 @@ container: DD_DATA_PLANE_STANDALONE_MODE: "true" DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" exposed_ports: - - "55100/tcp" + - "5100/tcp" assertions: # Make sure the process becomes healthy, and stays up without errors, with the telemetry routes @@ -22,16 +22,16 @@ assertions: - type: process_stable_for duration: 10s - type: port_listening - port: 55100 + port: 5100 protocol: tcp timeout: 10s - type: http_check - endpoint: "http://localhost:55100/metrics" + endpoint: "http://localhost:5100/metrics" status: equal: 200 timeout: 10s - type: http_check - endpoint: "http://localhost:55100/compat/metrics" + endpoint: "http://localhost:5100/compat/metrics" status: equal: 200 timeout: 10s diff --git a/test/integration/cases/unprivileged-api-endpoints/config.yaml 
b/test/integration/cases/unprivileged-api-endpoints/config.yaml index 7acea33d6aa..d5014bc3edb 100644 --- a/test/integration/cases/unprivileged-api-endpoints/config.yaml +++ b/test/integration/cases/unprivileged-api-endpoints/config.yaml @@ -13,24 +13,24 @@ container: DD_DATA_PLANE_STANDALONE_MODE: "true" DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" exposed_ports: - - "55100/tcp" + - "5100/tcp" assertions: - parallel: - type: process_stable_for duration: 10s - type: http_check - endpoint: "http://localhost:55100/ready" + endpoint: "http://localhost:5100/ready" status: equal: 200 timeout: 10s - type: http_check - endpoint: "http://localhost:55100/live" + endpoint: "http://localhost:5100/live" status: equal: 200 timeout: 10s - type: http_check - endpoint: "http://localhost:55100/memory/status" + endpoint: "http://localhost:5100/memory/status" status: equal: 200 timeout: 10s From 8d83c4c516035c09e792d398f90064b3c412e04c Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Thu, 28 May 2026 13:28:10 -0400 Subject: [PATCH 39/56] feat(panoramic): unix_runner writes captured stdout/stderr to disk The Docker runner writes `stdout.log` and `stderr.log` into each test's log_dir alongside `result.log`, but the unix_runner doesn't, so when an integration test fails on the `mac` runtime, the artifact upload only contains result.log and a truncated assertion context, with no way to see the actual ADP / Core Agent output that triggered the failure. Mirror the Docker runner's write_logs behavior. Same code shape, same output filenames, same log_dir layout. Useful for the open adp-config-stream failure on Apple Silicon CI; should stop being needed once that's diagnosed but the parity is worth keeping.
--- bin/correctness/panoramic/src/unix_runner.rs | 35 ++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/bin/correctness/panoramic/src/unix_runner.rs b/bin/correctness/panoramic/src/unix_runner.rs index 3f1bb0c50fb..bf34d3c1d9f 100644 --- a/bin/correctness/panoramic/src/unix_runner.rs +++ b/bin/correctness/panoramic/src/unix_runner.rs @@ -324,6 +324,18 @@ impl UnixIntegrationRunner { duration: cleanup_start.elapsed(), }); + // Phase: write captured logs to disk so the artifact upload picks them up. Matches the + // Docker runner's behavior; without this the artifact only contains result.log and a + // failed assertion's truncated context is all we have to debug from. + let write_logs_start = Instant::now(); + if let Err(e) = self.write_logs().await { + debug!(test = %test_name, error = %e, "Failed to write captured logs to disk."); + } + phase_timings.push(PhaseTiming { + phase: "write_logs".to_string(), + duration: write_logs_start.elapsed(), + }); + let passed = assertion_results.iter().all(|r| r.passed); TestResult { name: test_name, @@ -351,6 +363,29 @@ impl UnixIntegrationRunner { mappings } + async fn write_logs(&self) -> Result<(), GenericError> { + use std::io::Write as _; + + let log_dir = self.tctx.log_dir(); + let buffer = self.log_buffer.read().await; + + let stdout_path = log_dir.join("stdout.log"); + let mut stdout_file = std::fs::File::create(&stdout_path) + .with_error_context(|| format!("Failed to create stdout log at '{}'.", stdout_path.display()))?; + for line in &buffer.stdout { + writeln!(stdout_file, "{}", line).error_context("Failed to write stdout log line.")?; + } + + let stderr_path = log_dir.join("stderr.log"); + let mut stderr_file = std::fs::File::create(&stderr_path) + .with_error_context(|| format!("Failed to create stderr log at '{}'.", stderr_path.display()))?; + for line in &buffer.stderr { + writeln!(stderr_file, "{}", line).error_context("Failed to write stderr log line.")?; + } + + Ok(()) + } + async fn 
run_assertions( &self, process_display_name: String, exit_token: CancellationToken, exit_code_cell: airlock::unix::ExitCodeCell, ) -> Vec { From c84babec132e4f069244bf08f5f229674a2b4414 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Thu, 28 May 2026 13:54:35 -0400 Subject: [PATCH 40/56] ci: sweep stranded agent-data-plane processes from prior runs The before_script pkill catches stranded Core Agent processes (they live under /tmp/saluki-dda/), but ADP itself lives at $CI_PROJECT_DIR/target/release/agent-data-plane and was sneaking past the existing pattern. A stranded ADP from a prior pipeline holds UDP 58125 (the shifted DSD port) and other listen ports across runs, which breaks every test on the next pipeline with 'Address already in use' during 'dsd_in' source startup — even though our own Core Agent comes up cleanly. Add a second pkill that matches '/target/release/agent-data-plane' so both the Core Agent and any stranded ADP get cleaned up between runs. --- .gitlab/e2e.yml | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/.gitlab/e2e.yml b/.gitlab/e2e.yml index 1d3e0264520..a3572903f9f 100644 --- a/.gitlab/e2e.yml +++ b/.gitlab/e2e.yml @@ -137,13 +137,16 @@ test-integration: variables: PANORAMIC_LOG_DIR: integration-logs before_script: - # Defensive: clean up any leftover Agent processes from prior runs on this shared runner. - # All test-Agent ports are shifted out of the canonical range (see + # Defensive: clean up any leftover Agent/ADP processes from prior runs on this shared + # runner. All test-Agent ports are shifted out of the canonical range (see # panoramic::unix_runner::test_port_isolation_env), so we don't need to touch a system - # install at /opt/datadog-agent; we only sweep our own sandbox under /tmp/saluki-dda where - # a stranded trace-agent / process-agent from a prior run could still hold our shifted - # ports and break the first converged test. + # install at /opt/datadog-agent.
We do need to sweep: + # - our own Core Agent sandbox under /tmp/saluki-dda (trace-agent / process-agent + # children that survived a non-graceful job termination still hold our shifted ports) + # - any stranded agent-data-plane process from a prior pipeline (built into + # $CI_PROJECT_DIR/target/release/, holds UDP 58125 / TCP 5100–5102 etc. across runs) - sudo pkill -9 -f /tmp/saluki-dda/ || true + - sudo pkill -9 -f /target/release/agent-data-plane || true script: - make test-integration-macos-ci From c2423c8a0d0622a659fb626e43b82d539b4c9049 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Thu, 28 May 2026 14:24:13 -0400 Subject: [PATCH 41/56] ci: move macOS arm64 jobs to the shared virtualized Tart runner pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adapts the change from #1747 to all macOS arm64 jobs in this branch: the existing two unit-test jobs that #1747 covers (unit-tests-macos-arm64, unit-tests-miri-macos-arm64) plus the integration job we add here (test-integration-macos-arm64). All three already extend the `.macos-arm64-test-job` mixin, so updating the mixin once flips them as a set. Benefits per the upstream PR: fresh-VM isolation between pipelines, host toolchain decoupling, two jobs can share a single mac2.metal host, better utilization → fewer instances needed. amd64 stays on the dedicated bare-metal pool for now; the equivalent amd64 Tart infra isn't yet in place. Comment block reworked to document the asymmetry. --- .gitlab-ci.yml | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index c0c80d9c774..4cc71b8561c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -160,17 +160,23 @@ default: KUBERNETES_MEMORY_REQUEST: "8Gi" KUBERNETES_MEMORY_LIMIT: "12Gi" -# Shared mixins for macOS bare-metal runner jobs. +# Shared mixins for macOS runner jobs. +# +# arm64 jobs use the shared virtualized macOS Tart runner pool (`macos:tart`). 
Each job runs +# in a fresh VM, which gives clean isolation between pipelines, decouples the host toolchain, +# and lets two jobs share a single `mac2.metal` host. amd64 still uses the dedicated bare-metal +# pool (`macos:sonoma-amd64`); migrating it requires equivalent infra that isn't yet in place. # # `interruptible: true` is set here so every macOS job inherits it. macOS runner capacity in -# the GitLab fleet is constrained; auto-cancelling superseded pipelines (for example, after a -# quick fixup push to an open PR) frees the runner immediately instead of holding the slot -# for the duration of the now-stale run. See: +# the GitLab fleet is finite; auto-cancelling superseded pipelines (for example, after a quick +# fixup push to an open PR) frees the runner immediately instead of holding the slot for the +# duration of the now-stale run. See: # https://docs.gitlab.com/ci/yaml/#interruptible .macos-amd64-test-job: tags: ["macos:sonoma-amd64", "specific:true"] interruptible: true .macos-arm64-test-job: - tags: ["macos:sonoma-arm64", "specific:true"] + tags: ["macos:tart"] + image: 486234852809.dkr.ecr.us-east-1.amazonaws.com/ci/ci-platform-machine-images/tart-vm:saluki-sonoma-latest interruptible: true From fe5e9a4221eb1b00f66b6526c65ed91181d26da9 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Thu, 28 May 2026 14:49:37 -0400 Subject: [PATCH 42/56] refactor: simplification pass on the macos integration changes Dead code and over-engineering called out in self-review. No behavior changes; CI was green before this commit. Dropped: * process_exits.rs (whole file): ProcessExitsWithAssertion was unused after adp_exits_with subsumed it, and its docker-API-based exit-code lookup wouldn't have worked in converged tests anyway (s6 keeps the container alive across ADP restarts). * The AssertionConfig::ProcessExitsWith enum variant and its dispatch arm / match-arm mentions in assertions/mod.rs + config.rs. 
* UnixProcess::exit_token field and the 'defensive' cancel in cleanup(): the watcher task always cancels the token before cleanup() awaits its handle, so the cancel was unreachable. The watcher captures the token directly now. * 'Real exit watcher' wording (was a historical contrast with an earlier dummy version). * Duplicate inline 'PGID == child PID' comment in airlock::unix (already documented on the field). * Duplicate 'Forced runner-owned bindings' comment block in unix_runner.rs (same content was given in two adjacent blocks). * Defensive hdiutil detach of a freshly-mktemp'd path in provision-macos-test-env (impossible-to-be-mounted). Simplified: * PanoramicLogSink::push_line: dropped the try_write fast-path / spawn-fallback dichotomy. Just always spawn the write task. Fewer code paths and removes the race that could let line B land before line A (line A took the spawn path, line B succeeded sync). * 'mktemp -d ; rm -rf' for the pkgutil --expand-full target dir collapsed to one logical line. * AdpExitsWithAssertion doc rewritten now that it's the only exit assertion (no longer awkwardly contrasts with a removed peer). * file_contains::read_file_local doc flow fix. Net: 173 deletions, 42 insertions across 8 files. Build, clippy, vale all clean. 
--- Makefile | 4 +- bin/correctness/airlock/src/unix.rs | 20 ++-- .../panoramic/src/assertions/adp_exits.rs | 18 ++-- .../panoramic/src/assertions/file_contains.rs | 6 +- .../panoramic/src/assertions/mod.rs | 5 - .../panoramic/src/assertions/process_exits.rs | 94 ------------------- bin/correctness/panoramic/src/config.rs | 16 +--- bin/correctness/panoramic/src/unix_runner.rs | 52 ++++------ 8 files changed, 42 insertions(+), 173 deletions(-) delete mode 100644 bin/correctness/panoramic/src/assertions/process_exits.rs diff --git a/Makefile b/Makefile index e6b4ba8cc0d..f89c57e846e 100644 --- a/Makefile +++ b/Makefile @@ -617,11 +617,9 @@ provision-macos-test-env: ## Installs the pinned Datadog Agent ($(MACOS_TEST_AGE curl -fL "$(MACOS_TEST_AGENT_DMG_URL)" -o "$$DMG_PATH"; \ fi; \ MOUNT_DIR=$$(mktemp -d /tmp/saluki-dda-mount-XXXXXX); \ - hdiutil detach "$$MOUNT_DIR" 2>/dev/null || true; \ hdiutil attach "$$DMG_PATH" -mountpoint "$$MOUNT_DIR" -nobrowse >/dev/null; \ PKG=$$(find "$$MOUNT_DIR" -name '*.pkg' | head -1); \ - EXPAND_DIR=$$(mktemp -d /tmp/saluki-dda-expand-XXXXXX); \ - rm -rf "$$EXPAND_DIR"; \ + EXPAND_DIR=$$(mktemp -d /tmp/saluki-dda-expand-XXXXXX) && rm -rf "$$EXPAND_DIR"; \ pkgutil --expand-full "$$PKG" "$$EXPAND_DIR" >/dev/null; \ hdiutil detach "$$MOUNT_DIR" >/dev/null; \ rmdir "$$MOUNT_DIR" 2>/dev/null || true; \ diff --git a/bin/correctness/airlock/src/unix.rs b/bin/correctness/airlock/src/unix.rs index 4d59f96cf58..6c39e8cb3a5 100644 --- a/bin/correctness/airlock/src/unix.rs +++ b/bin/correctness/airlock/src/unix.rs @@ -102,7 +102,6 @@ pub struct UnixProcess { /// PGID of the spawned process. We made the child the group leader at spawn time, so this /// equals the child's PID. `None` only if spawn failed to return a PID (very rare). 
process_group: Option, - exit_token: CancellationToken, exit_code: ExitCodeCell, log_tasks: Vec>, exit_task: Option>, @@ -136,7 +135,6 @@ impl UnixProcess { .spawn() .with_error_context(|| format!("Failed to spawn '{}'.", config.binary_path.display()))?; - // PGID == child PID since we made the child the group leader (process_group(0)). let process_group = child.id().map(|pid| pid as i32); let stdout = child @@ -151,13 +149,11 @@ impl UnixProcess { let stdout_task = spawn_log_pump(stdout, log_sink.clone(), false); let stderr_task = spawn_log_pump(stderr, log_sink, true); - // Real exit watcher: moves the child into the task, calls `wait()`, records the exit - // code, and fires the exit token so blocked assertions (process_stable_for / - // adp_exits_with) unblock immediately rather than waiting for the test's own - // cleanup phase. + // Exit watcher: moves the child into the task, calls `wait()`, records the exit code, + // and fires the exit token so blocked assertions (process_stable_for / adp_exits_with) + // unblock immediately rather than waiting for the test's own cleanup phase. let exit_code: ExitCodeCell = Arc::new(OnceLock::new()); let exit_code_for_watcher = exit_code.clone(); - let exit_token_for_watcher = exit_token.clone(); let name_for_watcher = config.name.clone(); let exit_task = tokio::spawn(async move { match child.wait().await { @@ -171,13 +167,12 @@ impl UnixProcess { let _ = exit_code_for_watcher.set(None); } } - exit_token_for_watcher.cancel(); + exit_token.cancel(); }); Ok(Self { name: config.name, process_group, - exit_token, exit_code, log_tasks: vec![stdout_task, stderr_task], exit_task: Some(exit_task), @@ -215,14 +210,11 @@ impl UnixProcess { } } - // The exit watcher will have observed the kill and set the exit code + fired the token. - // Join it so we don't leak the task. + // The exit watcher will have observed the kill, set the exit code, and fired the exit + // token. Join it (and the log pumps) so we don't leak tasks. 
if let Some(handle) = self.exit_task.take() { let _ = handle.await; } - // Defensive: make sure the token is fired even if the watcher never set it (for example, - // on a failed wait). - self.exit_token.cancel(); for handle in self.log_tasks.drain(..) { let _ = handle.await; } diff --git a/bin/correctness/panoramic/src/assertions/adp_exits.rs b/bin/correctness/panoramic/src/assertions/adp_exits.rs index ca793a4dde4..eec0d1f6f12 100644 --- a/bin/correctness/panoramic/src/assertions/adp_exits.rs +++ b/bin/correctness/panoramic/src/assertions/adp_exits.rs @@ -5,16 +5,18 @@ use crate::{ config::LogStream, }; -/// Assertion that checks ADP exited with a specific exit code, abstracting over the runtime. +/// Assertion that checks ADP exited with a specific exit code. /// -/// On the `docker` runtime ADP runs under s6, which keeps the container alive across ADP -/// restarts and logs `"agent-data-plane exited with code N"` from -/// `docker/s6-services/agent-data-plane/finish` when ADP exits. We grep the captured log buffer -/// for that line. +/// The detection mechanism differs per runtime because ADP isn't a top-level process in every +/// case: /// -/// On the `mac` runtime there is no supervisor. The Unix runner observes ADP's child process -/// exit directly and records the exit code in the shared cell on -/// [`AssertionContext::host_process_exit_code`]. +/// - **`docker`**: ADP runs under s6 inside the converged container; s6 keeps the container +/// alive across ADP restarts and logs `"agent-data-plane exited with code N"` from +/// `docker/s6-services/agent-data-plane/finish` when ADP exits. We grep the captured log +/// buffer for that line. +/// - **`mac`** (and any host-process runtime): the Unix runner observes ADP's child process +/// exit directly and records the exit code in the shared cell on +/// [`AssertionContext::host_process_exit_code`]; we read it from there. 
pub struct AdpExitsWithAssertion { expected_code: i64, timeout: Duration, diff --git a/bin/correctness/panoramic/src/assertions/file_contains.rs b/bin/correctness/panoramic/src/assertions/file_contains.rs index 39a051d5975..79400f5c753 100644 --- a/bin/correctness/panoramic/src/assertions/file_contains.rs +++ b/bin/correctness/panoramic/src/assertions/file_contains.rs @@ -139,9 +139,9 @@ impl Assertion for FileContainsAssertion { /// Reads a file from the host filesystem. /// /// Used by the `mac` runtime (and any future host-process runtime) where ADP runs as a local -/// process and writes log files to -/// real host paths. Returns the same shape as [`read_file_in_container`]: `Ok(Some(contents))` -/// when readable, `Ok(None)` when missing, `Err` for unexpected I/O failures. +/// process and writes log files to real host paths. Returns the same shape as +/// [`read_file_in_container`]: `Ok(Some(contents))` when readable, `Ok(None)` when missing +/// or unreadable, `Err` for unexpected I/O failures. async fn read_file_local(path: &str) -> Result, String> { match tokio::fs::read_to_string(path).await { Ok(contents) => Ok(Some(contents)), diff --git a/bin/correctness/panoramic/src/assertions/mod.rs b/bin/correctness/panoramic/src/assertions/mod.rs index 2667008ec78..0aa8d429dd9 100644 --- a/bin/correctness/panoramic/src/assertions/mod.rs +++ b/bin/correctness/panoramic/src/assertions/mod.rs @@ -13,7 +13,6 @@ mod file_contains; mod http_check; mod log_contains; mod port_listening; -mod process_exits; mod process_stable; pub use adp_exits::AdpExitsWithAssertion; @@ -21,7 +20,6 @@ pub use file_contains::FileContainsAssertion; pub use http_check::HttpCheckAssertion; pub use log_contains::{LogContainsAssertion, LogNotContainsAssertion}; pub use port_listening::PortListeningAssertion; -pub use process_exits::ProcessExitsWithAssertion; pub use process_stable::ProcessStableForAssertion; /// Result of running an assertion. 
@@ -133,9 +131,6 @@ pub trait Assertion: Send + Sync { pub fn create_assertion(config: &AssertionConfig) -> Result, GenericError> { match config { AssertionConfig::ProcessStableFor { duration } => Ok(Box::new(ProcessStableForAssertion::new(duration.0))), - AssertionConfig::ProcessExitsWith { expected_code, timeout } => { - Ok(Box::new(ProcessExitsWithAssertion::new(*expected_code, timeout.0))) - } AssertionConfig::AdpExitsWith { expected_code, timeout } => { Ok(Box::new(AdpExitsWithAssertion::new(*expected_code, timeout.0))) } diff --git a/bin/correctness/panoramic/src/assertions/process_exits.rs b/bin/correctness/panoramic/src/assertions/process_exits.rs deleted file mode 100644 index 8906599d848..00000000000 --- a/bin/correctness/panoramic/src/assertions/process_exits.rs +++ /dev/null @@ -1,94 +0,0 @@ -use std::time::{Duration, Instant}; - -use crate::assertions::{Assertion, AssertionContext, AssertionResult}; - -/// Assertion that checks the container process exits with a specific exit code. -/// -/// Currently implemented only for the docker runtime. The `mac` runtime uses the -/// runtime-aware [`AdpExitsWithAssertion`][crate::assertions::AdpExitsWithAssertion] instead -/// (which delegates to the per-process exit code cell on host-process runtimes). -pub struct ProcessExitsWithAssertion { - expected_code: i64, - timeout: Duration, -} - -impl ProcessExitsWithAssertion { - pub fn new(expected_code: i64, timeout: Duration) -> Self { - Self { expected_code, timeout } - } -} - -#[async_trait::async_trait] -impl Assertion for ProcessExitsWithAssertion { - fn name(&self) -> &'static str { - "process_exits_with" - } - - fn description(&self) -> String { - format!( - "Process exits with code {} within {:?}.", - self.expected_code, self.timeout - ) - } - - async fn check(&self, ctx: &AssertionContext) -> AssertionResult { - let started = Instant::now(); - - tokio::select! 
{ - // Container exited - check exit code via Docker API - _ = ctx.container_exit_token.cancelled() => { - let docker: bollard::Docker = match airlock::docker::connect() { - Ok(d) => d, - Err(e) => { - return AssertionResult { - name: self.name().to_string(), - passed: false, - message: format!("Failed to connect to Docker: {}", e), - duration: started.elapsed(), - }; - } - }; - - match docker.inspect_container(&ctx.container_name, None).await { - Ok(container) => { - let exit_code = container.state.and_then(|s| s.exit_code).unwrap_or(-1); - if exit_code == self.expected_code { - AssertionResult { - name: self.name().to_string(), - passed: true, - message: format!("Process exited with expected code {}.", exit_code), - duration: started.elapsed(), - } - } else { - AssertionResult { - name: self.name().to_string(), - passed: false, - message: format!( - "Process exited with code {}, expected {}.", - exit_code, self.expected_code - ), - duration: started.elapsed(), - } - } - } - Err(e) => AssertionResult { - name: self.name().to_string(), - passed: false, - message: format!("Failed to inspect container: {}", e), - duration: started.elapsed(), - } - } - } - - // Timeout waiting for the container to exit. - _ = tokio::time::sleep(self.timeout) => { - AssertionResult { - name: self.name().to_string(), - passed: false, - message: format!("Process did not exit within {:?}.", self.timeout), - duration: started.elapsed(), - } - } - } - } -} diff --git a/bin/correctness/panoramic/src/config.rs b/bin/correctness/panoramic/src/config.rs index b4d37102001..b67630f77aa 100644 --- a/bin/correctness/panoramic/src/config.rs +++ b/bin/correctness/panoramic/src/config.rs @@ -228,14 +228,6 @@ pub enum AssertionConfig { duration: HumanDuration, }, - /// Check that the process exits with a specific exit code. - ProcessExitsWith { - /// The expected exit code. - expected_code: i64, - /// Timeout for waiting for the process to exit. 
- timeout: HumanDuration, - }, - /// Check that ADP itself exits with a specific exit code, abstracting over the runtime's /// observation mechanism. /// @@ -367,9 +359,7 @@ impl AssertionConfig { crate::dynamic_vars::resolve_placeholders(p, vars); } } - AssertionConfig::ProcessStableFor { .. } - | AssertionConfig::ProcessExitsWith { .. } - | AssertionConfig::AdpExitsWith { .. } => {} + AssertionConfig::ProcessStableFor { .. } | AssertionConfig::AdpExitsWith { .. } => {} } } @@ -392,9 +382,7 @@ impl AssertionConfig { crate::dynamic_vars::find_unresolved(p, &mut out); } } - AssertionConfig::ProcessStableFor { .. } - | AssertionConfig::ProcessExitsWith { .. } - | AssertionConfig::AdpExitsWith { .. } => {} + AssertionConfig::ProcessStableFor { .. } | AssertionConfig::AdpExitsWith { .. } => {} } out } diff --git a/bin/correctness/panoramic/src/unix_runner.rs b/bin/correctness/panoramic/src/unix_runner.rs index bf34d3c1d9f..b3a84039c64 100644 --- a/bin/correctness/panoramic/src/unix_runner.rs +++ b/bin/correctness/panoramic/src/unix_runner.rs @@ -195,25 +195,19 @@ impl UnixIntegrationRunner { }; debug!(test = %test_name, binary = %agent_binary.display(), "Resolved Core Agent binary path."); - // The Agent and ADP must agree on the auth_token / ipc_cert.pem path. The Agent's - // authoritative config (sent to ADP via the config stream) overrides ADP's env vars - // by design, so the Agent must itself be told about the per-test path — otherwise - // it advertises the platform default (`/opt/datadog-agent/etc/auth_token`), ADP - // follows that advice for its post-config-stream IPC clients, and TLS fails with - // UnknownIssuer because the platform default cert does not match what the per-test - // Agent is actually serving. // Forced runner-owned bindings: - // DD_AUTH_TOKEN_FILE_PATH: pin to the per-test path. 
The Agent's authoritative - // config (sent to ADP via the config stream) would otherwise advertise the - // platform default, ADP would follow that advice for its post-config-stream IPC - // clients, and TLS would fail with UnknownIssuer because the platform default - // cert does not match what the per-test Agent is actually serving. - // DD_RUN_PATH: the Agent's default `run_path` is the install prefix's `run/` - // directory (e.g., /opt/datadog-agent/run). Without overriding it, a relocated - // Agent install would try to write its runtime state (remote-config db, - // sockets, pid file) back to the canonical /opt path — typically not writable - // in CI. Scope it to the per-test state directory so each test gets a clean - // slate and nothing leaks across runs. + // DD_AUTH_TOKEN_FILE_PATH — pin Agent + ADP to the same per-test path. The Agent's + // authoritative config (sent to ADP via the config stream) overrides ADP's env + // vars, so the Agent itself must be told about the per-test path; otherwise it + // advertises the platform default (`/opt/datadog-agent/etc/auth_token`), ADP + // follows that advice for its post-config-stream IPC clients, and TLS fails + // with UnknownIssuer because the platform default cert does not match what the + // per-test Agent is actually serving. + // DD_RUN_PATH — Agent's default `run_path` is the install prefix's `run/` dir + // (e.g., /opt/datadog-agent/run). Without overriding, a relocated Agent install + // would try to write its runtime state (remote-config db, sockets, pid file) + // back to /opt — typically not writable in CI. Scope it to the per-test state + // directory so each test gets a clean slate and nothing leaks across runs. let agent_env = build_process_env( &self.test_case.container.env, &[ @@ -481,24 +475,18 @@ struct PanoramicLogSink { impl LogSink for PanoramicLogSink { fn push_line(&mut self, line: String, is_stderr: bool) { - // Try a non-blocking write first. 
If contended, spawn a task to defer the write so we - // don't stall the log pump (which is itself a tokio task). - if let Ok(mut buf) = self.buf.try_write() { + // The log pump (in airlock::unix) holds the LogSink's outer mutex while calling us, + // so writes from a single pump are already serialized. Spawn a small task to actually + // append to the buffer so we never block the pump on `.write().await` ordering with + // concurrent assertion readers. + let buf = self.buf.clone(); + tokio::spawn(async move { + let mut buf = buf.write().await; if is_stderr { buf.stderr.push(line); } else { buf.stdout.push(line); } - } else { - let buf = self.buf.clone(); - tokio::spawn(async move { - let mut buf = buf.write().await; - if is_stderr { - buf.stderr.push(line); - } else { - buf.stdout.push(line); - } - }); - } + }); } } From 60efe1c34b95cfdb892427ef0e5a8c4a709e4f0f Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Thu, 28 May 2026 15:16:51 -0400 Subject: [PATCH 43/56] refactor(panoramic): hoist 'env' out of 'container' into a top-level field Environment variables are applied identically by both runtimes (docker injects them into the container env, the Unix runner passes them to the spawned Agent + ADP processes), so they belong at the top level of the integration test config, not nested under 'container' — which implies they're a docker-specific knob. Schema: new top-level 'env: map' field on IntegrationConfig. ContainerConfig's 'env' field removed. All 28 test configs migrated; the readers in panoramic (runner.rs, unix_runner.rs, dynamic_vars.rs) updated to 'self.test_case.env'. Local sanity-checked all 24 mac-runtime tests still pass. Also fills out the framework port-isolation env helper to cover ADP's own listen addresses (data_plane.api_listen_address, .secure_api_listen_address, .telemetry_listen_addr) and the OTLP receiver endpoints (otlp_config.receiver.protocols.{grpc,http}.endpoint) that I'd previously left on defaults.
Those defaults conflict between two ADP instances on the same host (which the local dev flow hits if a prior 'make test-integration-macos-run' left strays behind, and which any concurrent local invocation would also hit). Using double underscores at every dot boundary, as required by saluki-config's 'Env::split("__")' nesting convention. Updated tests: - privileged-api-endpoints: 5101 -> 55101 - unprivileged-api-endpoints: 5100 -> 55100 - telemetry-endpoint: 5100 -> 55100 - otlp-traces-enabled: 4317 -> 54317, 4318 -> 54318 - dogstatsd-forwarding (Python harness): telemetry URL port 5100 -> 55100 --- bin/correctness/panoramic/src/config.rs | 12 +++-- bin/correctness/panoramic/src/dynamic_vars.rs | 2 +- bin/correctness/panoramic/src/runner.rs | 8 ++-- bin/correctness/panoramic/src/unix_runner.rs | 48 ++++++++++++++----- .../cases/adp-cmd-port/config.yaml | 17 +++---- .../cases/adp-config-check-exit/config.yaml | 17 +++---- .../cases/adp-config-check-warn/config.yaml | 17 +++---- .../cases/adp-config-stream/config.yaml | 15 +++--- .../cases/adp-disabled-exit/config.yaml | 11 +++-- .../adp-logging-default-path/config.yaml | 13 ++--- .../config.yaml | 15 +++--- .../config.yaml | 15 +++--- .../adp-memory-mode-disabled/config.yaml | 19 ++++---- .../config.yaml | 21 ++++---- .../config.yaml | 17 +++---- .../config.yaml | 21 ++++---- .../config.yaml | 17 +++---- .../cases/adp-no-pipelines-exit/config.yaml | 15 +++--- .../cases/adp-rar-disabled/config.yaml | 21 ++++---- .../cases/adp-rar-registration/config.yaml | 15 +++--- .../cases/basic-startup/config.yaml | 11 +++-- .../cases/dogstatsd-autoscale-udp/config.yaml | 17 +++---- .../config.yaml | 23 ++++----- .../cases/dogstatsd-bind-host/config.yaml | 17 +++---- .../cases/dogstatsd-default-bind/config.yaml | 13 ++--- .../cases/dogstatsd-enabled/config.yaml | 15 +++--- .../cases/dogstatsd-forwarding/config.yaml | 25 +++++----- .../run_forwarding_test.py | 4 +- .../config.yaml | 23 ++++----- 
.../cases/otlp-traces-enabled/config.yaml | 25 +++++----- .../privileged-api-endpoints/config.yaml | 27 ++++++----- .../cases/telemetry-endpoint/config.yaml | 21 ++++---- .../unprivileged-api-endpoints/config.yaml | 21 ++++---- 33 files changed, 317 insertions(+), 261 deletions(-) diff --git a/bin/correctness/panoramic/src/config.rs b/bin/correctness/panoramic/src/config.rs index b67630f77aa..d602aeaa9f0 100644 --- a/bin/correctness/panoramic/src/config.rs +++ b/bin/correctness/panoramic/src/config.rs @@ -111,6 +111,14 @@ pub struct IntegrationConfig { /// Container configuration. pub container: ContainerConfig, + /// Environment variables to set on the target process(es). + /// + /// Top-level (not under `container`) because both the docker and `mac` runtimes apply + /// these the same way — docker injects them as container env, the Unix runner passes them + /// to the spawned ADP / Core Agent processes. + #[serde(default)] + pub env: HashMap, + /// List of assertion steps to run. pub assertions: Vec, @@ -189,10 +197,6 @@ pub struct ContainerConfig { #[serde(default)] pub command: Vec, - /// Environment variables to set. - #[serde(default)] - pub env: HashMap, - /// Files to mount (host_path:container_path format). #[serde(default)] pub files: Vec, diff --git a/bin/correctness/panoramic/src/dynamic_vars.rs b/bin/correctness/panoramic/src/dynamic_vars.rs index ba82db791da..94a38e8eeef 100644 --- a/bin/correctness/panoramic/src/dynamic_vars.rs +++ b/bin/correctness/panoramic/src/dynamic_vars.rs @@ -72,7 +72,7 @@ const PLACEHOLDER_NEEDLE: &str = "{{PANORAMIC_DYNAMIC_"; /// Returns `true` if the test case defines any `PANORAMIC_DYNAMIC_*` env vars. pub fn has_dynamic_vars(test_case: &IntegrationConfig) -> bool { - test_case.container.env.keys().any(|k| k.starts_with(ENV_PREFIX)) + test_case.env.keys().any(|k| k.starts_with(ENV_PREFIX)) } /// Reads resolved dynamic variable values from `/airlock/dynamic/` inside the container. 
diff --git a/bin/correctness/panoramic/src/runner.rs b/bin/correctness/panoramic/src/runner.rs index 07cb06ebd75..30c48a38fc8 100644 --- a/bin/correctness/panoramic/src/runner.rs +++ b/bin/correctness/panoramic/src/runner.rs @@ -694,11 +694,11 @@ impl IntegrationRunner { let container = &self.test_case.container; // Merge framework-level port-isolation env vars with the test's own env. Framework - // defaults are applied first so the test's `container.env` (and any explicit override) - // takes precedence. Keeps the test surface consistent across the docker and `mac` - // runtimes — both see the same shifted port table. + // defaults are applied first so the test's `env` block takes precedence. Keeps the test + // surface consistent across the docker and `mac` runtimes — both see the same shifted + // port table. let mut merged_env = crate::unix_runner::test_port_isolation_env(); - for (k, v) in &container.env { + for (k, v) in &self.test_case.env { merged_env.insert(k.clone(), v.clone()); } let env_vars: Vec = merged_env.iter().map(|(k, v)| format!("{}={}", k, v)).collect(); diff --git a/bin/correctness/panoramic/src/unix_runner.rs b/bin/correctness/panoramic/src/unix_runner.rs index b3a84039c64..88adc9af0cf 100644 --- a/bin/correctness/panoramic/src/unix_runner.rs +++ b/bin/correctness/panoramic/src/unix_runner.rs @@ -57,22 +57,24 @@ const DEFAULT_CORE_AGENT_BINARY_PATH: &str = "/tmp/saluki-dda/datadog-agent/bin/ const CORE_AGENT_IPC_READY_TIMEOUT: Duration = Duration::from_secs(60); const CORE_AGENT_IPC_READY_POLL: Duration = Duration::from_millis(200); -/// Framework-level env overrides that move every conflict-prone default port off its canonical -/// value so the test Core Agent + ADP can coexist with a system Datadog Agent already running -/// on a shared CI runner. Tests can override any of these via `container.env`; tests that test -/// specific port behavior (`adp-cmd-port`) supply their own values. 
+/// Framework-level env overrides that move every default port the test target binds off its +/// canonical value, so concurrent test runs and any system Agent / system ADP on the host can +/// coexist with the per-test processes. Tests can override any of these via their `env` block; +/// tests that exercise specific port behavior (`adp-cmd-port`) supply their own values. /// /// Naming convention: every default port that's 4 digits gets a `5` prepended (8125 -> 58125, /// 5001 -> 55001, etc.). The GUI is disabled outright since we don't exercise it. /// -/// Scope is intentionally narrow: only ports a stock Datadog Agent binds by default. ADP's own -/// listen addresses (5100/5101/5102) and the OTLP receiver (4317/4318) are not bound by the -/// system Agent and don't need to be shifted; keeping them on defaults means tests can assert -/// against canonical port numbers with no extra plumbing. +/// Note on env-var nesting: saluki-config (and figment) split env-var names on `__` to map to +/// nested config keys. Single-underscore env vars like `DD_DATA_PLANE_API_LISTEN_ADDRESS` map +/// to the flat key `data_plane_api_listen_address` and are silently ignored; we use double +/// underscores at every dot boundary for the deep ADP / OTLP keys below. The top-level Agent +/// env vars (`DD_CMD_PORT` etc.) are explicitly queried by the Agent so they don't need it. pub fn test_port_isolation_env() -> HashMap { HashMap::from([ + // ----- Core Agent ports ----- // CMD/IPC API. Shared key between the Core Agent (listener) and ADP (IPC client). - // `adp-cmd-port` overrides this via container.env to validate the non-default path. + // `adp-cmd-port` overrides this via its `env` block to validate the non-default path. ("DD_CMD_PORT".to_string(), "55001".to_string()), // GUI — disabled outright. No integration test exercises it. 
("DD_GUI_PORT".to_string(), "-1".to_string()), @@ -86,6 +88,28 @@ pub fn test_port_isolation_env() -> HashMap { // DD_DATA_PLANE_ENABLED so this mainly affects ADP (the actual listener) and the // bootstrap-mode Agent. ("DD_DOGSTATSD_PORT".to_string(), "58125".to_string()), + // ----- ADP listen addresses ----- (URI-style; ListenAddress accepts `tcp://host:port`) + ( + "DD_DATA_PLANE__API_LISTEN_ADDRESS".to_string(), + "tcp://0.0.0.0:55100".to_string(), + ), + ( + "DD_DATA_PLANE__SECURE_API_LISTEN_ADDRESS".to_string(), + "tcp://0.0.0.0:55101".to_string(), + ), + ( + "DD_DATA_PLANE__TELEMETRY_LISTEN_ADDR".to_string(), + "tcp://0.0.0.0:55102".to_string(), + ), + // ----- OTLP receiver endpoints ----- (same shape as the Datadog Agent's OTLP env vars) + ( + "DD_OTLP_CONFIG__RECEIVER__PROTOCOLS__GRPC__ENDPOINT".to_string(), + "0.0.0.0:54317".to_string(), + ), + ( + "DD_OTLP_CONFIG__RECEIVER__PROTOCOLS__HTTP__ENDPOINT".to_string(), + "0.0.0.0:54318".to_string(), + ), ]) } @@ -93,7 +117,7 @@ pub fn test_port_isolation_env() -> HashMap { /// /// Precedence (lowest to highest): /// 1. framework port-isolation defaults (`test_port_isolation_env`) -/// 2. the test's declared `container.env` +/// 2. the test's top-level `env` block /// 3. forced overrides supplied by the caller (auth token path, run path, …) /// /// Forced overrides are bottom-of-stack from the framework's perspective but top-of-stack here @@ -209,7 +233,7 @@ impl UnixIntegrationRunner { // back to /opt — typically not writable in CI. Scope it to the per-test state // directory so each test gets a clean slate and nothing leaks across runs. 
let agent_env = build_process_env( - &self.test_case.container.env, + &self.test_case.env, &[ ("DD_AUTH_TOKEN_FILE_PATH", auth_token_path.clone()), ("DD_RUN_PATH", state_dir.to_string_lossy().into_owned()), @@ -267,7 +291,7 @@ impl UnixIntegrationRunner { } else { Vec::new() }; - let adp_env = build_process_env(&self.test_case.container.env, &adp_forced); + let adp_env = build_process_env(&self.test_case.env, &adp_forced); let process_config = UnixProcessConfig::new(self.test_case.name.clone(), binary_path) .with_args(vec!["-c".to_string(), config_path_str, "run".to_string()]) .with_env_map(adp_env); diff --git a/test/integration/cases/adp-cmd-port/config.yaml b/test/integration/cases/adp-cmd-port/config.yaml index 77f21060ade..ed09720193a 100644 --- a/test/integration/cases/adp-cmd-port/config.yaml +++ b/test/integration/cases/adp-cmd-port/config.yaml @@ -18,16 +18,17 @@ timeout: 120s runtimes: [docker, mac] requires_core_agent: true +env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test-cmd-port" + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_STANDALONE_MODE: "false" + DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" + DD_DATA_PLANE_USE_NEW_CONFIG_STREAM_ENDPOINT: "true" + DD_CMD_PORT: "7777" + container: image: "saluki-images/datadog-agent:testing-devel" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test-cmd-port" - DD_DATA_PLANE_ENABLED: "true" - DD_DATA_PLANE_STANDALONE_MODE: "false" - DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" - DD_DATA_PLANE_USE_NEW_CONFIG_STREAM_ENDPOINT: "true" - DD_CMD_PORT: "7777" assertions: # ADP should reach out for config from the core agent on the correct port. 
diff --git a/test/integration/cases/adp-config-check-exit/config.yaml b/test/integration/cases/adp-config-check-exit/config.yaml index 143c84fe362..eb4ed9881e9 100644 --- a/test/integration/cases/adp-config-check-exit/config.yaml +++ b/test/integration/cases/adp-config-check-exit/config.yaml @@ -13,16 +13,17 @@ timeout: 120s runtimes: [docker, mac] requires_core_agent: true +env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test-config-exit" + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_STANDALONE_MODE: "false" + DD_DATA_PLANE_USE_NEW_CONFIG_STREAM_ENDPOINT: "true" + DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" + DD_HEROKU_DYNO: "true" # Incompatible(High) - triggers error and exit + container: image: "saluki-images/datadog-agent:testing-devel" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test-config-exit" - DD_DATA_PLANE_ENABLED: "true" - DD_DATA_PLANE_STANDALONE_MODE: "false" - DD_DATA_PLANE_USE_NEW_CONFIG_STREAM_ENDPOINT: "true" - DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" - DD_HEROKU_DYNO: "true" # Incompatible(High) - triggers error and exit assertions: - type: log_contains diff --git a/test/integration/cases/adp-config-check-warn/config.yaml b/test/integration/cases/adp-config-check-warn/config.yaml index 72f8f32ce49..d5e5854b377 100644 --- a/test/integration/cases/adp-config-check-warn/config.yaml +++ b/test/integration/cases/adp-config-check-warn/config.yaml @@ -10,16 +10,17 @@ timeout: 120s runtimes: [docker, mac] requires_core_agent: true +env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test-config-warn" + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_STANDALONE_MODE: "false" + DD_DATA_PLANE_USE_NEW_CONFIG_STREAM_ENDPOINT: "true" + DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" + DD_DOGSTATSD_TELEMETRY_ENABLED_LISTENER_ID: "true" # Incompatible(Medium) - triggers generic warning + container: image: "saluki-images/datadog-agent:testing-devel" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test-config-warn" - 
DD_DATA_PLANE_ENABLED: "true" - DD_DATA_PLANE_STANDALONE_MODE: "false" - DD_DATA_PLANE_USE_NEW_CONFIG_STREAM_ENDPOINT: "true" - DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" - DD_DOGSTATSD_TELEMETRY_ENABLED_LISTENER_ID: "true" # Incompatible(Medium) - triggers generic warning assertions: - type: log_contains diff --git a/test/integration/cases/adp-config-stream/config.yaml b/test/integration/cases/adp-config-stream/config.yaml index 8e292eb314b..b7824b14cae 100644 --- a/test/integration/cases/adp-config-stream/config.yaml +++ b/test/integration/cases/adp-config-stream/config.yaml @@ -5,15 +5,16 @@ timeout: 120s runtimes: [docker, mac] requires_core_agent: true +env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test-configstream" + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_STANDALONE_MODE: "false" + DD_DATA_PLANE_USE_NEW_CONFIG_STREAM_ENDPOINT: "true" + DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" + container: image: "saluki-images/datadog-agent:testing-devel" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test-configstream" - DD_DATA_PLANE_ENABLED: "true" - DD_DATA_PLANE_STANDALONE_MODE: "false" - DD_DATA_PLANE_USE_NEW_CONFIG_STREAM_ENDPOINT: "true" - DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" exposed_ports: - "58125/udp" diff --git a/test/integration/cases/adp-disabled-exit/config.yaml b/test/integration/cases/adp-disabled-exit/config.yaml index f0fdca080ff..d8090cfd691 100644 --- a/test/integration/cases/adp-disabled-exit/config.yaml +++ b/test/integration/cases/adp-disabled-exit/config.yaml @@ -5,13 +5,14 @@ timeout: 120s runtimes: [docker, mac] requires_core_agent: true +env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test" + DD_DATA_PLANE_ENABLED: "false" + DD_DATA_PLANE_STANDALONE_MODE: "false" + container: image: "saluki-images/datadog-agent:testing-devel" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test" - DD_DATA_PLANE_ENABLED: "false" - DD_DATA_PLANE_STANDALONE_MODE: "false" assertions: # ADP should log 
that it's not enabled and exit. diff --git a/test/integration/cases/adp-logging-default-path/config.yaml b/test/integration/cases/adp-logging-default-path/config.yaml index 596f0cec00d..90f9e517c05 100644 --- a/test/integration/cases/adp-logging-default-path/config.yaml +++ b/test/integration/cases/adp-logging-default-path/config.yaml @@ -5,14 +5,15 @@ timeout: 120s runtimes: [docker, mac] requires_core_agent: true +env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test-default-log" + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_USE_NEW_CONFIG_STREAM_ENDPOINT: "true" + DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" + container: image: "saluki-images/datadog-agent:testing-devel" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test-default-log" - DD_DATA_PLANE_ENABLED: "true" - DD_DATA_PLANE_USE_NEW_CONFIG_STREAM_ENDPOINT: "true" - DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" exposed_ports: - "58125/udp" diff --git a/test/integration/cases/adp-logging-ignores-core-agent-log-file/config.yaml b/test/integration/cases/adp-logging-ignores-core-agent-log-file/config.yaml index 746e0bc0cab..0fcb1708daa 100644 --- a/test/integration/cases/adp-logging-ignores-core-agent-log-file/config.yaml +++ b/test/integration/cases/adp-logging-ignores-core-agent-log-file/config.yaml @@ -5,15 +5,16 @@ timeout: 120s runtimes: [docker, mac] requires_core_agent: true +env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test-ignore-core-log-file" + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_USE_NEW_CONFIG_STREAM_ENDPOINT: "true" + DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" + DD_LOG_FILE: "/tmp/coreagent-only.log" + container: image: "saluki-images/datadog-agent:testing-devel" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test-ignore-core-log-file" - DD_DATA_PLANE_ENABLED: "true" - DD_DATA_PLANE_USE_NEW_CONFIG_STREAM_ENDPOINT: "true" - DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" - DD_LOG_FILE: "/tmp/coreagent-only.log" exposed_ports: - "58125/udp" diff 
--git a/test/integration/cases/adp-logging-respects-data-plane-log-file/config.yaml b/test/integration/cases/adp-logging-respects-data-plane-log-file/config.yaml index f528356f88c..6514566959c 100644 --- a/test/integration/cases/adp-logging-respects-data-plane-log-file/config.yaml +++ b/test/integration/cases/adp-logging-respects-data-plane-log-file/config.yaml @@ -5,15 +5,16 @@ timeout: 120s runtimes: [docker, mac] requires_core_agent: true +env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test-data-plane-log-override" + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_USE_NEW_CONFIG_STREAM_ENDPOINT: "true" + DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" + DD_DATA_PLANE_LOG_FILE: "/tmp/adp-custom.log" + container: image: "saluki-images/datadog-agent:testing-devel" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test-data-plane-log-override" - DD_DATA_PLANE_ENABLED: "true" - DD_DATA_PLANE_USE_NEW_CONFIG_STREAM_ENDPOINT: "true" - DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" - DD_DATA_PLANE_LOG_FILE: "/tmp/adp-custom.log" exposed_ports: - "58125/udp" diff --git a/test/integration/cases/adp-memory-mode-disabled/config.yaml b/test/integration/cases/adp-memory-mode-disabled/config.yaml index 136390cbadd..107d4336ab5 100644 --- a/test/integration/cases/adp-memory-mode-disabled/config.yaml +++ b/test/integration/cases/adp-memory-mode-disabled/config.yaml @@ -4,17 +4,18 @@ description: "Verifies that memory limiting is disabled by default and bounds ve timeout: 120s runtimes: [docker, mac] +env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test" + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_STANDALONE_MODE: "true" + # An unrealistically small limit. With memory mode defaulted to "disabled", verification is + # skipped entirely and ADP uses an unbounded grant for the global limiter, so this value is + # intentionally ignored. 
+ DD_MEMORY_LIMIT: "1" + container: image: "saluki-images/datadog-agent:testing-devel" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test" - DD_DATA_PLANE_ENABLED: "true" - DD_DATA_PLANE_STANDALONE_MODE: "true" - # An unrealistically small limit. With memory mode defaulted to "disabled", verification is - # skipped entirely and ADP uses an unbounded grant for the global limiter, so this value is - # intentionally ignored. - DD_MEMORY_LIMIT: "1" assertions: - type: log_contains diff --git a/test/integration/cases/adp-memory-mode-permissive-exceeds-limit/config.yaml b/test/integration/cases/adp-memory-mode-permissive-exceeds-limit/config.yaml index 26c83dd4e1d..356ecc9292c 100644 --- a/test/integration/cases/adp-memory-mode-permissive-exceeds-limit/config.yaml +++ b/test/integration/cases/adp-memory-mode-permissive-exceeds-limit/config.yaml @@ -4,18 +4,19 @@ description: "Verifies that permissive mode emits a best-effort warning when the timeout: 120s runtimes: [docker, mac] +env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test" + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_STANDALONE_MODE: "true" + DD_MEMORY_MODE: "permissive" + DD_MEMORY_LIMIT: "256mb" + # Bumped well above the default of 1,000,000 to drive the firm bound for the aggregate + # transform far past the configured memory limit. + DD_AGGREGATE_CONTEXT_LIMIT: "10000000" + container: image: "saluki-images/datadog-agent:testing-devel" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test" - DD_DATA_PLANE_ENABLED: "true" - DD_DATA_PLANE_STANDALONE_MODE: "true" - DD_MEMORY_MODE: "permissive" - DD_MEMORY_LIMIT: "256mb" - # Bumped well above the default of 1,000,000 to drive the firm bound for the aggregate - # transform far past the configured memory limit. 
- DD_AGGREGATE_CONTEXT_LIMIT: "10000000" assertions: - type: log_contains diff --git a/test/integration/cases/adp-memory-mode-permissive-within-limit/config.yaml b/test/integration/cases/adp-memory-mode-permissive-within-limit/config.yaml index b61d873cc1c..3c0ef90d8f9 100644 --- a/test/integration/cases/adp-memory-mode-permissive-within-limit/config.yaml +++ b/test/integration/cases/adp-memory-mode-permissive-within-limit/config.yaml @@ -4,16 +4,17 @@ description: "Verifies that permissive mode succeeds and verifies bounds when th timeout: 120s runtimes: [docker, mac] +env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test" + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_STANDALONE_MODE: "true" + DD_MEMORY_MODE: "permissive" + # Generous enough to comfortably fit the default calculated bounds. + DD_MEMORY_LIMIT: "2gb" + container: image: "saluki-images/datadog-agent:testing-devel" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test" - DD_DATA_PLANE_ENABLED: "true" - DD_DATA_PLANE_STANDALONE_MODE: "true" - DD_MEMORY_MODE: "permissive" - # Generous enough to comfortably fit the default calculated bounds. 
- DD_MEMORY_LIMIT: "2gb" assertions: - type: log_contains diff --git a/test/integration/cases/adp-memory-mode-strict-exceeds-limit/config.yaml b/test/integration/cases/adp-memory-mode-strict-exceeds-limit/config.yaml index 92efa40f346..b15bf18a46b 100644 --- a/test/integration/cases/adp-memory-mode-strict-exceeds-limit/config.yaml +++ b/test/integration/cases/adp-memory-mode-strict-exceeds-limit/config.yaml @@ -4,18 +4,19 @@ description: "Verifies that strict mode causes ADP to exit with code 1 when the timeout: 120s runtimes: [docker, mac] +env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test" + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_STANDALONE_MODE: "true" + DD_MEMORY_MODE: "strict" + DD_MEMORY_LIMIT: "256mb" + # Bumped well above the default of 1,000,000 to drive the firm bound for the aggregate + # transform far past the configured memory limit. + DD_AGGREGATE_CONTEXT_LIMIT: "10000000" + container: image: "saluki-images/datadog-agent:testing-devel" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test" - DD_DATA_PLANE_ENABLED: "true" - DD_DATA_PLANE_STANDALONE_MODE: "true" - DD_MEMORY_MODE: "strict" - DD_MEMORY_LIMIT: "256mb" - # Bumped well above the default of 1,000,000 to drive the firm bound for the aggregate - # transform far past the configured memory limit. - DD_AGGREGATE_CONTEXT_LIMIT: "10000000" assertions: # Observe ADP's actual exit code, regardless of runtime. 
On docker (s6) this greps the diff --git a/test/integration/cases/adp-memory-mode-strict-within-limit/config.yaml b/test/integration/cases/adp-memory-mode-strict-within-limit/config.yaml index 31a8734b016..60615382c16 100644 --- a/test/integration/cases/adp-memory-mode-strict-within-limit/config.yaml +++ b/test/integration/cases/adp-memory-mode-strict-within-limit/config.yaml @@ -4,16 +4,17 @@ description: "Verifies that strict mode succeeds and verifies bounds when the ca timeout: 120s runtimes: [docker, mac] +env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test" + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_STANDALONE_MODE: "true" + DD_MEMORY_MODE: "strict" + # Generous enough to comfortably fit the default calculated bounds. + DD_MEMORY_LIMIT: "2gb" + container: image: "saluki-images/datadog-agent:testing-devel" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test" - DD_DATA_PLANE_ENABLED: "true" - DD_DATA_PLANE_STANDALONE_MODE: "true" - DD_MEMORY_MODE: "strict" - # Generous enough to comfortably fit the default calculated bounds. 
- DD_MEMORY_LIMIT: "2gb" assertions: - type: log_contains diff --git a/test/integration/cases/adp-no-pipelines-exit/config.yaml b/test/integration/cases/adp-no-pipelines-exit/config.yaml index 949e8533531..42dcff845b7 100644 --- a/test/integration/cases/adp-no-pipelines-exit/config.yaml +++ b/test/integration/cases/adp-no-pipelines-exit/config.yaml @@ -4,15 +4,16 @@ description: "Verify ADP exits with error when no data pipelines enabled" timeout: 120s runtimes: [docker, mac] +env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test" + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_STANDALONE_MODE: "true" + DD_DATA_PLANE_DOGSTATSD_ENABLED: "false" + DD_DATA_PLANE_OTLP_ENABLED: "false" + container: image: "saluki-images/datadog-agent:testing-devel" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test" - DD_DATA_PLANE_ENABLED: "true" - DD_DATA_PLANE_STANDALONE_MODE: "true" - DD_DATA_PLANE_DOGSTATSD_ENABLED: "false" - DD_DATA_PLANE_OTLP_ENABLED: "false" assertions: # ADP should log that no pipelines are enabled and exit. diff --git a/test/integration/cases/adp-rar-disabled/config.yaml b/test/integration/cases/adp-rar-disabled/config.yaml index 28a5ccc9b3d..8f0e342007b 100644 --- a/test/integration/cases/adp-rar-disabled/config.yaml +++ b/test/integration/cases/adp-rar-disabled/config.yaml @@ -9,18 +9,19 @@ runtimes: [docker] # runtime, so this assertion does not translate; re-enable once the Unix runner grows a # supervisor or the test is rewritten to assert on retry behavior directly. 
+env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test-rar-disabled" + # Core Agent setting to disable RAR + DD_REMOTE_AGENT_REGISTRY_ENABLED: "false" + # ADP settings - enable RAR integration (which will fail gracefully) + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_STANDALONE_MODE: "false" + DD_DATA_PLANE_REMOTE_AGENT_ENABLED: "true" + DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" + container: image: "saluki-images/datadog-agent:testing-devel" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test-rar-disabled" - # Core Agent setting to disable RAR - DD_REMOTE_AGENT_REGISTRY_ENABLED: "false" - # ADP settings - enable RAR integration (which will fail gracefully) - DD_DATA_PLANE_ENABLED: "true" - DD_DATA_PLANE_STANDALONE_MODE: "false" - DD_DATA_PLANE_REMOTE_AGENT_ENABLED: "true" - DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" exposed_ports: - "58125/udp" diff --git a/test/integration/cases/adp-rar-registration/config.yaml b/test/integration/cases/adp-rar-registration/config.yaml index 97958b982a1..1610a11b3a4 100644 --- a/test/integration/cases/adp-rar-registration/config.yaml +++ b/test/integration/cases/adp-rar-registration/config.yaml @@ -5,15 +5,16 @@ timeout: 120s runtimes: [docker, mac] requires_core_agent: true +env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test-rar" + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_STANDALONE_MODE: "false" + DD_DATA_PLANE_REMOTE_AGENT_ENABLED: "true" + DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" + container: image: "saluki-images/datadog-agent:testing-devel" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test-rar" - DD_DATA_PLANE_ENABLED: "true" - DD_DATA_PLANE_STANDALONE_MODE: "false" - DD_DATA_PLANE_REMOTE_AGENT_ENABLED: "true" - DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" exposed_ports: - "58125/udp" diff --git a/test/integration/cases/basic-startup/config.yaml b/test/integration/cases/basic-startup/config.yaml index 999b4f3ef80..715282c02c7 100644 --- 
a/test/integration/cases/basic-startup/config.yaml +++ b/test/integration/cases/basic-startup/config.yaml @@ -4,13 +4,14 @@ description: "Verifies ADP starts successfully and remains stable" timeout: 120s runtimes: [docker, mac] +env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test" + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_STANDALONE_MODE: "true" + container: image: "saluki-images/datadog-agent:testing-devel" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test" - DD_DATA_PLANE_ENABLED: "true" - DD_DATA_PLANE_STANDALONE_MODE: "true" assertions: # Make sure we always emit our initial startup log, which contains relevant information diff --git a/test/integration/cases/dogstatsd-autoscale-udp/config.yaml b/test/integration/cases/dogstatsd-autoscale-udp/config.yaml index 58e06c3084d..f0732e6ec7f 100644 --- a/test/integration/cases/dogstatsd-autoscale-udp/config.yaml +++ b/test/integration/cases/dogstatsd-autoscale-udp/config.yaml @@ -4,16 +4,17 @@ description: "Verifies DogStatsD UDP listener autoscaling (SO_REUSEPORT) starts timeout: 120s runtimes: [docker, mac] +env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test-dsd-autoscale" + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_STANDALONE_MODE: "true" + DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" + DD_DOGSTATSD_NON_LOCAL_TRAFFIC: "true" + DD_DOGSTATSD_AUTOSCALE_UDP_LISTENERS: "true" + container: image: "saluki-images/datadog-agent:testing-devel" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test-dsd-autoscale" - DD_DATA_PLANE_ENABLED: "true" - DD_DATA_PLANE_STANDALONE_MODE: "true" - DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" - DD_DOGSTATSD_NON_LOCAL_TRAFFIC: "true" - DD_DOGSTATSD_AUTOSCALE_UDP_LISTENERS: "true" exposed_ports: - "58125/udp" diff --git a/test/integration/cases/dogstatsd-bind-custom-hostname/config.yaml b/test/integration/cases/dogstatsd-bind-custom-hostname/config.yaml index 578ab45b0b7..8be43832d17 100644 --- 
a/test/integration/cases/dogstatsd-bind-custom-hostname/config.yaml +++ b/test/integration/cases/dogstatsd-bind-custom-hostname/config.yaml @@ -29,19 +29,20 @@ timeout: 120s # once dynamic resolution grows a portable mechanism. runtimes: [docker] +env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test-dsd-bind-host-hostname" + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_STANDALONE_MODE: "true" + DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" + PANORAMIC_DYNAMIC_CONTAINER_IP: "hostname -i | awk '{print $1}'" + # Side-effect: add "foo.local -> eth0 IP" to /etc/hosts, then echo the + # hostname as the captured value (used below to template DD_BIND_HOST). + PANORAMIC_DYNAMIC_CUSTOM_HOSTNAME: "echo \"$(hostname -i | awk '{print $1}') foo.local\" >> /etc/hosts; echo foo.local" + DD_BIND_HOST: "{{PANORAMIC_DYNAMIC_CUSTOM_HOSTNAME}}" + container: image: "saluki-images/datadog-agent:testing-devel" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test-dsd-bind-host-hostname" - DD_DATA_PLANE_ENABLED: "true" - DD_DATA_PLANE_STANDALONE_MODE: "true" - DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" - PANORAMIC_DYNAMIC_CONTAINER_IP: "hostname -i | awk '{print $1}'" - # Side-effect: add "foo.local -> eth0 IP" to /etc/hosts, then echo the - # hostname as the captured value (used below to template DD_BIND_HOST). - PANORAMIC_DYNAMIC_CUSTOM_HOSTNAME: "echo \"$(hostname -i | awk '{print $1}') foo.local\" >> /etc/hosts; echo foo.local" - DD_BIND_HOST: "{{PANORAMIC_DYNAMIC_CUSTOM_HOSTNAME}}" assertions: - type: log_contains diff --git a/test/integration/cases/dogstatsd-bind-host/config.yaml b/test/integration/cases/dogstatsd-bind-host/config.yaml index af9d702ba3b..54389f4d56b 100644 --- a/test/integration/cases/dogstatsd-bind-host/config.yaml +++ b/test/integration/cases/dogstatsd-bind-host/config.yaml @@ -20,16 +20,17 @@ timeout: 120s # once dynamic resolution grows a portable mechanism. 
runtimes: [docker] +env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test-dsd-bind-host" + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_STANDALONE_MODE: "true" + DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" + PANORAMIC_DYNAMIC_CONTAINER_IP: "hostname -i | awk '{print $1}'" + DD_BIND_HOST: "{{PANORAMIC_DYNAMIC_CONTAINER_IP}}" + container: image: "saluki-images/datadog-agent:testing-devel" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test-dsd-bind-host" - DD_DATA_PLANE_ENABLED: "true" - DD_DATA_PLANE_STANDALONE_MODE: "true" - DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" - PANORAMIC_DYNAMIC_CONTAINER_IP: "hostname -i | awk '{print $1}'" - DD_BIND_HOST: "{{PANORAMIC_DYNAMIC_CONTAINER_IP}}" assertions: - type: log_contains diff --git a/test/integration/cases/dogstatsd-default-bind/config.yaml b/test/integration/cases/dogstatsd-default-bind/config.yaml index edc6d9ba53f..fcc25365936 100644 --- a/test/integration/cases/dogstatsd-default-bind/config.yaml +++ b/test/integration/cases/dogstatsd-default-bind/config.yaml @@ -14,14 +14,15 @@ description: "Verifies DogStatsD binds to 127.0.0.1 by default when bind_host is timeout: 120s runtimes: [docker, mac] +env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test-dsd-default-bind" + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_STANDALONE_MODE: "true" + DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" + container: image: "saluki-images/datadog-agent:testing-devel" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test-dsd-default-bind" - DD_DATA_PLANE_ENABLED: "true" - DD_DATA_PLANE_STANDALONE_MODE: "true" - DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" assertions: - type: log_contains diff --git a/test/integration/cases/dogstatsd-enabled/config.yaml b/test/integration/cases/dogstatsd-enabled/config.yaml index 1efea0014fd..eb3379820ea 100644 --- a/test/integration/cases/dogstatsd-enabled/config.yaml +++ b/test/integration/cases/dogstatsd-enabled/config.yaml @@ -4,15 +4,16 @@ description: 
"Verifies DogStatsD pipeline starts and listens on UDP port" timeout: 120s runtimes: [docker, mac] +env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test-dsd" + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_STANDALONE_MODE: "true" + DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" + DD_DOGSTATSD_NON_LOCAL_TRAFFIC: "true" + container: image: "saluki-images/datadog-agent:testing-devel" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test-dsd" - DD_DATA_PLANE_ENABLED: "true" - DD_DATA_PLANE_STANDALONE_MODE: "true" - DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" - DD_DOGSTATSD_NON_LOCAL_TRAFFIC: "true" exposed_ports: - "58125/udp" diff --git a/test/integration/cases/dogstatsd-forwarding/config.yaml b/test/integration/cases/dogstatsd-forwarding/config.yaml index 259d044412e..5bbd193527c 100644 --- a/test/integration/cases/dogstatsd-forwarding/config.yaml +++ b/test/integration/cases/dogstatsd-forwarding/config.yaml @@ -5,23 +5,24 @@ timeout: 90s # Not yet validated under mac; opt in by adding `mac` to this list once verified. 
runtimes: [docker] +env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test-dsd-forwarding" + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_STANDALONE_MODE: "true" + DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" + DD_DOGSTATSD_TCP_PORT: "9126" + DD_DOGSTATSD_SOCKET: "/tmp/dsd-forwarding.sock" + DD_DOGSTATSD_STREAM_SOCKET: "/tmp/dsd-forwarding-stream.sock" + DD_DOGSTATSD_NON_LOCAL_TRAFFIC: "true" + DD_STATSD_FORWARD_HOST: "127.0.0.1" + DD_STATSD_FORWARD_PORT: "9125" + container: image: "saluki-images/datadog-agent:testing-devel" entrypoint: - "/opt/datadog-agent/embedded/bin/python3" - "/forwarding-test/run_forwarding_test.py" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test-dsd-forwarding" - DD_DATA_PLANE_ENABLED: "true" - DD_DATA_PLANE_STANDALONE_MODE: "true" - DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" - DD_DOGSTATSD_TCP_PORT: "9126" - DD_DOGSTATSD_SOCKET: "/tmp/dsd-forwarding.sock" - DD_DOGSTATSD_STREAM_SOCKET: "/tmp/dsd-forwarding-stream.sock" - DD_DOGSTATSD_NON_LOCAL_TRAFFIC: "true" - DD_STATSD_FORWARD_HOST: "127.0.0.1" - DD_STATSD_FORWARD_PORT: "9125" files: - "run_forwarding_test.py:/forwarding-test/run_forwarding_test.py" exposed_ports: diff --git a/test/integration/cases/dogstatsd-forwarding/run_forwarding_test.py b/test/integration/cases/dogstatsd-forwarding/run_forwarding_test.py index da6121ae4dc..bee91d3b34d 100644 --- a/test/integration/cases/dogstatsd-forwarding/run_forwarding_test.py +++ b/test/integration/cases/dogstatsd-forwarding/run_forwarding_test.py @@ -30,8 +30,8 @@ DOGSTATSD_UDS_PATH = "/tmp/dsd-forwarding.sock" DOGSTATSD_UDS_STREAM_PATH = "/tmp/dsd-forwarding-stream.sock" TELEMETRY_URLS = ( - "http://127.0.0.1:5100/metrics", - "http://127.0.0.1:5100/compat/metrics", + "http://127.0.0.1:55100/metrics", + "http://127.0.0.1:55100/compat/metrics", ) PROBE_TIMEOUT_SECS = 60 PROBE_INTERVAL_SECS = 0.25 diff --git a/test/integration/cases/dogstatsd-non-local-overrides-bind-host/config.yaml 
b/test/integration/cases/dogstatsd-non-local-overrides-bind-host/config.yaml index 047ddf94e7f..9c5db0b8ff7 100644 --- a/test/integration/cases/dogstatsd-non-local-overrides-bind-host/config.yaml +++ b/test/integration/cases/dogstatsd-non-local-overrides-bind-host/config.yaml @@ -16,19 +16,20 @@ description: "Verifies dogstatsd_non_local_traffic takes precedence over bind_ho timeout: 120s runtimes: [docker, mac] +env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test-dsd-non-local-override" + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_STANDALONE_MODE: "true" + DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" + DD_DOGSTATSD_NON_LOCAL_TRAFFIC: "true" + # Arbitrary, unreachable IP: non_local_traffic=true wins over bind_host, + # so the value is ignored. Using something obviously-not-a-real-binding + # address makes the "the value doesn't matter" intent unmistakable. + DD_BIND_HOST: "10.9.8.7" + container: image: "saluki-images/datadog-agent:testing-devel" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test-dsd-non-local-override" - DD_DATA_PLANE_ENABLED: "true" - DD_DATA_PLANE_STANDALONE_MODE: "true" - DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" - DD_DOGSTATSD_NON_LOCAL_TRAFFIC: "true" - # Arbitrary, unreachable IP: non_local_traffic=true wins over bind_host, - # so the value is ignored. Using something obviously-not-a-real-binding - # address makes the "the value doesn't matter" intent unmistakable. 
- DD_BIND_HOST: "10.9.8.7" assertions: - type: log_contains diff --git a/test/integration/cases/otlp-traces-enabled/config.yaml b/test/integration/cases/otlp-traces-enabled/config.yaml index 8a482db57c8..282c18b7d0a 100644 --- a/test/integration/cases/otlp-traces-enabled/config.yaml +++ b/test/integration/cases/otlp-traces-enabled/config.yaml @@ -4,19 +4,20 @@ description: "Verifies OTLP pipeline starts with native trace handling and proxy timeout: 120s runtimes: [docker, mac] +env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test-dsd" + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_STANDALONE_MODE: "true" + DD_DATA_PLANE_OTLP_ENABLED: "true" + DD_DATA_PLANE_OTLP_PROXY_ENABLED: "true" + DD_DATA_PLANE_OTLP_PROXY_TRACES_ENABLED: "false" + container: image: "saluki-images/datadog-agent:testing-devel" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test-dsd" - DD_DATA_PLANE_ENABLED: "true" - DD_DATA_PLANE_STANDALONE_MODE: "true" - DD_DATA_PLANE_OTLP_ENABLED: "true" - DD_DATA_PLANE_OTLP_PROXY_ENABLED: "true" - DD_DATA_PLANE_OTLP_PROXY_TRACES_ENABLED: "false" exposed_ports: - - "4317/tcp" - - "4318/tcp" + - "54317/tcp" + - "54318/tcp" assertions: # Make sure the process becomes healthy, and stays up without errors, listening for OTLP (HTTP and gRPC), @@ -25,11 +26,11 @@ assertions: - type: process_stable_for duration: 10s - type: port_listening - port: 4317 + port: 54317 protocol: tcp timeout: 10s - type: port_listening - port: 4318 + port: 54318 protocol: tcp timeout: 10s - type: log_not_contains diff --git a/test/integration/cases/privileged-api-endpoints/config.yaml b/test/integration/cases/privileged-api-endpoints/config.yaml index c72e146f419..c5b321cf074 100644 --- a/test/integration/cases/privileged-api-endpoints/config.yaml +++ b/test/integration/cases/privileged-api-endpoints/config.yaml @@ -4,23 +4,24 @@ description: "Verifies the logging and metrics override routes are exposed on th timeout: 120s runtimes: [docker, mac] +env: + 
DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test-privileged-api" + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_STANDALONE_MODE: "true" + DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" + container: image: "saluki-images/datadog-agent:testing-devel" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test-privileged-api" - DD_DATA_PLANE_ENABLED: "true" - DD_DATA_PLANE_STANDALONE_MODE: "true" - DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" exposed_ports: - - "5101/tcp" + - "55101/tcp" assertions: - parallel: - type: process_stable_for duration: 10s - type: port_listening - port: 5101 + port: 55101 protocol: tcp timeout: 20s # Each of the four routes below is registered as POST-only by the corresponding override @@ -28,31 +29,31 @@ assertions: # if the worker failed to assert its DynamicRoute, the request would return 404 instead. # Asserting "status != 404" lets us verify route registration without exercising the routes. - type: http_check - endpoint: "https://localhost:5101/logging/override" + endpoint: "https://localhost:55101/logging/override" status: not_equal: 404 insecure_skip_verify: true timeout: 20s - type: http_check - endpoint: "https://localhost:5101/logging/reset" + endpoint: "https://localhost:55101/logging/reset" status: not_equal: 404 insecure_skip_verify: true timeout: 20s - type: http_check - endpoint: "https://localhost:5101/metrics/override" + endpoint: "https://localhost:55101/metrics/override" status: not_equal: 404 insecure_skip_verify: true timeout: 20s - type: http_check - endpoint: "https://localhost:5101/metrics/reset" + endpoint: "https://localhost:55101/metrics/reset" status: not_equal: 404 insecure_skip_verify: true timeout: 20s - type: http_check - endpoint: "https://localhost:5101/config" + endpoint: "https://localhost:55101/config" status: not_equal: 404 insecure_skip_verify: true diff --git a/test/integration/cases/telemetry-endpoint/config.yaml b/test/integration/cases/telemetry-endpoint/config.yaml index 
22f5b38c588..9355297bbb4 100644 --- a/test/integration/cases/telemetry-endpoint/config.yaml +++ b/test/integration/cases/telemetry-endpoint/config.yaml @@ -4,16 +4,17 @@ description: "Verifies the internal telemetry routes are exposed on the unprivil timeout: 120s runtimes: [docker, mac] +env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test-telemetry" + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_STANDALONE_MODE: "true" + DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" + container: image: "saluki-images/datadog-agent:testing-devel" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test-telemetry" - DD_DATA_PLANE_ENABLED: "true" - DD_DATA_PLANE_STANDALONE_MODE: "true" - DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" exposed_ports: - - "5100/tcp" + - "55100/tcp" assertions: # Make sure the process becomes healthy, and stays up without errors, with the telemetry routes @@ -22,16 +23,16 @@ assertions: - type: process_stable_for duration: 10s - type: port_listening - port: 5100 + port: 55100 protocol: tcp timeout: 10s - type: http_check - endpoint: "http://localhost:5100/metrics" + endpoint: "http://localhost:55100/metrics" status: equal: 200 timeout: 10s - type: http_check - endpoint: "http://localhost:5100/compat/metrics" + endpoint: "http://localhost:55100/compat/metrics" status: equal: 200 timeout: 10s diff --git a/test/integration/cases/unprivileged-api-endpoints/config.yaml b/test/integration/cases/unprivileged-api-endpoints/config.yaml index d5014bc3edb..584bd51cd54 100644 --- a/test/integration/cases/unprivileged-api-endpoints/config.yaml +++ b/test/integration/cases/unprivileged-api-endpoints/config.yaml @@ -4,33 +4,34 @@ description: "Verifies the /ready, /live, and /memory/status endpoints are acces timeout: 120s runtimes: [docker, mac] +env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test-health" + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_STANDALONE_MODE: "true" + DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" + container: image: 
"saluki-images/datadog-agent:testing-devel" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test-health" - DD_DATA_PLANE_ENABLED: "true" - DD_DATA_PLANE_STANDALONE_MODE: "true" - DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" exposed_ports: - - "5100/tcp" + - "55100/tcp" assertions: - parallel: - type: process_stable_for duration: 10s - type: http_check - endpoint: "http://localhost:5100/ready" + endpoint: "http://localhost:55100/ready" status: equal: 200 timeout: 10s - type: http_check - endpoint: "http://localhost:5100/live" + endpoint: "http://localhost:55100/live" status: equal: 200 timeout: 10s - type: http_check - endpoint: "http://localhost:5100/memory/status" + endpoint: "http://localhost:55100/memory/status" status: equal: 200 timeout: 10s From 56fa0e7599f690071d09ac555c0f6fb9824f9008 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Thu, 28 May 2026 15:30:04 -0400 Subject: [PATCH 44/56] chore(docs): fix em-dash spacing in env field docstring vale's Google.EmDash rule rejects 'space - dash - space'. Replace with a colon, same meaning. --- bin/correctness/panoramic/src/config.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/correctness/panoramic/src/config.rs b/bin/correctness/panoramic/src/config.rs index d602aeaa9f0..93d5577cd2e 100644 --- a/bin/correctness/panoramic/src/config.rs +++ b/bin/correctness/panoramic/src/config.rs @@ -114,7 +114,7 @@ pub struct IntegrationConfig { /// Environment variables to set on the target process(es). /// /// Top-level (not under `container`) because both the docker and `mac` runtimes apply - /// these the same way — docker injects them as container env, the Unix runner passes them + /// these the same way: docker injects them as container env, the Unix runner passes them /// to the spawned ADP / Core Agent processes. 
#[serde(default)] pub env: HashMap, From c780fd096526e97eb37a6ec326f5f0e911a68a8a Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Thu, 28 May 2026 15:33:05 -0400 Subject: [PATCH 45/56] build: run check-docs in the pre-commit hook The setup-hooks Make target points core.hooksPath at .githooks/, but .githooks/pre-commit was only invoking the subset of checks that fail hard at build time (fmt/clippy/licenses/deny/api-docs) and not vale's docs check. Vale errors only surfaced in CI, so style violations like em-dashes or 'e.g.' kept landing on PRs and bouncing the check-docs job on every push. Add 'make check-docs' to the hook so commits get the same vale gate locally. Hook order is unchanged otherwise; check-docs slots in just before generate-api-docs because the rustdoc generation is the slowest step. --- .githooks/pre-commit | 1 + 1 file changed, 1 insertion(+) diff --git a/.githooks/pre-commit b/.githooks/pre-commit index 20373534d85..2c604b8bd7b 100755 --- a/.githooks/pre-commit +++ b/.githooks/pre-commit @@ -7,6 +7,7 @@ make check-fmt make check-clippy make check-licenses make check-deny +make check-docs make generate-api-docs echo "[*] Pre-commit checks passed." From 9ab4324a9cc6a4bfd297168d0d75215442773fae Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Thu, 28 May 2026 15:48:09 -0400 Subject: [PATCH 46/56] refactor(panoramic): move port-isolation env helper out of unix_runner The helper is framework-wide — the docker runner reaches into unix_runner for it, which made unix_runner's surface artificially load-bearing for non-Unix tests too. Lift it into a dedicated `test_env` module so both runners pull from a neutral spot, and the module name leaves room for future framework-wide env defaults (test API keys, host names, log levels, etc.) without a rename. While moving, drop the redundant `test_` prefix from the function itself — it's now `test_env::port_isolation_env()`.
--- bin/correctness/panoramic/src/main.rs | 1 + bin/correctness/panoramic/src/runner.rs | 2 +- bin/correctness/panoramic/src/test_env.rs | 63 ++++++++++++++++++++ bin/correctness/panoramic/src/unix_runner.rs | 60 +------------------ 4 files changed, 67 insertions(+), 59 deletions(-) create mode 100644 bin/correctness/panoramic/src/test_env.rs diff --git a/bin/correctness/panoramic/src/main.rs b/bin/correctness/panoramic/src/main.rs index f44640cae45..717ca8a089f 100644 --- a/bin/correctness/panoramic/src/main.rs +++ b/bin/correctness/panoramic/src/main.rs @@ -34,6 +34,7 @@ use self::reporter::{OutputFormat, Reporter, TestResult, TestSuiteResult}; mod runner; mod test; +mod test_env; mod tui; mod unix_runner; mod utils; diff --git a/bin/correctness/panoramic/src/runner.rs b/bin/correctness/panoramic/src/runner.rs index 30c48a38fc8..ae83fdee244 100644 --- a/bin/correctness/panoramic/src/runner.rs +++ b/bin/correctness/panoramic/src/runner.rs @@ -697,7 +697,7 @@ impl IntegrationRunner { // defaults are applied first so the test's `env` block takes precedence. Keeps the test // surface consistent across the docker and `mac` runtimes — both see the same shifted // port table. - let mut merged_env = crate::unix_runner::test_port_isolation_env(); + let mut merged_env = crate::test_env::port_isolation_env(); for (k, v) in &self.test_case.env { merged_env.insert(k.clone(), v.clone()); } diff --git a/bin/correctness/panoramic/src/test_env.rs b/bin/correctness/panoramic/src/test_env.rs new file mode 100644 index 00000000000..358fc1fdb9a --- /dev/null +++ b/bin/correctness/panoramic/src/test_env.rs @@ -0,0 +1,63 @@ +//! Framework-level environment overrides applied to every integration test target. +//! +//! Today this is just port isolation, but the module is named generically so future +//! framework-wide env defaults (test-specific API keys, host names, log levels, etc.) have a +//! natural home here. 
+ +use std::collections::HashMap; + +/// Framework-level env overrides that move every default port the test target binds off its +/// canonical value, so concurrent test runs and any system Agent / system ADP on the host can +/// coexist with the per-test processes. Tests can override any of these via their `env` block; +/// tests that exercise specific port behavior (`adp-cmd-port`) supply their own values. +/// +/// Naming convention: every default port that's 4 digits gets a `5` prepended (8125 -> 58125, +/// 5001 -> 55001, etc.). The GUI is disabled outright since we don't exercise it. +/// +/// Note on env-var nesting: saluki-config (and figment) split env-var names on `__` to map to +/// nested config keys. Single-underscore env vars like `DD_DATA_PLANE_API_LISTEN_ADDRESS` map +/// to the flat key `data_plane_api_listen_address` and are silently ignored; we use double +/// underscores at every dot boundary for the deep ADP / OTLP keys below. The top-level Agent +/// env vars (`DD_CMD_PORT` etc.) are explicitly queried by the Agent so they don't need it. +pub fn port_isolation_env() -> HashMap { + HashMap::from([ + // ----- Core Agent ports ----- + // CMD/IPC API. Shared key between the Core Agent (listener) and ADP (IPC client). + // `adp-cmd-port` overrides this via its `env` block to validate the non-default path. + ("DD_CMD_PORT".to_string(), "55001".to_string()), + // GUI — disabled outright. No integration test exercises it. + ("DD_GUI_PORT".to_string(), "-1".to_string()), + // expvar / APM / process / secondary IPC — not assertion targets, but the Agent will + // still try to bind them on startup, so shift them out of the way. + ("DD_EXPVAR_PORT".to_string(), "55000".to_string()), + ("DD_APM_RECEIVER_PORT".to_string(), "58126".to_string()), + ("DD_PROCESS_CONFIG_CMD_PORT".to_string(), "56062".to_string()), + ("DD_AGENT_IPC_PORT".to_string(), "55004".to_string()), + // DogStatsD UDP. 
In converged tests the Core Agent's DSD is disabled by + // DD_DATA_PLANE_ENABLED so this mainly affects ADP (the actual listener) and the + // bootstrap-mode Agent. + ("DD_DOGSTATSD_PORT".to_string(), "58125".to_string()), + // ----- ADP listen addresses ----- (URI-style; ListenAddress accepts `tcp://host:port`) + ( + "DD_DATA_PLANE__API_LISTEN_ADDRESS".to_string(), + "tcp://0.0.0.0:55100".to_string(), + ), + ( + "DD_DATA_PLANE__SECURE_API_LISTEN_ADDRESS".to_string(), + "tcp://0.0.0.0:55101".to_string(), + ), + ( + "DD_DATA_PLANE__TELEMETRY_LISTEN_ADDR".to_string(), + "tcp://0.0.0.0:55102".to_string(), + ), + // ----- OTLP receiver endpoints ----- (same shape as the Datadog Agent's OTLP env vars) + ( + "DD_OTLP_CONFIG__RECEIVER__PROTOCOLS__GRPC__ENDPOINT".to_string(), + "0.0.0.0:54317".to_string(), + ), + ( + "DD_OTLP_CONFIG__RECEIVER__PROTOCOLS__HTTP__ENDPOINT".to_string(), + "0.0.0.0:54318".to_string(), + ), + ]) +} diff --git a/bin/correctness/panoramic/src/unix_runner.rs b/bin/correctness/panoramic/src/unix_runner.rs index 88adc9af0cf..97b793192e4 100644 --- a/bin/correctness/panoramic/src/unix_runner.rs +++ b/bin/correctness/panoramic/src/unix_runner.rs @@ -57,66 +57,10 @@ const DEFAULT_CORE_AGENT_BINARY_PATH: &str = "/tmp/saluki-dda/datadog-agent/bin/ const CORE_AGENT_IPC_READY_TIMEOUT: Duration = Duration::from_secs(60); const CORE_AGENT_IPC_READY_POLL: Duration = Duration::from_millis(200); -/// Framework-level env overrides that move every default port the test target binds off its -/// canonical value, so concurrent test runs and any system Agent / system ADP on the host can -/// coexist with the per-test processes. Tests can override any of these via their `env` block; -/// tests that exercise specific port behavior (`adp-cmd-port`) supply their own values. -/// -/// Naming convention: every default port that's 4 digits gets a `5` prepended (8125 -> 58125, -/// 5001 -> 55001, etc.). The GUI is disabled outright since we don't exercise it. 
-/// -/// Note on env-var nesting: saluki-config (and figment) split env-var names on `__` to map to -/// nested config keys. Single-underscore env vars like `DD_DATA_PLANE_API_LISTEN_ADDRESS` map -/// to the flat key `data_plane_api_listen_address` and are silently ignored; we use double -/// underscores at every dot boundary for the deep ADP / OTLP keys below. The top-level Agent -/// env vars (`DD_CMD_PORT` etc.) are explicitly queried by the Agent so they don't need it. -pub fn test_port_isolation_env() -> HashMap { - HashMap::from([ - // ----- Core Agent ports ----- - // CMD/IPC API. Shared key between the Core Agent (listener) and ADP (IPC client). - // `adp-cmd-port` overrides this via its `env` block to validate the non-default path. - ("DD_CMD_PORT".to_string(), "55001".to_string()), - // GUI — disabled outright. No integration test exercises it. - ("DD_GUI_PORT".to_string(), "-1".to_string()), - // expvar / APM / process / secondary IPC — not assertion targets, but the Agent will - // still try to bind them on startup, so shift them out of the way. - ("DD_EXPVAR_PORT".to_string(), "55000".to_string()), - ("DD_APM_RECEIVER_PORT".to_string(), "58126".to_string()), - ("DD_PROCESS_CONFIG_CMD_PORT".to_string(), "56062".to_string()), - ("DD_AGENT_IPC_PORT".to_string(), "55004".to_string()), - // DogStatsD UDP. In converged tests the Core Agent's DSD is disabled by - // DD_DATA_PLANE_ENABLED so this mainly affects ADP (the actual listener) and the - // bootstrap-mode Agent. 
- ("DD_DOGSTATSD_PORT".to_string(), "58125".to_string()), - // ----- ADP listen addresses ----- (URI-style; ListenAddress accepts `tcp://host:port`) - ( - "DD_DATA_PLANE__API_LISTEN_ADDRESS".to_string(), - "tcp://0.0.0.0:55100".to_string(), - ), - ( - "DD_DATA_PLANE__SECURE_API_LISTEN_ADDRESS".to_string(), - "tcp://0.0.0.0:55101".to_string(), - ), - ( - "DD_DATA_PLANE__TELEMETRY_LISTEN_ADDR".to_string(), - "tcp://0.0.0.0:55102".to_string(), - ), - // ----- OTLP receiver endpoints ----- (same shape as the Datadog Agent's OTLP env vars) - ( - "DD_OTLP_CONFIG__RECEIVER__PROTOCOLS__GRPC__ENDPOINT".to_string(), - "0.0.0.0:54317".to_string(), - ), - ( - "DD_OTLP_CONFIG__RECEIVER__PROTOCOLS__HTTP__ENDPOINT".to_string(), - "0.0.0.0:54318".to_string(), - ), - ]) -} - /// Builds the env for a target process (Core Agent or ADP) under the Unix runner. /// /// Precedence (lowest to highest): -/// 1. framework port-isolation defaults (`test_port_isolation_env`) +/// 1. framework port-isolation defaults (`crate::test_env::port_isolation_env`) /// 2. the test's top-level `env` block /// 3. forced overrides supplied by the caller (auth token path, run path, …) /// @@ -124,7 +68,7 @@ pub fn test_port_isolation_env() -> HashMap { /// because they're path-bindings tests must not be able to override (they identify per-test /// state directories that the runner owns). fn build_process_env(test_env: &HashMap, forced: &[(&str, String)]) -> HashMap { - let mut env = test_port_isolation_env(); + let mut env = crate::test_env::port_isolation_env(); for (k, v) in test_env { env.insert(k.clone(), v.clone()); } From f213245e0a59eaa5f23c5335f63f5e0160f4d7fe Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Thu, 28 May 2026 16:23:48 -0400 Subject: [PATCH 47/56] build: seed empty datadog.yaml before bootstrap Agent runs The Datadog Agent at startup searches for a 'datadog' config file in [/etc /opt/datadog-agent/etc] and aborts ('Config File Not Found') if neither has one. 
On the previous bare-metal CI runners the fallback worked because /opt/datadog-agent was pre-installed with a real datadog.yaml. On the new arm64 Tart runners (fresh VM, no system install), neither path is populated and the bootstrap step never even reaches IPC cert generation. The pkg payload we extract only includes datadog.yaml.example, not datadog.yaml proper. Drop a zero-byte datadog.yaml into the sandbox etc/ before launching the bootstrap Agent. Tests communicate config via env vars; the file's content doesn't matter, only its existence. --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index f89c57e846e..832368a31bc 100644 --- a/Makefile +++ b/Makefile @@ -638,6 +638,7 @@ provision-macos-test-env: ## Installs the pinned Datadog Agent ($(MACOS_TEST_AGE @if [ ! -f $(MACOS_TEST_AGENT_INSTALL_DIR)/etc/ipc_cert.pem ] || [ ! -f $(MACOS_TEST_AGENT_INSTALL_DIR)/etc/auth_token ]; then \ echo "[*] Bootstrapping IPC cert + auth_token by running the Agent briefly..."; \ mkdir -p $(MACOS_TEST_AGENT_INSTALL_DIR)/etc $(MACOS_TEST_AGENT_INSTALL_DIR)/run; \ + touch $(MACOS_TEST_AGENT_INSTALL_DIR)/etc/datadog.yaml; \ DD_API_KEY=bootstrap DD_HOSTNAME=bootstrap \ DD_RUN_PATH=$(MACOS_TEST_AGENT_INSTALL_DIR)/run \ DD_AUTH_TOKEN_FILE_PATH=$(MACOS_TEST_AGENT_INSTALL_DIR)/etc/auth_token \ From 964d00b41d946bef1bc65383688bed894a34d9af Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Fri, 29 May 2026 09:48:55 -0400 Subject: [PATCH 48/56] fix(panoramic): make PanoramicLogSink::push_line synchronous The 'simplification' in fe5e9a4221 made push_line spawn a tokio task per log line to do the actual buffer append. On the slower arm64 Tart VM those spawned tasks queue up faster than the runtime drains them; by the time the assertion polls or write_logs reads the buffer, only the first couple of lines have actually landed. 
amd64 (faster CPU / more cores) drains enough tasks for most assertions to still pass, but arm64 captures literally 2 lines per failing test — the rest are still stuck in the spawn queue. Switch LogBuffer's lock from tokio::sync::RwLock to std::sync::RwLock so push_line can take the lock and append synchronously from the pump's sync trait method. No spawn-per-line, no fire-and-forget, no ordering race. All log_buffer.read()/.write() call sites updated to sync. None of the existing critical sections held the lock across an .await, so the swap is mechanical. The one snag was adp_exits.rs's read guard being held in scope past the next sleep().await; tightened that with a block so the guard drops before the await. --- .../panoramic/src/assertions/adp_exits.rs | 8 ++++-- .../panoramic/src/assertions/log_contains.rs | 4 +-- .../panoramic/src/assertions/mod.rs | 2 +- bin/correctness/panoramic/src/runner.rs | 7 +++-- bin/correctness/panoramic/src/unix_runner.rs | 28 +++++++++---------- 5 files changed, 25 insertions(+), 24 deletions(-) diff --git a/bin/correctness/panoramic/src/assertions/adp_exits.rs b/bin/correctness/panoramic/src/assertions/adp_exits.rs index eec0d1f6f12..5b2a7a50a4f 100644 --- a/bin/correctness/panoramic/src/assertions/adp_exits.rs +++ b/bin/correctness/panoramic/src/assertions/adp_exits.rs @@ -135,8 +135,11 @@ impl AdpExitsWithAssertion { duration: started.elapsed(), }; } - let buf = ctx.log_buffer.read().await; - if buf.contains_match(&pattern, false, &LogStream::Both) { + let matched = { + let buf = ctx.log_buffer.read().unwrap(); + buf.contains_match(&pattern, false, &LogStream::Both) + }; + if matched { return AssertionResult { name: self.name().to_string(), passed: true, @@ -144,7 +147,6 @@ impl AdpExitsWithAssertion { duration: started.elapsed(), }; } - drop(buf); tokio::time::sleep(Duration::from_millis(200)).await; } } diff --git a/bin/correctness/panoramic/src/assertions/log_contains.rs
b/bin/correctness/panoramic/src/assertions/log_contains.rs index a5582173fee..c38e42c9db3 100644 --- a/bin/correctness/panoramic/src/assertions/log_contains.rs +++ b/bin/correctness/panoramic/src/assertions/log_contains.rs @@ -78,7 +78,7 @@ impl Assertion for LogContainsAssertion { // Check the log buffer. { - let buffer = ctx.log_buffer.read().await; + let buffer = ctx.log_buffer.read().unwrap(); if buffer.contains_match(&self.pattern, self.is_regex, &self.stream) { return AssertionResult { name: self.name().to_string(), @@ -187,7 +187,7 @@ impl Assertion for LogNotContainsAssertion { // Check the log buffer for the unwanted pattern. { - let buffer = ctx.log_buffer.read().await; + let buffer = ctx.log_buffer.read().unwrap(); if let Some(matching_line) = buffer.find_match(&self.pattern, self.is_regex, &self.stream) { // Truncate the matching line for display. let display_line = if matching_line.len() > 100 { diff --git a/bin/correctness/panoramic/src/assertions/mod.rs b/bin/correctness/panoramic/src/assertions/mod.rs index 0aa8d429dd9..f5061247643 100644 --- a/bin/correctness/panoramic/src/assertions/mod.rs +++ b/bin/correctness/panoramic/src/assertions/mod.rs @@ -1,8 +1,8 @@ +use std::sync::RwLock; use std::{sync::Arc, time::Duration}; use futures::future; use saluki_error::GenericError; -use tokio::sync::RwLock; use tokio_util::sync::CancellationToken; use tracing::{debug, error}; diff --git a/bin/correctness/panoramic/src/runner.rs b/bin/correctness/panoramic/src/runner.rs index ae83fdee244..e1f1c97becc 100644 --- a/bin/correctness/panoramic/src/runner.rs +++ b/bin/correctness/panoramic/src/runner.rs @@ -4,6 +4,7 @@ //! is used regardless of output mode (TUI or plain). Events are emitted to a channel //! and consumed by either a TUI renderer or logging consumer. 
+use std::sync::RwLock; use std::{ collections::HashMap, io::Write as _, @@ -16,7 +17,7 @@ use airlock::driver::{Driver, DriverConfig, DriverDetails}; use bollard::container::LogOutput; use futures::stream::{self, StreamExt as _}; use saluki_error::{generic_error, ErrorContext as _, GenericError}; -use tokio::sync::{mpsc, RwLock, Semaphore}; +use tokio::sync::{mpsc, Semaphore}; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, warn}; @@ -779,7 +780,7 @@ impl IntegrationRunner { while let Some(log_result) = log_stream.next().await { match log_result { Ok(log) => { - let mut buffer = log_buffer.write().await; + let mut buffer = log_buffer.write().unwrap(); match log { LogOutput::StdOut { message } => { if let Ok(line) = String::from_utf8(message.to_vec()) { @@ -843,7 +844,7 @@ impl IntegrationRunner { let log_dir = self.tctx.log_dir(); // Get the log buffer contents. - let buffer = self.log_buffer.read().await; + let buffer = self.log_buffer.read().unwrap(); // Write stdout. let stdout_path = log_dir.join("stdout.log"); diff --git a/bin/correctness/panoramic/src/unix_runner.rs b/bin/correctness/panoramic/src/unix_runner.rs index 97b793192e4..9b2798bff59 100644 --- a/bin/correctness/panoramic/src/unix_runner.rs +++ b/bin/correctness/panoramic/src/unix_runner.rs @@ -25,6 +25,7 @@ //! `make provision-macos-test-env`). Set the env var explicitly to point at a different //! install (for example, a system-wide `/opt/datadog-agent` on a developer host). 
+use std::sync::RwLock; use std::{ collections::HashMap, path::PathBuf, @@ -35,7 +36,7 @@ use std::{ use airlock::unix::{LogSink, UnixProcess, UnixProcessConfig}; use rand::distr::SampleString as _; use saluki_error::{ErrorContext as _, GenericError}; -use tokio::sync::{Mutex, RwLock}; +use tokio::sync::Mutex; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info}; @@ -329,7 +330,7 @@ impl UnixIntegrationRunner { use std::io::Write as _; let log_dir = self.tctx.log_dir(); - let buffer = self.log_buffer.read().await; + let buffer = self.log_buffer.read().unwrap(); let stdout_path = log_dir.join("stdout.log"); let mut stdout_file = std::fs::File::create(&stdout_path) @@ -443,18 +444,15 @@ struct PanoramicLogSink { impl LogSink for PanoramicLogSink { fn push_line(&mut self, line: String, is_stderr: bool) { - // The log pump (in airlock::unix) holds the LogSink's outer mutex while calling us, - // so writes from a single pump are already serialized. Spawn a small task to actually - // append to the buffer so we never block the pump on `.write().await` ordering with - // concurrent assertion readers. - let buf = self.buf.clone(); - tokio::spawn(async move { - let mut buf = buf.write().await; - if is_stderr { - buf.stderr.push(line); - } else { - buf.stdout.push(line); - } - }); + // Synchronous write into the shared LogBuffer. The lock is briefly contended with + // assertion readers; the critical section is just a Vec::push, so it's cheap. Doing + // this synchronously (rather than spawning a tokio task per line) guarantees the + // buffer is up-to-date by the time the assertion polls. 
+ let mut buf = self.buf.write().unwrap(); + if is_stderr { + buf.stderr.push(line); + } else { + buf.stdout.push(line); + } } } From 13314a0e00f14f86a176d0b5336f3516ce4205f7 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Fri, 29 May 2026 10:24:31 -0400 Subject: [PATCH 49/56] ci: collect host-level diagnostics into the integration-logs artifact When a macOS integration test fails outside panoramic's per-test log capture (bootstrap-Agent issues, stranded processes from this run, weird mount or filesystem state), nothing useful lands in the artifact today — panoramic only knows about its per-test stdout/stderr, and /tmp/saluki-agent-bootstrap.log lives outside the artifact path. Add an after_script that runs whether the script passed or failed (GitLab CI semantics) and writes a small bundle of host state into integration-logs/host-diag/: sw_vers, uname, mount, df, ps, the bootstrap log, sandbox etc/ listing The existing artifacts.paths declares integration-logs/ with when: always, so anything dropped under that dir is uploaded automatically — no artifacts-config change needed. Cheap to collect, can be the difference between 'I have no idea why this failed in CI' and 'oh, that process is still running'. Comment refresh: the helper formerly known as test_port_isolation_env moved to test_env::port_isolation_env earlier in this branch; the pkill block still references its old name. --- .gitlab/e2e.yml | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/.gitlab/e2e.yml b/.gitlab/e2e.yml index a3572903f9f..076776cf7c8 100644 --- a/.gitlab/e2e.yml +++ b/.gitlab/e2e.yml @@ -139,7 +139,7 @@ test-integration: before_script: # Defensive: clean up any leftover Agent/ADP processes from prior runs on this shared # runner.
All test-Agent ports are shifted out of the canonical range (see - # panoramic::unix_runner::test_port_isolation_env), so we don't need to touch a system + # panoramic::test_env::port_isolation_env), so we don't need to touch a system # install at /opt/datadog-agent. We do need to sweep: # - our own Core Agent sandbox under /tmp/saluki-dda (trace-agent / process-agent # children that survived a non-graceful job termination still hold our shifted ports) @@ -149,6 +149,19 @@ test-integration: - sudo pkill -9 -f /target/release/agent-data-plane || true script: - make test-integration-macos-ci + after_script: + # Collect host-level diagnostics into the artifact so we have something to debug from + # when something fails outside panoramic's per-test log capture (bootstrap-Agent failures, + # system state, stranded processes from this run). Runs whether the test step passed or + # failed. + - mkdir -p integration-logs/host-diag + - sw_vers > integration-logs/host-diag/sw_vers.txt 2>&1 || true + - uname -a > integration-logs/host-diag/uname.txt 2>&1 || true + - mount > integration-logs/host-diag/mount.txt 2>&1 || true + - df -h > integration-logs/host-diag/df.txt 2>&1 || true + - ps -axo pid,ppid,user,command > integration-logs/host-diag/ps.txt 2>&1 || true + - cp /tmp/saluki-agent-bootstrap.log integration-logs/host-diag/saluki-agent-bootstrap.log 2>/dev/null || true + - ls -la /tmp/saluki-dda/datadog-agent/etc/ > integration-logs/host-diag/sandbox-etc.txt 2>&1 || true test-integration-macos-arm64: extends: From 591af05f8588a188d533c3559966dd0c4d141587 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Fri, 29 May 2026 11:13:28 -0400 Subject: [PATCH 50/56] fix(panoramic): seed IPC credentials for standalone mac tests Standalone macOS integration tests run ADP without a Core Agent, so no Agent process writes the IPC certificate ADP needs for its privileged API. 
ADP fell back to the platform default path under /opt/datadog-agent and waited up to 20 seconds for that file, causing 15-second log assertions to time out after only the startup/standalone-mode lines. Seed a per-test auth token and self-signed IPC certificate for standalone Unix-runner tests, and force ADP to use those paths. Converged tests keep using the Core Agent-generated credentials in the same per-test state dir. Verified with: - cargo test -p panoramic unix_runner::tests::standalone_adp_env_points_ipc_credentials_at_test_state_dir - cargo check -p panoramic - PANORAMIC_LOG_DIR=/tmp/panoramic-standalone-fix-all make test-integration-macos-run CASE=adp-memory-mode-disabled,adp-memory-mode-permissive-exceeds-limit,adp-memory-mode-permissive-within-limit,adp-memory-mode-strict-within-limit,dogstatsd-autoscale-udp,dogstatsd-default-bind,dogstatsd-non-local-overrides-bind-host,otlp-traces-enabled,privileged-api-endpoints,telemetry-endpoint,unprivileged-api-endpoints --- Cargo.lock | 1 + bin/correctness/panoramic/Cargo.toml | 1 + bin/correctness/panoramic/src/unix_runner.rs | 74 +++++++++++++++++--- 3 files changed, 66 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ec91ddbc2ce..67704e91eb5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2976,6 +2976,7 @@ dependencies = [ "kube", "rand 0.10.1", "rand_distr", + "rcgen", "regex", "reqwest", "rustls", diff --git a/bin/correctness/panoramic/Cargo.toml b/bin/correctness/panoramic/Cargo.toml index 4edd9b13443..a160d9c4cae 100644 --- a/bin/correctness/panoramic/Cargo.toml +++ b/bin/correctness/panoramic/Cargo.toml @@ -22,6 +22,7 @@ k8s-openapi = { workspace = true } kube = { workspace = true, features = ["client", "http-proxy", "rustls-tls", "ws"] } rand = { workspace = true, features = ["std", "std_rng", "thread_rng"] } rand_distr = { workspace = true } +rcgen = { workspace = true, features = ["crypto", "aws_lc_rs", "pem"] } regex = { workspace = true, features = ["std"] } reqwest = { workspace = 
true, features = ["json", "zstd", "rustls", "query"] } rustls = { workspace = true } diff --git a/bin/correctness/panoramic/src/unix_runner.rs b/bin/correctness/panoramic/src/unix_runner.rs index 9b2798bff59..d15003b1dfe 100644 --- a/bin/correctness/panoramic/src/unix_runner.rs +++ b/bin/correctness/panoramic/src/unix_runner.rs @@ -28,13 +28,14 @@ use std::sync::RwLock; use std::{ collections::HashMap, - path::PathBuf, + path::{Path, PathBuf}, sync::Arc, time::{Duration, Instant}, }; use airlock::unix::{LogSink, UnixProcess, UnixProcessConfig}; -use rand::distr::SampleString as _; +use rand::distr::{Alphanumeric, SampleString as _}; +use rcgen::{generate_simple_self_signed, CertifiedKey}; use saluki_error::{ErrorContext as _, GenericError}; use tokio::sync::Mutex; use tokio_util::sync::CancellationToken; @@ -229,13 +230,15 @@ impl UnixIntegrationRunner { // Phase: spawn ADP. let spawn_start = Instant::now(); let config_path_str = config_path.to_string_lossy().into_owned(); - let adp_forced: Vec<(&str, String)> = if self.test_case.requires_core_agent { - // Point ADP's IPC client at the per-test auth token (and by derivation, the - // per-test ipc_cert.pem in the same directory). 
- vec![("DD_AUTH_TOKEN_FILE_PATH", auth_token_path)] - } else { - Vec::new() - }; + if !self.test_case.requires_core_agent { + if let Err(e) = seed_standalone_ipc_credentials(&state_dir, &auth_token_path) { + if let Some(agent) = core_agent.take() { + agent.cleanup().await; + } + return make_error_result(test_name, started, "prepare_standalone_ipc", e, phase_timings); + } + } + let adp_forced = build_adp_forced_env(self.test_case.requires_core_agent, &state_dir, auth_token_path); let adp_env = build_process_env(&self.test_case.env, &adp_forced); let process_config = UnixProcessConfig::new(self.test_case.name.clone(), binary_path) .with_args(vec!["-c".to_string(), config_path_str, "run".to_string()]) @@ -395,7 +398,37 @@ fn resolve_core_agent_binary_path() -> Result { }) } -async fn wait_for_agent_ipc_ready(state_dir: &std::path::Path, timeout: Duration) -> Result<(), GenericError> { +fn build_adp_forced_env( + requires_core_agent: bool, state_dir: &Path, auth_token_path: String, +) -> Vec<(&'static str, String)> { + if requires_core_agent { + vec![("DD_AUTH_TOKEN_FILE_PATH", auth_token_path)] + } else { + vec![ + ("DD_AUTH_TOKEN_FILE_PATH", auth_token_path), + ( + "DD_IPC_CERT_FILE_PATH", + state_dir.join("ipc_cert.pem").to_string_lossy().into_owned(), + ), + ] + } +} + +fn seed_standalone_ipc_credentials(state_dir: &Path, auth_token_path: &str) -> Result<(), GenericError> { + let auth_token = Alphanumeric.sample_string(&mut rand::rng(), 32); + std::fs::write(auth_token_path, auth_token) + .with_error_context(|| format!("Failed to write auth token at '{}'.", auth_token_path))?; + + let CertifiedKey { cert, signing_key } = generate_simple_self_signed(["localhost".to_string()]) + .error_context("Failed to generate self-signed IPC certificate.")?; + let cert_path = state_dir.join("ipc_cert.pem"); + std::fs::write(&cert_path, format!("{}{}", cert.pem(), signing_key.serialize_pem())) + .with_error_context(|| format!("Failed to write IPC certificate at '{}'.", 
cert_path.display()))?; + + Ok(()) +} + +async fn wait_for_agent_ipc_ready(state_dir: &Path, timeout: Duration) -> Result<(), GenericError> { let auth_token = state_dir.join("auth_token"); let ipc_cert = state_dir.join("ipc_cert.pem"); let deadline = Instant::now() + timeout; @@ -456,3 +489,24 @@ impl LogSink for PanoramicLogSink { } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn standalone_adp_env_points_ipc_credentials_at_test_state_dir() { + let state_dir = PathBuf::from("/tmp/panoramic-unix-test"); + let auth_token_path = state_dir.join("auth_token").to_string_lossy().into_owned(); + + let env: HashMap<_, _> = build_adp_forced_env(false, &state_dir, auth_token_path.clone()) + .into_iter() + .collect(); + + assert_eq!(env.get("DD_AUTH_TOKEN_FILE_PATH"), Some(&auth_token_path)); + assert_eq!( + env.get("DD_IPC_CERT_FILE_PATH"), + Some(&state_dir.join("ipc_cert.pem").to_string_lossy().into_owned()) + ); + } +} From 98ac52175ca13e8d92a01a4d62fb208ae79ddb3b Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Fri, 29 May 2026 11:45:42 -0400 Subject: [PATCH 51/56] fix(panoramic): keep mac ADP log files in test state dirs The docker logging tests assert the Linux container default path, /var/log/datadog/agent-data-plane.log. Host-process mac tests should not write to /var or /opt on the runner, and the default mac path is not the container path anyway. When a mac host-process test does not explicitly configure DD_DATA_PLANE_LOG_FILE, set it to the per-test state directory and rewrite assertions that target the platform/container default ADP log file to the same path. Tests with an explicit ADP log-file override keep using their configured path. 
Verified with: - cargo test -p panoramic unix_runner::tests - cargo check -p panoramic - PANORAMIC_LOG_DIR=/tmp/panoramic-converged-logfix2 make test-integration-macos-run CASE=adp-logging-default-path,adp-logging-ignores-core-agent-log-file --- bin/correctness/panoramic/src/unix_runner.rs | 56 ++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/bin/correctness/panoramic/src/unix_runner.rs b/bin/correctness/panoramic/src/unix_runner.rs index d15003b1dfe..792736232ba 100644 --- a/bin/correctness/panoramic/src/unix_runner.rs +++ b/bin/correctness/panoramic/src/unix_runner.rs @@ -58,6 +58,9 @@ const DEFAULT_CORE_AGENT_BINARY_PATH: &str = "/tmp/saluki-dda/datadog-agent/bin/ /// giving up and failing the test. const CORE_AGENT_IPC_READY_TIMEOUT: Duration = Duration::from_secs(60); const CORE_AGENT_IPC_READY_POLL: Duration = Duration::from_millis(200); +const DATA_PLANE_LOG_FILE_ENV_VAR: &str = "DD_DATA_PLANE_LOG_FILE"; +const DOCKER_DEFAULT_ADP_LOG_FILE: &str = "/var/log/datadog/agent-data-plane.log"; +const MACOS_DEFAULT_ADP_LOG_FILE: &str = "/opt/datadog-agent/logs/agent-data-plane.log"; /// Builds the env for a target process (Core Agent or ADP) under the Unix runner. /// @@ -230,6 +233,7 @@ impl UnixIntegrationRunner { // Phase: spawn ADP. 
let spawn_start = Instant::now(); let config_path_str = config_path.to_string_lossy().into_owned(); + prepare_host_process_adp_log_file(&mut self.test_case, &state_dir); if !self.test_case.requires_core_agent { if let Err(e) = seed_standalone_ipc_credentials(&state_dir, &auth_token_path) { if let Some(agent) = core_agent.take() { @@ -398,6 +402,39 @@ fn resolve_core_agent_binary_path() -> Result { }) } +fn prepare_host_process_adp_log_file(test_case: &mut IntegrationConfig, state_dir: &Path) { + let adp_log_file = test_case + .env + .entry(DATA_PLANE_LOG_FILE_ENV_VAR.to_string()) + .or_insert_with(|| state_dir.join("agent-data-plane.log").to_string_lossy().into_owned()) + .clone(); + + rewrite_default_adp_log_file_assertions(&mut test_case.assertions, &adp_log_file); +} + +fn rewrite_default_adp_log_file_assertions(assertions: &mut [crate::config::AssertionStep], adp_log_file: &str) { + for step in assertions { + match step { + crate::config::AssertionStep::Single(assertion) => { + rewrite_default_adp_log_file_assertion(assertion, adp_log_file); + } + crate::config::AssertionStep::Parallel { parallel } => { + for assertion in parallel { + rewrite_default_adp_log_file_assertion(assertion, adp_log_file); + } + } + } + } +} + +fn rewrite_default_adp_log_file_assertion(assertion: &mut crate::config::AssertionConfig, adp_log_file: &str) { + if let crate::config::AssertionConfig::FileContains { path, .. 
} = assertion { + if path == DOCKER_DEFAULT_ADP_LOG_FILE || path == MACOS_DEFAULT_ADP_LOG_FILE { + *path = adp_log_file.to_string(); + } + } +} + fn build_adp_forced_env( requires_core_agent: bool, state_dir: &Path, auth_token_path: String, ) -> Vec<(&'static str, String)> { @@ -494,6 +531,25 @@ impl LogSink for PanoramicLogSink { mod tests { use super::*; + #[test] + fn host_process_adp_log_file_defaults_to_test_state_dir_and_rewrites_assertions() { + let config_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("../../../test/integration/cases/adp-logging-default-path/config.yaml"); + let mut test_case = IntegrationConfig::from_yaml(config_path).expect("load test case"); + let state_dir = PathBuf::from("/tmp/panoramic-unix-test"); + let expected_log_file = state_dir.join("agent-data-plane.log").to_string_lossy().into_owned(); + + prepare_host_process_adp_log_file(&mut test_case, &state_dir); + + assert_eq!(test_case.env.get(DATA_PLANE_LOG_FILE_ENV_VAR), Some(&expected_log_file)); + let crate::config::AssertionStep::Single(crate::config::AssertionConfig::FileContains { path, .. }) = + &test_case.assertions[1] + else { + panic!("expected second assertion to be file_contains"); + }; + assert_eq!(path, &expected_log_file); + } + #[test] fn standalone_adp_env_points_ipc_credentials_at_test_state_dir() { let state_dir = PathBuf::from("/tmp/panoramic-unix-test"); From ea9119eec8dbd2cb1b73e31412364d9c8f86de3b Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Fri, 29 May 2026 11:59:22 -0400 Subject: [PATCH 52/56] test(panoramic): split ADP log path tests by runtime The Linux default-path logging tests assert container paths such as /var/log/datadog/agent-data-plane.log. Those paths are correct for the docker runtime, but they are not appropriate for macOS host-process tests. 
Make the existing Linux-path cases docker-only, add an explicit macOS case for the behavior we still need to cover there, and remove the Unix-runner path/env rewriting that made the effective test behavior invisible from the YAML. The macOS case uses explicit Core Agent and ADP log-file paths under /tmp, so it still proves ADP ignores the Core Agent's log_file setting without requiring writes to /var or /opt. Verified with: - cargo test -p panoramic unix_runner::tests - cargo check -p panoramic - PANORAMIC_LOG_DIR=/tmp/panoramic-mac-logging-split make test-integration-macos-run CASE=adp-logging-mac-ignores-core-agent-log-file,adp-logging-respects-data-plane-log-file --- bin/correctness/panoramic/src/unix_runner.rs | 56 ------------------- .../adp-logging-default-path/config.yaml | 4 +- .../config.yaml | 2 +- .../config.yaml | 33 +++++++++++ 4 files changed, 36 insertions(+), 59 deletions(-) create mode 100644 test/integration/cases/adp-logging-mac-ignores-core-agent-log-file/config.yaml diff --git a/bin/correctness/panoramic/src/unix_runner.rs b/bin/correctness/panoramic/src/unix_runner.rs index 792736232ba..d15003b1dfe 100644 --- a/bin/correctness/panoramic/src/unix_runner.rs +++ b/bin/correctness/panoramic/src/unix_runner.rs @@ -58,9 +58,6 @@ const DEFAULT_CORE_AGENT_BINARY_PATH: &str = "/tmp/saluki-dda/datadog-agent/bin/ /// giving up and failing the test. const CORE_AGENT_IPC_READY_TIMEOUT: Duration = Duration::from_secs(60); const CORE_AGENT_IPC_READY_POLL: Duration = Duration::from_millis(200); -const DATA_PLANE_LOG_FILE_ENV_VAR: &str = "DD_DATA_PLANE_LOG_FILE"; -const DOCKER_DEFAULT_ADP_LOG_FILE: &str = "/var/log/datadog/agent-data-plane.log"; -const MACOS_DEFAULT_ADP_LOG_FILE: &str = "/opt/datadog-agent/logs/agent-data-plane.log"; /// Builds the env for a target process (Core Agent or ADP) under the Unix runner. /// @@ -233,7 +230,6 @@ impl UnixIntegrationRunner { // Phase: spawn ADP. 
let spawn_start = Instant::now(); let config_path_str = config_path.to_string_lossy().into_owned(); - prepare_host_process_adp_log_file(&mut self.test_case, &state_dir); if !self.test_case.requires_core_agent { if let Err(e) = seed_standalone_ipc_credentials(&state_dir, &auth_token_path) { if let Some(agent) = core_agent.take() { @@ -402,39 +398,6 @@ fn resolve_core_agent_binary_path() -> Result { }) } -fn prepare_host_process_adp_log_file(test_case: &mut IntegrationConfig, state_dir: &Path) { - let adp_log_file = test_case - .env - .entry(DATA_PLANE_LOG_FILE_ENV_VAR.to_string()) - .or_insert_with(|| state_dir.join("agent-data-plane.log").to_string_lossy().into_owned()) - .clone(); - - rewrite_default_adp_log_file_assertions(&mut test_case.assertions, &adp_log_file); -} - -fn rewrite_default_adp_log_file_assertions(assertions: &mut [crate::config::AssertionStep], adp_log_file: &str) { - for step in assertions { - match step { - crate::config::AssertionStep::Single(assertion) => { - rewrite_default_adp_log_file_assertion(assertion, adp_log_file); - } - crate::config::AssertionStep::Parallel { parallel } => { - for assertion in parallel { - rewrite_default_adp_log_file_assertion(assertion, adp_log_file); - } - } - } - } -} - -fn rewrite_default_adp_log_file_assertion(assertion: &mut crate::config::AssertionConfig, adp_log_file: &str) { - if let crate::config::AssertionConfig::FileContains { path, .. 
} = assertion { - if path == DOCKER_DEFAULT_ADP_LOG_FILE || path == MACOS_DEFAULT_ADP_LOG_FILE { - *path = adp_log_file.to_string(); - } - } -} - fn build_adp_forced_env( requires_core_agent: bool, state_dir: &Path, auth_token_path: String, ) -> Vec<(&'static str, String)> { @@ -531,25 +494,6 @@ impl LogSink for PanoramicLogSink { mod tests { use super::*; - #[test] - fn host_process_adp_log_file_defaults_to_test_state_dir_and_rewrites_assertions() { - let config_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) - .join("../../../test/integration/cases/adp-logging-default-path/config.yaml"); - let mut test_case = IntegrationConfig::from_yaml(config_path).expect("load test case"); - let state_dir = PathBuf::from("/tmp/panoramic-unix-test"); - let expected_log_file = state_dir.join("agent-data-plane.log").to_string_lossy().into_owned(); - - prepare_host_process_adp_log_file(&mut test_case, &state_dir); - - assert_eq!(test_case.env.get(DATA_PLANE_LOG_FILE_ENV_VAR), Some(&expected_log_file)); - let crate::config::AssertionStep::Single(crate::config::AssertionConfig::FileContains { path, .. 
}) = - &test_case.assertions[1] - else { - panic!("expected second assertion to be file_contains"); - }; - assert_eq!(path, &expected_log_file); - } - #[test] fn standalone_adp_env_points_ipc_credentials_at_test_state_dir() { let state_dir = PathBuf::from("/tmp/panoramic-unix-test"); diff --git a/test/integration/cases/adp-logging-default-path/config.yaml b/test/integration/cases/adp-logging-default-path/config.yaml index 90f9e517c05..100752e6e41 100644 --- a/test/integration/cases/adp-logging-default-path/config.yaml +++ b/test/integration/cases/adp-logging-default-path/config.yaml @@ -1,8 +1,8 @@ type: integration name: "adp-logging-default-path" -description: "Verifies ADP writes to the platform-default log file path (/var/log/datadog/agent-data-plane.log) when no override is provided" +description: "Verifies ADP writes to the platform-default Linux log file path (/var/log/datadog/agent-data-plane.log) when no override is provided" timeout: 120s -runtimes: [docker, mac] +runtimes: [docker] requires_core_agent: true env: diff --git a/test/integration/cases/adp-logging-ignores-core-agent-log-file/config.yaml b/test/integration/cases/adp-logging-ignores-core-agent-log-file/config.yaml index 0fcb1708daa..dac1d53e931 100644 --- a/test/integration/cases/adp-logging-ignores-core-agent-log-file/config.yaml +++ b/test/integration/cases/adp-logging-ignores-core-agent-log-file/config.yaml @@ -2,7 +2,7 @@ type: integration name: "adp-logging-ignores-core-agent-log-file" description: "Verifies ADP ignores the Core Agent's `log_file` setting and continues to use its own per-subagent log file path" timeout: 120s -runtimes: [docker, mac] +runtimes: [docker] requires_core_agent: true env: diff --git a/test/integration/cases/adp-logging-mac-ignores-core-agent-log-file/config.yaml b/test/integration/cases/adp-logging-mac-ignores-core-agent-log-file/config.yaml new file mode 100644 index 00000000000..f5610130cd0 --- /dev/null +++ 
b/test/integration/cases/adp-logging-mac-ignores-core-agent-log-file/config.yaml @@ -0,0 +1,33 @@ +type: integration +name: "adp-logging-mac-ignores-core-agent-log-file" +description: "Verifies ADP ignores the Core Agent's `log_file` setting on the macOS host-process runtime" +timeout: 120s +runtimes: [mac] +requires_core_agent: true + +env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test-mac-ignore-core-log-file" + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_USE_NEW_CONFIG_STREAM_ENDPOINT: "true" + DD_DATA_PLANE_DOGSTATSD_ENABLED: "true" + DD_LOG_FILE: "/tmp/coreagent-only-mac.log" + DD_DATA_PLANE_LOG_FILE: "/tmp/adp-mac-ignores-core-agent-log-file.log" + +container: + image: "saluki-images/datadog-agent:testing-devel" + exposed_ports: + - "58125/udp" + +assertions: + - type: log_contains + pattern: "Initial configuration received." + timeout: 30s + - type: file_contains + path: "/tmp/coreagent-only-mac.log" + pattern: "CORE" + timeout: 30s + - type: file_contains + path: "/tmp/adp-mac-ignores-core-agent-log-file.log" + pattern: "DATAPLANE" + timeout: 30s From afa587d7c57c0b1ef5d2cce6f13029c412d19541 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Fri, 29 May 2026 14:12:45 -0400 Subject: [PATCH 53/56] fix(panoramic): clean up mac test state directories Move the macOS integration-test Agent settings up with the other Makefile variables so they are discoverable. Also wrap each Unix-runner per-test state directory in an RAII guard so the runner removes only the directories it creates itself. This deliberately does not clean paths that appear in test YAML, such as file_contains log paths; those are test-owned paths, not framework-owned scratch state. 
Verified with: - cargo test -p panoramic unix_runner::tests - cargo check -p panoramic - make build-panoramic - PANORAMIC_LOG_DIR=/tmp/panoramic-state-cleanup-check2 make test-integration-macos-run CASE=adp-memory-mode-disabled --- Makefile | 16 +++----- bin/correctness/panoramic/src/unix_runner.rs | 43 ++++++++++++++++---- 2 files changed, 41 insertions(+), 18 deletions(-) diff --git a/Makefile b/Makefile index 832368a31bc..176ec91f2af 100644 --- a/Makefile +++ b/Makefile @@ -22,6 +22,12 @@ export ADP_APP_BUILD_TIME := $(APP_BUILD_TIME) # ADP-specific settings used when running. export ADP_STANDALONE_IPC_CERT_FILE := /tmp/adp-ipc-cert.pem +# macOS integration-test settings. +MACOS_TEST_AGENT_VERSION ?= 7.78.0 +MACOS_TEST_AGENT_DMG_DIR ?= /tmp/saluki-dda-dmg-cache +MACOS_TEST_AGENT_DMG_URL ?= https://s3.amazonaws.com/dd-agent/datadog-agent-$(MACOS_TEST_AGENT_VERSION)-1.$(shell uname -m).dmg +MACOS_TEST_AGENT_INSTALL_DIR ?= /tmp/saluki-dda/datadog-agent + # General build settings used for tooling, etc. export GO_BUILD_IMAGE ?= golang:1.23-bullseye export GO_APP_IMAGE ?= ubuntu:24.04 @@ -590,16 +596,6 @@ test-integration-macos-run: ## Runs the macOS host-process integration tests usi $(if $(CASE),-t $(CASE)) --no-tui -p 1 \ $(if $(PANORAMIC_LOG_DIR),-l $(PANORAMIC_LOG_DIR)) -# Version of the Datadog Agent installed by `provision-macos-test-env`. Pinned for -# reproducibility; bump when the integration tests need newer Agent behavior. -MACOS_TEST_AGENT_VERSION ?= 7.78.0 -MACOS_TEST_AGENT_DMG_DIR ?= /tmp/saluki-dda-dmg-cache -MACOS_TEST_AGENT_DMG_URL ?= https://s3.amazonaws.com/dd-agent/datadog-agent-$(MACOS_TEST_AGENT_VERSION)-1.$(shell uname -m).dmg -# Sandbox directory the Agent is installed into. Deliberately *not* /opt/datadog-agent: keeping -# our install isolated from any pre-existing system install (which a CI runner or developer host -# may already have at a different, conflicting version) avoids surprises in both directions. 
-MACOS_TEST_AGENT_INSTALL_DIR ?= /tmp/saluki-dda/datadog-agent - .PHONY: provision-macos-test-env provision-macos-test-env: ## Installs the pinned Datadog Agent ($(MACOS_TEST_AGENT_VERSION)) into $(MACOS_TEST_AGENT_INSTALL_DIR) (a sandbox under /tmp) and bootstraps the IPC cert. Idempotent: re-uses the install if it already matches the pinned version. @echo "[*] Provisioning macOS test environment..." diff --git a/bin/correctness/panoramic/src/unix_runner.rs b/bin/correctness/panoramic/src/unix_runner.rs index d15003b1dfe..d22c39b05b9 100644 --- a/bin/correctness/panoramic/src/unix_runner.rs +++ b/bin/correctness/panoramic/src/unix_runner.rs @@ -28,6 +28,7 @@ use std::sync::RwLock; use std::{ collections::HashMap, + ops::Deref, path::{Path, PathBuf}, sync::Arc, time::{Duration, Instant}, @@ -445,14 +446,40 @@ async fn wait_for_agent_ipc_ready(state_dir: &Path, timeout: Duration) -> Result )) } -fn create_test_state_dir() -> Result { - let suffix = rand::distr::Alphanumeric - .sample_string(&mut rand::rng(), 8) - .to_lowercase(); - let dir = std::env::temp_dir().join(format!("panoramic-unix-{}", suffix)); - std::fs::create_dir_all(&dir) - .with_error_context(|| format!("Failed to create state directory '{}'.", dir.display()))?; - Ok(dir) +struct TestStateDir { + path: PathBuf, +} + +impl TestStateDir { + fn create() -> Result { + let suffix = rand::distr::Alphanumeric + .sample_string(&mut rand::rng(), 8) + .to_lowercase(); + let path = std::env::temp_dir().join(format!("panoramic-unix-{}", suffix)); + std::fs::create_dir_all(&path) + .with_error_context(|| format!("Failed to create state directory '{}'.", path.display()))?; + Ok(Self { path }) + } +} + +impl Deref for TestStateDir { + type Target = Path; + + fn deref(&self) -> &Self::Target { + &self.path + } +} + +impl Drop for TestStateDir { + fn drop(&mut self) { + if let Err(e) = std::fs::remove_dir_all(&self.path) { + debug!(state_dir = %self.path.display(), error = %e, "Failed to remove per-test state 
directory."); + } + } +} + +fn create_test_state_dir() -> Result { + TestStateDir::create() } fn make_error_result( From a1f12f01919a41415cef6e459dc0680c79fe93dc Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Fri, 29 May 2026 14:26:43 -0400 Subject: [PATCH 54/56] fix(panoramic): always run Core Agent in mac integration tests The docker integration fixture always runs the Core Agent and ADP together via s6. Match that shape in the mac host-process runner: start the Core Agent for every test, then let the test config decide whether ADP behaves as standalone or uses the Agent/config stream. Mirror the docker cont-init collision avoidance for listeners ADP owns: disable Core Agent DogStatsD when ADP DogStatsD is enabled, and redirect Core Agent OTLP receivers when ADP OTLP is enabled. With Core Agent always present, the requires_core_agent YAML field is no longer meaningful, so remove it and the standalone self-signed IPC cert fallback. Verified with: - cargo test -p panoramic unix_runner::tests - cargo check -p panoramic - make build-panoramic - PANORAMIC_LOG_DIR=/tmp/panoramic-always-core-smoke2 make test-integration-macos-run CASE=basic-startup,adp-memory-mode-disabled,dogstatsd-default-bind,otlp-traces-enabled,adp-logging-mac-ignores-core-agent-log-file --- Cargo.lock | 1 - bin/correctness/panoramic/Cargo.toml | 1 - bin/correctness/panoramic/src/config.rs | 12 - bin/correctness/panoramic/src/unix_runner.rs | 232 +++++++++--------- .../cases/adp-cmd-port/config.yaml | 1 - .../cases/adp-config-check-exit/config.yaml | 1 - .../cases/adp-config-check-warn/config.yaml | 1 - .../cases/adp-config-stream/config.yaml | 1 - .../cases/adp-disabled-exit/config.yaml | 1 - .../adp-logging-default-path/config.yaml | 1 - .../config.yaml | 1 - .../config.yaml | 1 - .../config.yaml | 1 - .../cases/adp-rar-registration/config.yaml | 1 - 14 files changed, 119 insertions(+), 137 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 67704e91eb5..ec91ddbc2ce 100644 --- 
a/Cargo.lock +++ b/Cargo.lock @@ -2976,7 +2976,6 @@ dependencies = [ "kube", "rand 0.10.1", "rand_distr", - "rcgen", "regex", "reqwest", "rustls", diff --git a/bin/correctness/panoramic/Cargo.toml b/bin/correctness/panoramic/Cargo.toml index a160d9c4cae..4edd9b13443 100644 --- a/bin/correctness/panoramic/Cargo.toml +++ b/bin/correctness/panoramic/Cargo.toml @@ -22,7 +22,6 @@ k8s-openapi = { workspace = true } kube = { workspace = true, features = ["client", "http-proxy", "rustls-tls", "ws"] } rand = { workspace = true, features = ["std", "std_rng", "thread_rng"] } rand_distr = { workspace = true } -rcgen = { workspace = true, features = ["crypto", "aws_lc_rs", "pem"] } regex = { workspace = true, features = ["std"] } reqwest = { workspace = true, features = ["json", "zstd", "rustls", "query"] } rustls = { workspace = true } diff --git a/bin/correctness/panoramic/src/config.rs b/bin/correctness/panoramic/src/config.rs index 93d5577cd2e..6a24de60e2b 100644 --- a/bin/correctness/panoramic/src/config.rs +++ b/bin/correctness/panoramic/src/config.rs @@ -132,18 +132,6 @@ pub struct IntegrationConfig { #[serde(default = "default_integration_runtimes")] pub runtimes: Vec, - /// Whether this test requires a Core Agent process to be running alongside ADP. - /// - /// When `true`, host-process runtimes (such as `mac`) spawn the Datadog Core Agent as a side process before - /// starting ADP, sharing a per-test config directory so they communicate over IPC the same - /// way they would in production. When `false` (the default), only ADP is spawned (standalone - /// mode). - /// - /// On the `docker` runtime this field is informational; the converged image always runs - /// both processes via s6. - #[serde(default)] - pub requires_core_agent: bool, - /// Active runtime for this test instance. 
/// /// Empty at parse time; the discovery layer sets it to whichever runtime the CLI is scoped diff --git a/bin/correctness/panoramic/src/unix_runner.rs b/bin/correctness/panoramic/src/unix_runner.rs index d22c39b05b9..a5c963f9dd4 100644 --- a/bin/correctness/panoramic/src/unix_runner.rs +++ b/bin/correctness/panoramic/src/unix_runner.rs @@ -9,12 +9,12 @@ //! //! # Supported test shapes //! -//! - **Standalone**: only ADP is spawned. The default for tests that don't set -//! `requires_core_agent: true`. -//! - **Converged**: the Datadog Core Agent is spawned alongside ADP (when -//! `requires_core_agent: true`), sharing a per-test config directory so they authenticate -//! over IPC the same way they would in production. See the per-phase comments in -//! [`UnixIntegrationRunner::run`] for the cert/auth_token plumbing. +//! The Datadog Core Agent is always spawned alongside ADP, matching the Docker integration image's +//! fixture shape. Tests still control ADP behavior through configuration: standalone-mode tests set +//! `DD_DATA_PLANE_STANDALONE_MODE=true`, while converged tests enable remote-agent/config-stream +//! behavior. Both processes share a per-test config directory so they authenticate over IPC the same +//! way they would in production. See the per-phase comments in [`UnixIntegrationRunner::run`] for +//! the cert/auth_token plumbing. //! //! # Binary discovery //! @@ -35,8 +35,7 @@ use std::{ }; use airlock::unix::{LogSink, UnixProcess, UnixProcessConfig}; -use rand::distr::{Alphanumeric, SampleString as _}; -use rcgen::{generate_simple_self_signed, CertifiedKey}; +use rand::distr::SampleString as _; use saluki_error::{ErrorContext as _, GenericError}; use tokio::sync::Mutex; use tokio_util::sync::CancellationToken; @@ -145,101 +144,83 @@ impl UnixIntegrationRunner { buf: self.log_buffer.clone(), })); - // Path that both the Agent and ADP use for auth_token / ipc_cert.pem. 
Always computed, - // only inserted into env when the Agent is in the picture (see comments below). + // Path that both the Agent and ADP use for auth_token / ipc_cert.pem. let auth_token_path = state_dir.join("auth_token").to_string_lossy().into_owned(); - // Optional Phase: spawn the Core Agent (converged tests). + // Phase: spawn the Core Agent. // - // Converged tests need both the Core Agent and ADP running side-by-side, sharing a - // config directory so they can authenticate over IPC. We spawn the Agent first against - // the per-test state dir, wait until it has written `auth_token` and `ipc_cert.pem`, - // then spawn ADP with `DD_AUTH_TOKEN_FILE_PATH` pointing at the per-test auth token so - // ADP's IPC client uses the same per-test credentials (and ADP's own API server uses - // the matching cert). - let mut core_agent: Option = None; - if self.test_case.requires_core_agent { - let agent_spawn_start = Instant::now(); - let agent_binary = match resolve_core_agent_binary_path() { - Ok(p) => p, - Err(e) => return make_error_result(test_name, started, "resolve_core_agent", e, phase_timings), - }; - debug!(test = %test_name, binary = %agent_binary.display(), "Resolved Core Agent binary path."); - - // Forced runner-owned bindings: - // DD_AUTH_TOKEN_FILE_PATH — pin Agent + ADP to the same per-test path. The Agent's - // authoritative config (sent to ADP via the config stream) overrides ADP's env - // vars, so the Agent itself must be told about the per-test path; otherwise it - // advertises the platform default (`/opt/datadog-agent/etc/auth_token`), ADP - // follows that advice for its post-config-stream IPC clients, and TLS fails - // with UnknownIssuer because the platform default cert does not match what the - // per-test Agent is actually serving. - // DD_RUN_PATH — Agent's default `run_path` is the install prefix's `run/` dir - // (e.g., /opt/datadog-agent/run). 
Without overriding, a relocated Agent install - // would try to write its runtime state (remote-config db, sockets, pid file) - // back to /opt — typically not writable in CI. Scope it to the per-test state - // directory so each test gets a clean slate and nothing leaks across runs. - let agent_env = build_process_env( - &self.test_case.env, - &[ - ("DD_AUTH_TOKEN_FILE_PATH", auth_token_path.clone()), - ("DD_RUN_PATH", state_dir.to_string_lossy().into_owned()), - ], - ); - - let agent_config = UnixProcessConfig::new(format!("{}-core-agent", self.test_case.name), agent_binary) - .with_args(vec![ - "run".to_string(), - "-c".to_string(), - state_dir.to_string_lossy().into_owned(), - ]) - .with_env_map(agent_env); - - let agent = match UnixProcess::spawn(agent_config, log_sink.clone(), CancellationToken::new()).await { - Ok(p) => p, - Err(e) => { - phase_timings.push(PhaseTiming { - phase: "core_agent_spawn".to_string(), - duration: agent_spawn_start.elapsed(), - }); - return make_error_result(test_name, started, "core_agent_spawn", e, phase_timings); - } - }; - phase_timings.push(PhaseTiming { - phase: "core_agent_spawn".to_string(), - duration: agent_spawn_start.elapsed(), - }); - info!(test = %test_name, "Core Agent process started."); - - let wait_start = Instant::now(); - if let Err(e) = wait_for_agent_ipc_ready(&state_dir, CORE_AGENT_IPC_READY_TIMEOUT).await { - agent.cleanup().await; + // The Docker integration image always runs the Core Agent beside ADP via s6. Do the + // same for the Unix runner so mac tests keep the same fixture shape: standalone-mode + // tests still configure ADP not to use the Agent, but the Agent process exists. 
+ let agent_spawn_start = Instant::now(); + let agent_binary = match resolve_core_agent_binary_path() { + Ok(p) => p, + Err(e) => return make_error_result(test_name, started, "resolve_core_agent", e, phase_timings), + }; + debug!(test = %test_name, binary = %agent_binary.display(), "Resolved Core Agent binary path."); + + // Forced runner-owned bindings: + // DD_AUTH_TOKEN_FILE_PATH — pin Agent + ADP to the same per-test path. The Agent's + // authoritative config (sent to ADP via the config stream) overrides ADP's env + // vars, so the Agent itself must be told about the per-test path; otherwise it + // advertises the platform default (`/opt/datadog-agent/etc/auth_token`), ADP + // follows that advice for its post-config-stream IPC clients, and TLS fails + // with UnknownIssuer because the platform default cert does not match what the + // per-test Agent is actually serving. + // DD_RUN_PATH — Agent's default `run_path` is the install prefix's `run/` dir + // (e.g., /opt/datadog-agent/run). Without overriding, a relocated Agent install + // would try to write its runtime state (remote-config db, sockets, pid file) + // back to /opt — typically not writable in CI. Scope it to the per-test state + // directory so each test gets a clean slate and nothing leaks across runs. + // DD_USE_DOGSTATSD / OTLP endpoint overrides — mirror the Docker cont-init script's + // collision avoidance when ADP owns those listeners. 
+ let agent_forced = build_core_agent_forced_env(&self.test_case.env, &state_dir, auth_token_path.clone()); + let agent_env = build_process_env(&self.test_case.env, &agent_forced); + + let agent_config = UnixProcessConfig::new(format!("{}-core-agent", self.test_case.name), agent_binary) + .with_args(vec![ + "run".to_string(), + "-c".to_string(), + state_dir.to_string_lossy().into_owned(), + ]) + .with_env_map(agent_env); + + let agent = match UnixProcess::spawn(agent_config, log_sink.clone(), CancellationToken::new()).await { + Ok(p) => p, + Err(e) => { phase_timings.push(PhaseTiming { - phase: "core_agent_ipc_ready".to_string(), - duration: wait_start.elapsed(), + phase: "core_agent_spawn".to_string(), + duration: agent_spawn_start.elapsed(), }); - return make_error_result(test_name, started, "core_agent_ipc_ready", e, phase_timings); + return make_error_result(test_name, started, "core_agent_spawn", e, phase_timings); } + }; + phase_timings.push(PhaseTiming { + phase: "core_agent_spawn".to_string(), + duration: agent_spawn_start.elapsed(), + }); + info!(test = %test_name, "Core Agent process started."); + + let wait_start = Instant::now(); + if let Err(e) = wait_for_agent_ipc_ready(&state_dir, CORE_AGENT_IPC_READY_TIMEOUT).await { + agent.cleanup().await; phase_timings.push(PhaseTiming { phase: "core_agent_ipc_ready".to_string(), duration: wait_start.elapsed(), }); - debug!(test = %test_name, "Core Agent IPC credentials present."); - core_agent = Some(agent); + return make_error_result(test_name, started, "core_agent_ipc_ready", e, phase_timings); } + phase_timings.push(PhaseTiming { + phase: "core_agent_ipc_ready".to_string(), + duration: wait_start.elapsed(), + }); + debug!(test = %test_name, "Core Agent IPC credentials present."); + let mut core_agent = Some(agent); // Phase: spawn ADP. 
let spawn_start = Instant::now(); let config_path_str = config_path.to_string_lossy().into_owned(); - if !self.test_case.requires_core_agent { - if let Err(e) = seed_standalone_ipc_credentials(&state_dir, &auth_token_path) { - if let Some(agent) = core_agent.take() { - agent.cleanup().await; - } - return make_error_result(test_name, started, "prepare_standalone_ipc", e, phase_timings); - } - } - let adp_forced = build_adp_forced_env(self.test_case.requires_core_agent, &state_dir, auth_token_path); + let adp_forced = build_adp_forced_env(auth_token_path); let adp_env = build_process_env(&self.test_case.env, &adp_forced); let process_config = UnixProcessConfig::new(self.test_case.name.clone(), binary_path) .with_args(vec!["-c".to_string(), config_path_str, "run".to_string()]) @@ -399,34 +380,40 @@ fn resolve_core_agent_binary_path() -> Result { }) } -fn build_adp_forced_env( - requires_core_agent: bool, state_dir: &Path, auth_token_path: String, +fn build_core_agent_forced_env( + test_env: &HashMap, state_dir: &Path, auth_token_path: String, ) -> Vec<(&'static str, String)> { - if requires_core_agent { - vec![("DD_AUTH_TOKEN_FILE_PATH", auth_token_path)] - } else { - vec![ - ("DD_AUTH_TOKEN_FILE_PATH", auth_token_path), + let mut forced = vec![ + ("DD_AUTH_TOKEN_FILE_PATH", auth_token_path), + ("DD_RUN_PATH", state_dir.to_string_lossy().into_owned()), + ]; + + if env_is_true(test_env, "DD_DATA_PLANE_DOGSTATSD_ENABLED") { + forced.push(("DD_USE_DOGSTATSD", "0".to_string())); + } + + if env_is_true(test_env, "DD_DATA_PLANE_OTLP_ENABLED") { + forced.extend([ + ( + "DD_OTLP_CONFIG_RECEIVER_PROTOCOLS_GRPC_ENDPOINT", + "127.0.0.1:14317".to_string(), + ), ( - "DD_IPC_CERT_FILE_PATH", - state_dir.join("ipc_cert.pem").to_string_lossy().into_owned(), + "DD_OTLP_CONFIG_RECEIVER_PROTOCOLS_HTTP_ENDPOINT", + "127.0.0.1:14318".to_string(), ), - ] + ]); } -} -fn seed_standalone_ipc_credentials(state_dir: &Path, auth_token_path: &str) -> Result<(), GenericError> { - let auth_token = 
Alphanumeric.sample_string(&mut rand::rng(), 32); - std::fs::write(auth_token_path, auth_token) - .with_error_context(|| format!("Failed to write auth token at '{}'.", auth_token_path))?; + forced +} - let CertifiedKey { cert, signing_key } = generate_simple_self_signed(["localhost".to_string()]) - .error_context("Failed to generate self-signed IPC certificate.")?; - let cert_path = state_dir.join("ipc_cert.pem"); - std::fs::write(&cert_path, format!("{}{}", cert.pem(), signing_key.serialize_pem())) - .with_error_context(|| format!("Failed to write IPC certificate at '{}'.", cert_path.display()))?; +fn build_adp_forced_env(auth_token_path: String) -> Vec<(&'static str, String)> { + vec![("DD_AUTH_TOKEN_FILE_PATH", auth_token_path)] +} - Ok(()) +fn env_is_true(env: &HashMap, key: &str) -> bool { + env.get(key).is_some_and(|v| v == "true") } async fn wait_for_agent_ipc_ready(state_dir: &Path, timeout: Duration) -> Result<(), GenericError> { @@ -522,18 +509,37 @@ mod tests { use super::*; #[test] - fn standalone_adp_env_points_ipc_credentials_at_test_state_dir() { + fn adp_env_points_ipc_credentials_at_test_state_dir() { + let auth_token_path = "/tmp/panoramic-unix-test/auth_token".to_string(); + + let env: HashMap<_, _> = build_adp_forced_env(auth_token_path.clone()).into_iter().collect(); + + assert_eq!(env.get("DD_AUTH_TOKEN_FILE_PATH"), Some(&auth_token_path)); + } + + #[test] + fn core_agent_env_mirrors_docker_listener_collision_avoidance() { let state_dir = PathBuf::from("/tmp/panoramic-unix-test"); let auth_token_path = state_dir.join("auth_token").to_string_lossy().into_owned(); + let test_env = HashMap::from([ + ("DD_DATA_PLANE_DOGSTATSD_ENABLED".to_string(), "true".to_string()), + ("DD_DATA_PLANE_OTLP_ENABLED".to_string(), "true".to_string()), + ]); - let env: HashMap<_, _> = build_adp_forced_env(false, &state_dir, auth_token_path.clone()) + let env: HashMap<_, _> = build_core_agent_forced_env(&test_env, &state_dir, auth_token_path.clone()) .into_iter() 
.collect(); assert_eq!(env.get("DD_AUTH_TOKEN_FILE_PATH"), Some(&auth_token_path)); + assert_eq!(env.get("DD_RUN_PATH"), Some(&state_dir.to_string_lossy().into_owned())); + assert_eq!(env.get("DD_USE_DOGSTATSD"), Some(&"0".to_string())); + assert_eq!( + env.get("DD_OTLP_CONFIG_RECEIVER_PROTOCOLS_GRPC_ENDPOINT"), + Some(&"127.0.0.1:14317".to_string()) + ); assert_eq!( - env.get("DD_IPC_CERT_FILE_PATH"), - Some(&state_dir.join("ipc_cert.pem").to_string_lossy().into_owned()) + env.get("DD_OTLP_CONFIG_RECEIVER_PROTOCOLS_HTTP_ENDPOINT"), + Some(&"127.0.0.1:14318".to_string()) ); } } diff --git a/test/integration/cases/adp-cmd-port/config.yaml b/test/integration/cases/adp-cmd-port/config.yaml index ed09720193a..3de1c864c16 100644 --- a/test/integration/cases/adp-cmd-port/config.yaml +++ b/test/integration/cases/adp-cmd-port/config.yaml @@ -16,7 +16,6 @@ name: "adp-cmd-port" description: "Verifies ADP connects to the correct port when cmd_port is set" timeout: 120s runtimes: [docker, mac] -requires_core_agent: true env: DD_API_KEY: "test-api-key" diff --git a/test/integration/cases/adp-config-check-exit/config.yaml b/test/integration/cases/adp-config-check-exit/config.yaml index eb4ed9881e9..b4a84096698 100644 --- a/test/integration/cases/adp-config-check-exit/config.yaml +++ b/test/integration/cases/adp-config-check-exit/config.yaml @@ -11,7 +11,6 @@ name: "adp-config-check-exit" description: "Verify config check exits ADP on high-severity incompatible key" timeout: 120s runtimes: [docker, mac] -requires_core_agent: true env: DD_API_KEY: "test-api-key" diff --git a/test/integration/cases/adp-config-check-warn/config.yaml b/test/integration/cases/adp-config-check-warn/config.yaml index d5e5854b377..ea12f91ddba 100644 --- a/test/integration/cases/adp-config-check-warn/config.yaml +++ b/test/integration/cases/adp-config-check-warn/config.yaml @@ -8,7 +8,6 @@ name: "adp-config-check-warn" description: "Verify config check warns on medium-severity incompatible keys without 
exiting" timeout: 120s runtimes: [docker, mac] -requires_core_agent: true env: DD_API_KEY: "test-api-key" diff --git a/test/integration/cases/adp-config-stream/config.yaml b/test/integration/cases/adp-config-stream/config.yaml index b7824b14cae..52d4328703f 100644 --- a/test/integration/cases/adp-config-stream/config.yaml +++ b/test/integration/cases/adp-config-stream/config.yaml @@ -3,7 +3,6 @@ name: "adp-config-stream" description: "Verify ADP receives configuration from Core Agent via config stream" timeout: 120s runtimes: [docker, mac] -requires_core_agent: true env: DD_API_KEY: "test-api-key" diff --git a/test/integration/cases/adp-disabled-exit/config.yaml b/test/integration/cases/adp-disabled-exit/config.yaml index d8090cfd691..3c1061c5a32 100644 --- a/test/integration/cases/adp-disabled-exit/config.yaml +++ b/test/integration/cases/adp-disabled-exit/config.yaml @@ -3,7 +3,6 @@ name: "adp-disabled-exit" description: "Verify ADP exits cleanly when data plane is not enabled" timeout: 120s runtimes: [docker, mac] -requires_core_agent: true env: DD_API_KEY: "test-api-key" diff --git a/test/integration/cases/adp-logging-default-path/config.yaml b/test/integration/cases/adp-logging-default-path/config.yaml index 100752e6e41..5e4d07c6112 100644 --- a/test/integration/cases/adp-logging-default-path/config.yaml +++ b/test/integration/cases/adp-logging-default-path/config.yaml @@ -3,7 +3,6 @@ name: "adp-logging-default-path" description: "Verifies ADP writes to the platform-default Linux log file path (/var/log/datadog/agent-data-plane.log) when no override is provided" timeout: 120s runtimes: [docker] -requires_core_agent: true env: DD_API_KEY: "test-api-key" diff --git a/test/integration/cases/adp-logging-ignores-core-agent-log-file/config.yaml b/test/integration/cases/adp-logging-ignores-core-agent-log-file/config.yaml index dac1d53e931..bd3a40d18c9 100644 --- a/test/integration/cases/adp-logging-ignores-core-agent-log-file/config.yaml +++ 
b/test/integration/cases/adp-logging-ignores-core-agent-log-file/config.yaml @@ -3,7 +3,6 @@ name: "adp-logging-ignores-core-agent-log-file" description: "Verifies ADP ignores the Core Agent's `log_file` setting and continues to use its own per-subagent log file path" timeout: 120s runtimes: [docker] -requires_core_agent: true env: DD_API_KEY: "test-api-key" diff --git a/test/integration/cases/adp-logging-mac-ignores-core-agent-log-file/config.yaml b/test/integration/cases/adp-logging-mac-ignores-core-agent-log-file/config.yaml index f5610130cd0..439fc6a52b1 100644 --- a/test/integration/cases/adp-logging-mac-ignores-core-agent-log-file/config.yaml +++ b/test/integration/cases/adp-logging-mac-ignores-core-agent-log-file/config.yaml @@ -3,7 +3,6 @@ name: "adp-logging-mac-ignores-core-agent-log-file" description: "Verifies ADP ignores the Core Agent's `log_file` setting on the macOS host-process runtime" timeout: 120s runtimes: [mac] -requires_core_agent: true env: DD_API_KEY: "test-api-key" diff --git a/test/integration/cases/adp-logging-respects-data-plane-log-file/config.yaml b/test/integration/cases/adp-logging-respects-data-plane-log-file/config.yaml index 6514566959c..5bcc9146940 100644 --- a/test/integration/cases/adp-logging-respects-data-plane-log-file/config.yaml +++ b/test/integration/cases/adp-logging-respects-data-plane-log-file/config.yaml @@ -3,7 +3,6 @@ name: "adp-logging-respects-data-plane-log-file" description: "Verifies ADP honors the per-subagent `data_plane.log_file` setting when explicitly configured" timeout: 120s runtimes: [docker, mac] -requires_core_agent: true env: DD_API_KEY: "test-api-key" diff --git a/test/integration/cases/adp-rar-registration/config.yaml b/test/integration/cases/adp-rar-registration/config.yaml index 1610a11b3a4..6a85878f1e6 100644 --- a/test/integration/cases/adp-rar-registration/config.yaml +++ b/test/integration/cases/adp-rar-registration/config.yaml @@ -3,7 +3,6 @@ name: "adp-rar-registration" description: "Verify 
ADP successfully registers with Remote Agent Registry" timeout: 120s runtimes: [docker, mac] -requires_core_agent: true env: DD_API_KEY: "test-api-key" From 4f644c5c52cd5f06bdbf5ccfc44dd31e4ec1775f Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Fri, 29 May 2026 14:57:32 -0400 Subject: [PATCH 55/56] fix(panoramic): disable Core Agent DogStatsD when ADP owns it After making the mac runner always start the Core Agent, standalone ADP cases could race the Core Agent for the shifted DogStatsD UDP port. The Docker fixture avoids this when ADP owns DogStatsD; mirror that behavior in the mac runner even when tests rely on ADP's default DogStatsD setting and do not explicitly set DD_DATA_PLANE_DOGSTATSD_ENABLED=true. Disable Core Agent DogStatsD whenever ADP is enabled unless the test explicitly disables ADP DogStatsD. Verified with: - cargo test -p panoramic unix_runner::tests - cargo check -p panoramic - PANORAMIC_LOG_DIR=/tmp/panoramic-dsd-own-fix make test-integration-macos-run CASE=adp-memory-mode-disabled,adp-memory-mode-permissive-exceeds-limit,adp-memory-mode-permissive-within-limit,adp-memory-mode-strict-within-limit,basic-startup,otlp-traces-enabled --- bin/correctness/panoramic/src/unix_runner.rs | 40 ++++++++++++++++++-- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/bin/correctness/panoramic/src/unix_runner.rs b/bin/correctness/panoramic/src/unix_runner.rs index a5c963f9dd4..952fea67134 100644 --- a/bin/correctness/panoramic/src/unix_runner.rs +++ b/bin/correctness/panoramic/src/unix_runner.rs @@ -388,8 +388,8 @@ fn build_core_agent_forced_env( ("DD_RUN_PATH", state_dir.to_string_lossy().into_owned()), ]; - if env_is_true(test_env, "DD_DATA_PLANE_DOGSTATSD_ENABLED") { - forced.push(("DD_USE_DOGSTATSD", "0".to_string())); + if adp_owns_dogstatsd(test_env) { + forced.push(("DD_USE_DOGSTATSD", "false".to_string())); } if env_is_true(test_env, "DD_DATA_PLANE_OTLP_ENABLED") { @@ -412,6 +412,10 @@ fn build_adp_forced_env(auth_token_path: String) -> 
Vec<(&'static str, String)> vec![("DD_AUTH_TOKEN_FILE_PATH", auth_token_path)] } +fn adp_owns_dogstatsd(env: &HashMap) -> bool { + env_is_true(env, "DD_DATA_PLANE_ENABLED") && env.get("DD_DATA_PLANE_DOGSTATSD_ENABLED").is_none_or(|v| v != "false") +} + fn env_is_true(env: &HashMap, key: &str) -> bool { env.get(key).is_some_and(|v| v == "true") } @@ -522,6 +526,7 @@ mod tests { let state_dir = PathBuf::from("/tmp/panoramic-unix-test"); let auth_token_path = state_dir.join("auth_token").to_string_lossy().into_owned(); let test_env = HashMap::from([ + ("DD_DATA_PLANE_ENABLED".to_string(), "true".to_string()), ("DD_DATA_PLANE_DOGSTATSD_ENABLED".to_string(), "true".to_string()), ("DD_DATA_PLANE_OTLP_ENABLED".to_string(), "true".to_string()), ]); @@ -532,7 +537,7 @@ mod tests { assert_eq!(env.get("DD_AUTH_TOKEN_FILE_PATH"), Some(&auth_token_path)); assert_eq!(env.get("DD_RUN_PATH"), Some(&state_dir.to_string_lossy().into_owned())); - assert_eq!(env.get("DD_USE_DOGSTATSD"), Some(&"0".to_string())); + assert_eq!(env.get("DD_USE_DOGSTATSD"), Some(&"false".to_string())); assert_eq!( env.get("DD_OTLP_CONFIG_RECEIVER_PROTOCOLS_GRPC_ENDPOINT"), Some(&"127.0.0.1:14317".to_string()) @@ -542,4 +547,33 @@ mod tests { Some(&"127.0.0.1:14318".to_string()) ); } + + #[test] + fn core_agent_dogstatsd_is_disabled_when_adp_uses_default_dogstatsd_setting() { + let state_dir = PathBuf::from("/tmp/panoramic-unix-test"); + let auth_token_path = state_dir.join("auth_token").to_string_lossy().into_owned(); + let test_env = HashMap::from([("DD_DATA_PLANE_ENABLED".to_string(), "true".to_string())]); + + let env: HashMap<_, _> = build_core_agent_forced_env(&test_env, &state_dir, auth_token_path) + .into_iter() + .collect(); + + assert_eq!(env.get("DD_USE_DOGSTATSD"), Some(&"false".to_string())); + } + + #[test] + fn core_agent_dogstatsd_is_not_disabled_when_adp_dogstatsd_is_explicitly_disabled() { + let state_dir = PathBuf::from("/tmp/panoramic-unix-test"); + let auth_token_path = 
state_dir.join("auth_token").to_string_lossy().into_owned(); + let test_env = HashMap::from([ + ("DD_DATA_PLANE_ENABLED".to_string(), "true".to_string()), + ("DD_DATA_PLANE_DOGSTATSD_ENABLED".to_string(), "false".to_string()), + ]); + + let env: HashMap<_, _> = build_core_agent_forced_env(&test_env, &state_dir, auth_token_path) + .into_iter() + .collect(); + + assert!(!env.contains_key("DD_USE_DOGSTATSD")); + } } From 09379a59af314d38f85d0f577b98b05a5f264a6a Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Mon, 1 Jun 2026 10:33:03 -0400 Subject: [PATCH 56/56] test(panoramic): adapt adp config no-warn case for mac runtime The new adp-config-check-no-warn case from main still used the old container.env shape, so the mac runner did not pass its ADP settings to the spawned processes. ADP received the dynamic config stream, saw itself as disabled, and exited. Move the env block to the top-level integration config shape and mark the case as portable across docker and mac, matching adp-config-check-warn. Because this case deliberately disables DogStatsD to prove DogStatsD-affined config warnings are suppressed, enable OTLP so ADP still has an active data pipeline and can remain stable for the assertion window. 
Verified with: - PANORAMIC_LOG_DIR=/tmp/panoramic-adp-check-no-warn-3 make test-integration-macos-run CASE=adp-config-check-no-warn --- .../adp-config-check-no-warn/config.yaml | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/test/integration/cases/adp-config-check-no-warn/config.yaml b/test/integration/cases/adp-config-check-no-warn/config.yaml index 4cd21c08409..27d217d2f04 100644 --- a/test/integration/cases/adp-config-check-no-warn/config.yaml +++ b/test/integration/cases/adp-config-check-no-warn/config.yaml @@ -9,17 +9,20 @@ type: integration name: "adp-config-check-no-warn" description: "Verify config check suppresses warnings for incompatible keys whose pipeline is inactive" timeout: 120s +runtimes: [docker, mac] + +env: + DD_API_KEY: "test-api-key" + DD_HOSTNAME: "integration-test-config-no-warn" + DD_DATA_PLANE_ENABLED: "true" + DD_DATA_PLANE_STANDALONE_MODE: "false" + DD_DATA_PLANE_USE_NEW_CONFIG_STREAM_ENDPOINT: "true" + DD_DATA_PLANE_DOGSTATSD_ENABLED: "false" # DogStatsD pipeline inactive + DD_DATA_PLANE_OTLP_ENABLED: "true" # Keep ADP running with a non-DogStatsD pipeline. + DD_DOGSTATSD_TELEMETRY_ENABLED_LISTENER_ID: "true" # Incompatible(Medium, DogStatsD) container: image: "saluki-images/datadog-agent:testing-devel" - env: - DD_API_KEY: "test-api-key" - DD_HOSTNAME: "integration-test-config-no-warn" - DD_DATA_PLANE_ENABLED: "true" - DD_DATA_PLANE_STANDALONE_MODE: "false" - DD_DATA_PLANE_USE_NEW_CONFIG_STREAM_ENDPOINT: "true" - DD_DATA_PLANE_DOGSTATSD_ENABLED: "false" # DogStatsD pipeline inactive - DD_DOGSTATSD_TELEMETRY_ENABLED_LISTENER_ID: "true" # Incompatible(Medium, DogStatsD) assertions: - parallel: