From d203dbfbd0a486d9be5eccabc3daa8284a74ab63 Mon Sep 17 00:00:00 2001 From: Toby Lawrence Date: Mon, 1 Jun 2026 16:36:09 -0400 Subject: [PATCH 1/2] feat(metrics): add V3 payload support to the Datadog Metrics encoder Adds experimental V3 columnar encoding for series and sketch metrics behind the serializer_experimental_use_v3_api.* config keys, including a V2/V3 validation mode, V3 intake routing/filtering, intake-side V3 payload parsing for correctness testing, and the dsd-plain-v3 correctness cases. Existing V1/V2 series and sketch encoding is preserved. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../config/vocabularies/technical/accept.txt | 2 + Cargo.lock | 8 + Cargo.toml | 1 + LICENSE-3rdparty.csv | 1 + .../dogstatsd/known-configs.json | 56 +- lib/protos/datadog/build.rs | 5 +- .../datadog/proto/agent-payload/README.md | 2 +- .../proto/agent-payload/intake_v3.proto | 79 + lib/protos/datadog/src/lib.rs | 5 + lib/saluki-components/Cargo.toml | 1 + lib/saluki-components/etc/ignored_keys.yaml | 4 - .../src/common/datadog/config.rs | 18 +- .../src/common/datadog/endpoints.rs | 194 +- .../src/common/datadog/io.rs | 92 +- .../src/common/datadog/mod.rs | 20 +- .../src/common/datadog/protocol.rs | 170 ++ .../src/common/datadog/transaction.rs | 9 + .../src/config_registry/datadog/encoders.rs | 105 + .../config_registry/datadog/unsupported.rs | 58 +- .../src/encoders/datadog/metrics/endpoint.rs | 64 + .../src/encoders/datadog/metrics/mod.rs | 2363 +++++++++-------- .../src/encoders/datadog/metrics/v1/mod.rs | 235 ++ .../encoders/datadog/metrics/v2/constants.rs | 42 + .../src/encoders/datadog/metrics/v2/mod.rs | 793 ++++++ .../encoders/datadog/metrics/v3/constants.rs | 29 + .../encoders/datadog/metrics/v3/interner.rs | 94 + .../src/encoders/datadog/metrics/v3/mod.rs | 26 + .../encoders/datadog/metrics/v3/payload.rs | 52 + .../src/encoders/datadog/metrics/v3/types.rs | 211 ++ .../src/encoders/datadog/metrics/v3/writer.rs | 820 ++++++ 
lib/saluki-components/vendor/core_schema.yaml | 15 +- lib/saluki-core/Cargo.toml | 1 + .../src/data_model/payload/metadata.rs | 26 + .../dsd-plain-v3-validation/config.yaml | 22 + .../dsd-plain-v3-validation/datadog.yaml | 36 + .../dsd-plain-v3-validation/millstone.yaml | 91 + test/correctness/dsd-plain-v3/config.yaml | 22 + test/correctness/dsd-plain-v3/datadog.yaml | 35 + test/correctness/dsd-plain-v3/millstone.yaml | 91 + 39 files changed, 4744 insertions(+), 1154 deletions(-) create mode 100644 lib/protos/datadog/proto/agent-payload/intake_v3.proto create mode 100644 lib/saluki-components/src/common/datadog/protocol.rs create mode 100644 lib/saluki-components/src/encoders/datadog/metrics/endpoint.rs create mode 100644 lib/saluki-components/src/encoders/datadog/metrics/v1/mod.rs create mode 100644 lib/saluki-components/src/encoders/datadog/metrics/v2/constants.rs create mode 100644 lib/saluki-components/src/encoders/datadog/metrics/v2/mod.rs create mode 100644 lib/saluki-components/src/encoders/datadog/metrics/v3/constants.rs create mode 100644 lib/saluki-components/src/encoders/datadog/metrics/v3/interner.rs create mode 100644 lib/saluki-components/src/encoders/datadog/metrics/v3/mod.rs create mode 100644 lib/saluki-components/src/encoders/datadog/metrics/v3/payload.rs create mode 100644 lib/saluki-components/src/encoders/datadog/metrics/v3/types.rs create mode 100644 lib/saluki-components/src/encoders/datadog/metrics/v3/writer.rs create mode 100644 test/correctness/dsd-plain-v3-validation/config.yaml create mode 100644 test/correctness/dsd-plain-v3-validation/datadog.yaml create mode 100644 test/correctness/dsd-plain-v3-validation/millstone.yaml create mode 100644 test/correctness/dsd-plain-v3/config.yaml create mode 100644 test/correctness/dsd-plain-v3/datadog.yaml create mode 100644 test/correctness/dsd-plain-v3/millstone.yaml diff --git a/.vale/styles/config/vocabularies/technical/accept.txt b/.vale/styles/config/vocabularies/technical/accept.txt index 
ad9315178e9..e591b4e93da 100644 --- a/.vale/styles/config/vocabularies/technical/accept.txt +++ b/.vale/styles/config/vocabularies/technical/accept.txt @@ -229,3 +229,5 @@ mpmc dhat profiler launchd +varint +serializer diff --git a/Cargo.lock b/Cargo.lock index 6aacceee514..5c101faf3ec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -130,6 +130,12 @@ version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" +[[package]] +name = "anymap3" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "170433209e817da6aae2c51aa0dd443009a613425dd041ebfb2492d1c4c11a25" + [[package]] name = "arc-swap" version = "1.9.1" @@ -4154,6 +4160,7 @@ dependencies = [ "tracing-appender", "tracing-rolling-file", "url", + "uuid", "zstd", ] @@ -4197,6 +4204,7 @@ dependencies = [ name = "saluki-core" version = "0.1.0" dependencies = [ + "anymap3", "async-trait", "bitmask-enum", "ddsketch", diff --git a/Cargo.toml b/Cargo.toml index d9350879ba6..990b0efba24 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -62,6 +62,7 @@ stele = { path = "bin/correctness/stele" } stringtheory = { path = "lib/stringtheory" } ottl = { path = "lib/ottl" } otlp-protos = { path = "lib/protos/otlp" } +anymap3 = { version = "1", default-features = false, features = ["std"] } async-trait = { version = "0.1", default-features = false } atty = { version = "0.2", default-features = false } axum = { version = "0.8", default-features = false } diff --git a/LICENSE-3rdparty.csv b/LICENSE-3rdparty.csv index 6490abcd2b5..2b3e42e569f 100644 --- a/LICENSE-3rdparty.csv +++ b/LICENSE-3rdparty.csv @@ -8,6 +8,7 @@ android_system_properties,https://github.com/nical/android_system_properties,MIT anes,https://github.com/zrzka/anes-rs,MIT OR Apache-2.0,Robert Vojta anstyle,https://github.com/rust-cli/anstyle,MIT OR Apache-2.0,The anstyle Authors anyhow,https://github.com/dtolnay/anyhow,MIT OR 
Apache-2.0,David Tolnay +anymap3,https://github.com/reivilibre/anymap3,BlueOak-1.0.0 OR MIT OR Apache-2.0,"Olivier 'reivilibre' (fork maintainer) , Chris Morgan (original author) " arc-swap,https://github.com/vorner/arc-swap,MIT OR Apache-2.0,Michal 'vorner' Vaner argh,https://github.com/google/argh,BSD-3-Clause,"Taylor Cramer , Benjamin Brittain , Erick Tryzelaar " argh_derive,https://github.com/google/argh,BSD-3-Clause,"Taylor Cramer , Benjamin Brittain , Erick Tryzelaar " diff --git a/docs/agent-data-plane/configuration/dogstatsd/known-configs.json b/docs/agent-data-plane/configuration/dogstatsd/known-configs.json index 6417e79b46c..ade733bd000 100644 --- a/docs/agent-data-plane/configuration/dogstatsd/known-configs.json +++ b/docs/agent-data-plane/configuration/dogstatsd/known-configs.json @@ -1747,29 +1747,29 @@ }, { "key": "serializer_experimental_use_v3_api.compression_level", - "feature_state": "MISSING", - "action": "IMPLEMENT", + "feature_state": "PARITY", + "action": "NONE", "description": "V3 API zstd compression level", - "reason": "V2 to V3 migration PR in progress. Feature is missing but actively being addressed.", - "issue": "#1468", + "reason": "Implemented in ADP via the experimental V3 columnar metrics encoder.", + "issue": null, "adp_key": null }, { "key": "serializer_experimental_use_v3_api.series.beta_route", - "feature_state": "MISSING", - "action": "IMPLEMENT", + "feature_state": "PARITY", + "action": "NONE", "description": "V3 beta intake route path for series", - "reason": "Controls series intake beta route for metric forwarding", - "issue": "#1468", + "reason": "Implemented in ADP via the experimental V3 columnar metrics encoder.", + "issue": null, "adp_key": null }, { "key": "serializer_experimental_use_v3_api.series.endpoints", - "feature_state": "MISSING", - "action": "IMPLEMENT", + "feature_state": "PARITY", + "action": "NONE", "description": "Endpoints enabled for V3 series API", - "reason": "V2 to V3 migration PR in progress. 
Feature is missing but actively being addressed.", - "issue": "#1468", + "reason": "Implemented in ADP via the experimental V3 columnar metrics encoder.", + "issue": null, "adp_key": null }, { @@ -1792,38 +1792,38 @@ }, { "key": "serializer_experimental_use_v3_api.series.use_beta", - "feature_state": "MISSING", - "action": "IMPLEMENT", + "feature_state": "PARITY", + "action": "NONE", "description": "Use v3beta route instead of v3 for series", - "reason": "Enables V3 beta API for series metric forwarding", - "issue": "#1468", + "reason": "Implemented in ADP via the experimental V3 columnar metrics encoder.", + "issue": null, "adp_key": null }, { "key": "serializer_experimental_use_v3_api.series.validate", - "feature_state": "MISSING", - "action": "IMPLEMENT", + "feature_state": "PARITY", + "action": "NONE", "description": "Dual-send v2+v3 series for validation", - "reason": "V2 to V3 migration PR in progress. Feature is missing but actively being addressed.", - "issue": "#1468", + "reason": "Implemented in ADP via the experimental V3 columnar metrics encoder.", + "issue": null, "adp_key": null }, { "key": "serializer_experimental_use_v3_api.sketches.endpoints", - "feature_state": "MISSING", - "action": "IMPLEMENT", + "feature_state": "PARITY", + "action": "NONE", "description": "Endpoints enabling v3 sketches API", - "reason": "V2 to V3 migration PR in progress. Feature is missing but actively being addressed.", - "issue": "#1468", + "reason": "Implemented in ADP via the experimental V3 columnar metrics encoder.", + "issue": null, "adp_key": null }, { "key": "serializer_experimental_use_v3_api.sketches.validate", - "feature_state": "MISSING", - "action": "IMPLEMENT", + "feature_state": "PARITY", + "action": "NONE", "description": "Dual-send v2+v3 sketches for validation", - "reason": "V2 to V3 migration PR in progress. 
Feature is missing but actively being addressed.", - "issue": "#1468", + "reason": "Implemented in ADP via the experimental V3 columnar metrics encoder.", + "issue": null, "adp_key": null }, { diff --git a/lib/protos/datadog/build.rs b/lib/protos/datadog/build.rs index 4c576e001b7..d77e28dd751 100644 --- a/lib/protos/datadog/build.rs +++ b/lib/protos/datadog/build.rs @@ -88,7 +88,10 @@ fn main() { protobuf_codegen::Codegen::new() .protoc() .includes(["proto", "proto/datadog-agent"]) - .inputs(["proto/agent-payload/agent_payload.proto"]) + .inputs([ + "proto/agent-payload/agent_payload.proto", + "proto/agent-payload/intake_v3.proto", + ]) .cargo_out_dir("protos") .customize(codegen_customize.clone()) .run_from_script(); diff --git a/lib/protos/datadog/proto/agent-payload/README.md b/lib/protos/datadog/proto/agent-payload/README.md index 5260c8e6d9e..5e0e6acc7a8 100644 --- a/lib/protos/datadog/proto/agent-payload/README.md +++ b/lib/protos/datadog/proto/agent-payload/README.md @@ -6,4 +6,4 @@ Agent/Agent Data Plane send telemetry payloads to. ## Source **Repository:** https://github.com/DataDog/agent-payload.git -**Branch / Tag**: v5.0.164 +**Branch / Tag**: v5.0.180 diff --git a/lib/protos/datadog/proto/agent-payload/intake_v3.proto b/lib/protos/datadog/proto/agent-payload/intake_v3.proto new file mode 100644 index 00000000000..5660bb8612a --- /dev/null +++ b/lib/protos/datadog/proto/agent-payload/intake_v3.proto @@ -0,0 +1,79 @@ +syntax = "proto3"; + +package datadoghq.api.metrics.v3; + +message Payload { + reserved 1; // for compatibility with agentpayload.MetricPayload.series + Metadata metadata = 2; + MetricData metricData = 3; +} + +message Metadata { + repeated string tags = 1; + repeated string resources = 2; // even number of elements, [Type, Name] pairs +} + +message MetricData { + // Dictionaries + // All dictionary indexes are base-1, zero implicitly represents an empty value. 
+ bytes dictNameStr = 1; // varint length + value + bytes dictTagStr = 2; // varint length + value + repeated sint64 dictTagsets = 3; // length, delta encoded set of indexes into dictTagStr + + bytes dictResourceStr = 4; // varint length + value + repeated int64 dictResourceLen = 5; // number of elements in Type and Name arrays + repeated sint64 dictResourceType = 6; // delta encoded set of indexes into dictResourceStr + repeated sint64 dictResourceName = 7; // delta encoded set of indexes into dictResourceStr + + bytes dictSourceTypeName = 8; // varint length + value + repeated int32 dictOriginInfo = 9; // (product, category, service) tuples + bytes dictUnitStr = 25; // varint length + value + + // One entry per time series + repeated uint64 types = 10; // type = metricType | valueType | metricFlags + repeated sint64 nameRefs = 11; // index into dictNameStr, entire array is delta encoded + repeated sint64 tagsetRefs = 12; // index into dictTagsets, entire array is delta encoded + repeated sint64 resourcesRefs = 13; // index into dictResourceLen, entire array is delta encoded + repeated uint64 intervals = 14; + repeated uint64 numPoints = 15; + repeated sint64 sourceTypeNameRefs = 23; // index into dictSourceTypeName, entire array is delta encoded + repeated sint64 originInfoRefs = 24; // index into dictOriginInfo, entire array is delta encoded + repeated sint64 unitRefs = 26; // index into dictUnitStr, value present if flagHasUnit is set, entire array is delta encoded + + // each metric has numPoints values in this section + repeated sint64 timestamps = 16; // entire array delta encoded + repeated sint64 valsSint64 = 17; // or + repeated float valsFloat32 = 18; // or + repeated double valsFloat64 = 19; // based on valueType + repeated uint64 sketchNumBins = 20; + repeated sint32 sketchBinKeys = 21; // per-metric sequence is delta encoded + repeated uint32 sketchBinCnts = 22; + // sketch summary Sum, Min, Max are encoded as three consecutive elements in one of
vals using valueType + // sketch summary Cnt is always encoded in valsSint64 + // sketch summary Avg is reconstructed as Sum/Cnt in the intake +} + +enum metricType { + UNUSED = 0; + Count = 1; + Rate = 2; + Gauge = 3; + Sketch = 4; +} + +enum valueType { + Zero = 0x00; // value is zero, not stored explicitly + Sint64 = 0x10; // value is stored in valsSint64 + Float32 = 0x20; // value is stored in valsFloat32 + Float64 = 0x30; // value is stored in valsFloat64 +} + +enum metricFlags { + flagNone = 0; + flagNoIndex = 0x100; // metric should not be indexed (equivalent to origin metric type == agent_hidden in v2) + flagHasUnit = 0x200; // timeseries has a unit in the unitRefs column +} + +message Response { + string error = 1; +} diff --git a/lib/protos/datadog/src/lib.rs b/lib/protos/datadog/src/lib.rs index 6a5a17272a7..92a4094201d 100644 --- a/lib/protos/datadog/src/lib.rs +++ b/lib/protos/datadog/src/lib.rs @@ -38,6 +38,11 @@ pub mod metrics { pub use super::include::agent_payload::metric_payload::*; pub use super::include::agent_payload::sketch_payload::{sketch::*, Sketch}; pub use super::include::agent_payload::*; + + /// Metrics V3 API-related definitions. + pub mod v3 { + pub use super::super::include::intake_v3::*; + } } /// Event-related definitions.
diff --git a/lib/saluki-components/Cargo.toml b/lib/saluki-components/Cargo.toml index 9056455e589..3c5e9b3e6e2 100644 --- a/lib/saluki-components/Cargo.toml +++ b/lib/saluki-components/Cargo.toml @@ -84,6 +84,7 @@ tracing = { workspace = true } tracing-appender = { workspace = true } tracing-rolling-file = { workspace = true } url = { workspace = true } +uuid = { workspace = true, features = ["std", "v7"] } zstd = { workspace = true } [dev-dependencies] diff --git a/lib/saluki-components/etc/ignored_keys.yaml b/lib/saluki-components/etc/ignored_keys.yaml index b60897e3034..342f73f8c02 100644 --- a/lib/saluki-components/etc/ignored_keys.yaml +++ b/lib/saluki-components/etc/ignored_keys.yaml @@ -3074,10 +3074,6 @@ reason: initial bulk - name: security_agent.log_file reason: initial bulk -- name: serializer_experimental_use_v3_api.series.beta_route - reason: initial bulk -- name: serializer_experimental_use_v3_api.series.use_beta - reason: initial bulk - name: server_timeout reason: initial bulk - name: serverless.enabled diff --git a/lib/saluki-components/src/common/datadog/config.rs b/lib/saluki-components/src/common/datadog/config.rs index ac9a94a2802..5e0c6cadf5c 100644 --- a/lib/saluki-components/src/common/datadog/config.rs +++ b/lib/saluki-components/src/common/datadog/config.rs @@ -9,6 +9,7 @@ use tracing::warn; use super::{ endpoints::{EndpointConfiguration, EndpointRoute, RoutableEndpoint}, + protocol::V3ApiConfig, proxy::ProxyConfiguration, retry::RetryConfiguration, }; @@ -203,6 +204,13 @@ pub struct ForwarderConfiguration { )] connection_reset_interval_secs: u64, + /// V3 API configuration for per-endpoint V3 support. + /// + /// This is read from the encoder configuration and used by the I/O layer to filter payloads + /// based on endpoint URL matching. + #[serde(rename = "serializer_experimental_use_v3_api", default)] + v3_api: V3ApiConfig, + /// Whether to disable TLS certificate validation for Datadog intake forwarding. /// /// Defaults to `false`. 
If set to `true`, HTTPS clients built for the shared Datadog forwarder accept invalid @@ -341,6 +349,11 @@ impl ForwarderConfiguration { Duration::from_secs(self.connection_reset_interval_secs) } + /// Returns a reference to the V3 API configuration. + pub fn v3_api(&self) -> &V3ApiConfig { + &self.v3_api + } + /// Returns whether TLS certificate validation is disabled for Datadog intake forwarding. pub const fn skip_ssl_validation(&self) -> bool { self.skip_ssl_validation @@ -875,7 +888,10 @@ mod config_smoke { // config load in the smoke test has a valid starting point. run_config_smoke_tests( structs::FORWARDER_CONFIGURATION, - &[], + &[ + "serializer_experimental_use_v3_api.sketches.beta_route", + "serializer_experimental_use_v3_api.sketches.use_beta", + ], json!({ "api_key": "smoke-test-api-key" }), |cfg| ForwarderConfiguration::from_configuration(&cfg).expect("ForwarderConfiguration should deserialize"), ) diff --git a/lib/saluki-components/src/common/datadog/endpoints.rs b/lib/saluki-components/src/common/datadog/endpoints.rs index 95e287ce094..a94d97e42d7 100644 --- a/lib/saluki-components/src/common/datadog/endpoints.rs +++ b/lib/saluki-components/src/common/datadog/endpoints.rs @@ -16,6 +16,8 @@ use snafu::{ResultExt, Snafu}; use tracing::debug; use url::Url; +use super::protocol::{MetricsPayloadInfo, MetricsProtocolVersion}; + static DD_URL_REGEX: LazyLock = LazyLock::new(|| Regex::new(r"^app(\.mrf)?(\.[a-z]{2}\d)?\.(datad(oghq|0g)\.(com|eu)|ddog-gov\.com)$").unwrap()); @@ -25,6 +27,104 @@ fn default_site() -> String { DEFAULT_SITE.to_owned() } +/// Per-endpoint V3 protocol settings. +/// +/// These settings control which protocol versions an endpoint will accept for metrics payloads. +/// Settings are derived from a global `V3ApiConfig` by matching the endpoint URL against the +/// configured V3 endpoint lists. +#[derive(Clone, Debug, Default)] +pub struct EndpointV3Settings { + /// Whether this endpoint accepts V3 series payloads. 
+ pub use_v3_series: bool, + + /// Whether this endpoint accepts V3 sketches payloads. + pub use_v3_sketches: bool, + + /// Whether validation mode is enabled for series (send both V2 and V3). + pub series_validation_mode: bool, + + /// Whether validation mode is enabled for sketches (send both V2 and V3). + pub sketches_validation_mode: bool, +} + +impl EndpointV3Settings { + /// Creates V3 settings for a specific endpoint based on URL matching. + /// + /// The `v3_series_endpoints` and `v3_sketches_endpoints` are lists of configured endpoint names. + /// If the endpoint name matches any entry, V3 is enabled for that metric type. + pub fn from_endpoint_url( + configured_endpoint: &str, v3_series_endpoints: &[String], v3_sketches_endpoints: &[String], + series_validate: bool, sketches_validate: bool, + ) -> Self { + let use_v3_series = v3_series_endpoints.iter().any(|e| configured_endpoint == e); + let use_v3_sketches = v3_sketches_endpoints.iter().any(|e| configured_endpoint == e); + + Self { + use_v3_series, + use_v3_sketches, + series_validation_mode: use_v3_series && series_validate, + sketches_validation_mode: use_v3_sketches && sketches_validate, + } + } + + /// Determines if this endpoint should receive a payload with the given payload info. + /// + /// Returns `true` if the endpoint should receive the payload, `false` otherwise. + /// + /// The logic is: + /// - V2 series payload: accept if series V3 is disabled OR series validation mode is enabled + /// - V2 sketches payload: accept if sketches V3 is disabled OR sketches validation mode is enabled + /// - V3 series payload: accept if series V3 is enabled + /// - V3 sketches payload: accept if sketches V3 is enabled + /// - Non-metrics payloads (None): always accept + pub fn should_receive_payload(&self, payload_info: Option) -> bool { + let Some(info) = payload_info else { + // No payload info - this is a non-metrics payload or legacy payload, always accept. 
+ return true; + }; + + let is_sketch = info.is_sketch(); + + match info.version { + MetricsProtocolVersion::V2 => { + if is_sketch { + // V2 sketches: accept if V3 sketches is disabled OR validation mode is enabled + !self.use_v3_sketches || self.sketches_validation_mode + } else { + // V2 series: accept if V3 series is disabled OR validation mode is enabled + !self.use_v3_series || self.series_validation_mode + } + } + + MetricsProtocolVersion::V3 => { + if is_sketch { + // V3 sketches: accept if V3 sketches is enabled + self.use_v3_sketches + } else { + // V3 series: accept if V3 series is enabled + self.use_v3_series + } + } + } + } + + /// Determines if this endpoint should receive metrics validation headers. + /// + /// Validation headers are endpoint-scoped: they should only be sent to endpoints that are + /// receiving both V2 and V3 payloads for the payload's metric family. + pub fn should_receive_validation_headers(&self, payload_info: Option) -> bool { + let Some(info) = payload_info else { + return false; + }; + + if info.is_sketch() { + self.sketches_validation_mode + } else { + self.series_validation_mode + } + } +} + /// Error type for invalid endpoints. 
#[derive(Debug, Snafu)] #[snafu(context(suffix(false)))] @@ -105,6 +205,7 @@ impl AdditionalEndpoints { seen.insert(trimmed_api_key); resolved.push(ResolvedEndpoint { endpoint: endpoint.clone(), + configured_endpoint: raw_endpoint.to_string(), api_key: trimmed_api_key.to_string(), config: configuration.clone(), api_key_index: Some(index), @@ -215,6 +316,7 @@ impl EndpointConfiguration { #[derive(Clone, Debug)] pub struct ResolvedEndpoint { endpoint: Url, + configured_endpoint: String, api_key: String, config: Option, /// Position of this key in the `additional_endpoints` config key list for its URL (raw @@ -286,6 +388,7 @@ impl ResolvedEndpoint { let traces_authority = compute_traces_authority(&endpoint); Ok(Self { endpoint, + configured_endpoint: raw_endpoint.to_string(), api_key: api_key.to_string(), config: None, api_key_index: None, @@ -299,6 +402,7 @@ impl ResolvedEndpoint { pub fn with_configuration(self, config: Option) -> Self { Self { endpoint: self.endpoint, + configured_endpoint: self.configured_endpoint, api_key: self.api_key, config, api_key_index: self.api_key_index, @@ -313,6 +417,13 @@ impl ResolvedEndpoint { &self.endpoint } + /// Returns the endpoint string as it was provided by configuration. + /// + /// Unlike [`ResolvedEndpoint::endpoint`], this is not rewritten with the data plane version prefix. + pub fn configured_endpoint(&self) -> &str { + &self.configured_endpoint + } + /// Returns the API key associated with the endpoint. 
/// /// If a [`GenericConfiguration`] has been configured, the API key will be queried from the configuration and @@ -409,15 +520,19 @@ impl ResolvedEndpoint { } } +fn endpoint_with_default_scheme(raw_endpoint: &str) -> String { + if !raw_endpoint.starts_with("http://") && !raw_endpoint.starts_with("https://") { + format!("https://{}", raw_endpoint) + } else { + raw_endpoint.to_string() + } +} + fn parse_and_normalize_endpoint(raw_endpoint: &str) -> Result { // Start out by parsing the given domain/endpoint, which means ensuring first that it has a scheme. // // If no scheme is present, we assume HTTPS. - let raw_endpoint = if !raw_endpoint.starts_with("http://") && !raw_endpoint.starts_with("https://") { - format!("https://{}", raw_endpoint) - } else { - raw_endpoint.to_string() - }; + let raw_endpoint = endpoint_with_default_scheme(raw_endpoint); let endpoint = Url::parse(&raw_endpoint).context(Parse { endpoint: raw_endpoint })?; @@ -503,7 +618,7 @@ fn calculate_resolved_endpoint( // // We also do a little bit of prefixing to get it in the right shape before creating the resolved endpoint. 
let base_domain = if site.is_empty() { DEFAULT_SITE } else { site }; - format!("app.{}", base_domain) + format!("https://app.{}", base_domain) } }; @@ -820,4 +935,71 @@ mod tests { .expect("error calculating override API endpoint"); assert_eq!(expected_endpoint, resolved.endpoint().to_string()); } + + #[test] + fn validation_headers_are_scoped_to_payload_family() { + let settings = EndpointV3Settings { + use_v3_series: true, + use_v3_sketches: false, + series_validation_mode: true, + sketches_validation_mode: false, + }; + + assert!(settings.should_receive_validation_headers(Some(MetricsPayloadInfo::v2_series()))); + assert!(settings.should_receive_validation_headers(Some(MetricsPayloadInfo::v3_series()))); + assert!(!settings.should_receive_validation_headers(Some(MetricsPayloadInfo::v2_sketches()))); + assert!(!settings.should_receive_validation_headers(Some(MetricsPayloadInfo::v3_sketches()))); + assert!(!settings.should_receive_validation_headers(None)); + } + + #[test] + fn v3_endpoint_matching_uses_configured_endpoint_before_version_prefix() { + let resolved = ResolvedEndpoint::from_raw_endpoint("https://app.datadoghq.com", "fake-api-key") + .expect("endpoint should resolve"); + + assert_eq!("https://app.datadoghq.com", resolved.configured_endpoint()); + assert_ne!("app.datadoghq.com", resolved.endpoint().host_str().unwrap()); + + let v3_series_endpoints = vec!["https://app.datadoghq.com".to_string()]; + let settings = EndpointV3Settings::from_endpoint_url( + resolved.configured_endpoint(), + &v3_series_endpoints, + &[], + false, + false, + ); + + assert!(settings.use_v3_series); + } + + #[test] + fn v3_endpoint_matching_is_endpoint_based() { + let v3_series_endpoints = vec!["https://app.us".to_string()]; + let settings = EndpointV3Settings::from_endpoint_url( + "https://app.us5.datadoghq.com", + &v3_series_endpoints, + &[], + false, + false, + ); + + assert!(!settings.use_v3_series); + } + + #[test] + fn 
v3_endpoint_matching_requires_exact_configured_endpoint() { + let v3_series_endpoints = vec!["app.datadoghq.com/".to_string()]; + let settings = + EndpointV3Settings::from_endpoint_url("https://app.datadoghq.com", &v3_series_endpoints, &[], false, false); + + assert!(!settings.use_v3_series); + } + + #[test] + fn calculated_site_endpoint_uses_agent_configured_endpoint_shape() { + let resolved = + calculate_resolved_endpoint(None, "datadoghq.com", "").expect("error calculating default API endpoint"); + + assert_eq!("https://app.datadoghq.com", resolved.configured_endpoint()); + } } diff --git a/lib/saluki-components/src/common/datadog/io.rs b/lib/saluki-components/src/common/datadog/io.rs index 61eaf11b195..bff99da9212 100644 --- a/lib/saluki-components/src/common/datadog/io.rs +++ b/lib/saluki-components/src/common/datadog/io.rs @@ -35,7 +35,7 @@ use tracing::{debug, error, warn}; use super::{ config::ForwarderConfiguration, - endpoints::{EndpointRoute, ResolvedEndpoint, RoutableEndpoint}, + endpoints::{EndpointRoute, EndpointV3Settings, ResolvedEndpoint, RoutableEndpoint}, middleware::{for_resolved_endpoint, with_allow_arbitrary_tags, with_version_info}, telemetry::{ComponentTelemetry, SharedTransactionQueueTelemetry, TransactionQueueTelemetry}, transaction::{Metadata, Transaction, TransactionBody}, @@ -287,7 +287,8 @@ async fn run_io_loop( // Listen for transactions to forward, and send a copy of each one to the matching endpoint I/O tasks. 
while let Some(transaction) = transactions_rx.recv().await { - let is_metrics_request = is_metrics_request_uri(transaction.request_uri()); + let is_metrics_request = + is_metrics_request_uri(transaction.request_uri(), config.v3_api().series.beta_route.as_str()); for endpoint_sender in &endpoint_txs { if !should_route_to_endpoint(is_metrics_request, has_metrics_primary, endpoint_sender.route) { continue; @@ -331,8 +332,8 @@ where tx: mpsc::Sender>, } -fn is_metrics_request_uri(uri: &Uri) -> bool { - METRIC_INTAKE_PATHS.contains(&uri.path()) +fn is_metrics_request_uri(uri: &Uri, v3_beta_series_route: &str) -> bool { + METRIC_INTAKE_PATHS.contains(&uri.path()) || uri.path() == v3_beta_series_route } fn should_route_to_endpoint(is_metrics_request: bool, has_metrics_primary: bool, route: EndpointRoute) -> bool { @@ -359,10 +360,23 @@ async fn run_endpoint_io_loop( { let queue_id = generate_retry_queue_id(context, &endpoint); let endpoint_url = endpoint.endpoint().to_string(); + let configured_endpoint = endpoint.configured_endpoint().to_string(); let endpoint_domain = endpoint.endpoint().origin().ascii_serialization(); + + // Match against the endpoint string from configuration, not the version-prefixed URL used for requests. + let v3_api = config.v3_api(); + let endpoint_v3_settings = EndpointV3Settings::from_endpoint_url( + &configured_endpoint, + &v3_api.series.endpoints, + &v3_api.sketches.endpoints, + v3_api.series.validate, + v3_api.sketches.validate, + ); debug!( endpoint_url, + configured_endpoint, num_workers = config.endpoint_concurrency(), + ?endpoint_v3_settings, "Starting endpoint I/O task." ); @@ -416,9 +430,27 @@ async fn run_endpoint_io_loop( select! { // Try and drain the next transaction from our channel, and push it into the pending transactions queue. 
maybe_txn = txns_rx.recv(), if !done => match maybe_txn { - Some(txn) => match pending_txns.push_high_priority(txn).await { - Ok(push_result) => track_queue_drops(&telemetry, &endpoint_domain, push_result), - Err(e) => error!(endpoint_url, error = %e, "Failed to enqueue transaction. Events may be permanently lost."), + Some(txn) => { + // Filter transactions based on endpoint's V3 settings and the transaction's payload info. + let payload_info = txn.metadata().payload_info; + if !endpoint_v3_settings.should_receive_payload(payload_info) { + debug!( + endpoint_url, + ?payload_info, + "Filtering out transaction based on endpoint V3 settings." + ); + continue; + } + let txn = if endpoint_v3_settings.should_receive_validation_headers(payload_info) { + txn + } else { + strip_metrics_validation_headers(txn) + }; + + match pending_txns.push_high_priority(txn).await { + Ok(push_result) => track_queue_drops(&telemetry, &endpoint_domain, push_result), + Err(e) => error!(endpoint_url, error = %e, "Failed to enqueue transaction. Events may be permanently lost."), + } }, None => { // Our transactions channel has been closed, so mark ourselves as done which will stop any further @@ -511,6 +543,18 @@ async fn run_endpoint_io_loop( task_barrier.wait().await; } +fn strip_metrics_validation_headers(txn: Transaction) -> Transaction +where + B: Buf + Clone, +{ + let (metadata, mut request) = txn.into_parts(); + let headers = request.headers_mut(); + headers.remove("X-Metrics-Request-ID"); + headers.remove("X-Metrics-Request-Seq"); + headers.remove("X-Metrics-Request-Len"); + Transaction::reassemble(metadata, request) +} + fn generate_retry_queue_id(context: ComponentContext, endpoint: &ResolvedEndpoint) -> String { // For additional endpoints we hash over the api_key_index (the stable position of this key in // the additional_endpoints config list) rather than the raw API key value. 
This means the queue @@ -845,23 +889,45 @@ mod tests { use super::*; use crate::common::datadog::endpoints::AdditionalEndpoints; use crate::common::datadog::transaction::{Metadata as TxnMetadata, Transaction}; - use crate::common::datadog::{METRICS_SERIES_V1_PATH, METRICS_SERIES_V2_PATH, METRICS_SKETCHES_PATH}; + use crate::common::datadog::{ + METRICS_SERIES_V1_PATH, METRICS_SERIES_V2_PATH, METRICS_SERIES_V3_BETA_PATH, METRICS_SERIES_V3_PATH, + METRICS_SKETCHES_PATH, METRICS_SKETCHES_V3_PATH, + }; fn uri(path: &'static str) -> Uri { Uri::from_static(path) } + fn is_metrics_request_path(path: &'static str) -> bool { + is_metrics_request_uri(&uri(path), METRICS_SERIES_V3_BETA_PATH) + } + fn forwarder_config_from_value(value: serde_json::Value) -> ForwarderConfiguration { serde_json::from_value(value).expect("ForwarderConfiguration should deserialize") } #[test] fn identifies_metrics_request_paths() { - assert!(is_metrics_request_uri(&uri(METRICS_SERIES_V1_PATH))); - assert!(is_metrics_request_uri(&uri(METRICS_SERIES_V2_PATH))); - assert!(is_metrics_request_uri(&uri(METRICS_SKETCHES_PATH))); - assert!(!is_metrics_request_uri(&uri("/api/v2/logs"))); - assert!(!is_metrics_request_uri(&uri("/api/v0.2/traces"))); + assert!(is_metrics_request_path(METRICS_SERIES_V1_PATH)); + assert!(is_metrics_request_path(METRICS_SERIES_V2_PATH)); + assert!(is_metrics_request_path(METRICS_SERIES_V3_PATH)); + assert!(is_metrics_request_path(METRICS_SERIES_V3_BETA_PATH)); + assert!(is_metrics_request_path(METRICS_SKETCHES_PATH)); + assert!(is_metrics_request_path(METRICS_SKETCHES_V3_PATH)); + assert!(!is_metrics_request_path("/api/v2/logs")); + assert!(!is_metrics_request_path("/api/v0.2/traces")); + } + + #[test] + fn identifies_configured_v3_beta_series_route_as_metrics_path() { + assert!(is_metrics_request_uri( + &uri("/custom/v3beta/series"), + "/custom/v3beta/series" + )); + assert!(!is_metrics_request_uri( + &uri("/custom/v3beta/series"), + METRICS_SERIES_V3_BETA_PATH + )); } #[test] 
diff --git a/lib/saluki-components/src/common/datadog/mod.rs b/lib/saluki-components/src/common/datadog/mod.rs index 9ca953a43e0..7e4088d6ba5 100644 --- a/lib/saluki-components/src/common/datadog/mod.rs +++ b/lib/saluki-components/src/common/datadog/mod.rs @@ -4,6 +4,7 @@ pub mod endpoints; pub mod io; pub mod middleware; pub mod obfuscation; +pub mod protocol; mod proxy; pub mod request_builder; mod retry; @@ -45,14 +46,29 @@ pub(crate) const METRICS_SERIES_V1_PATH: &str = "/api/v1/series"; /// V2 metric series intake path. pub(crate) const METRICS_SERIES_V2_PATH: &str = "/api/v2/series"; +/// V3 metric series intake path. +pub(crate) const METRICS_SERIES_V3_PATH: &str = "/api/intake/metrics/v3/series"; + +/// V3 beta metric series intake path. +pub(crate) const METRICS_SERIES_V3_BETA_PATH: &str = "/api/intake/metrics/v3beta/series"; + /// Metric sketches intake path. pub(crate) const METRICS_SKETCHES_PATH: &str = "/api/beta/sketches"; +/// V3 metric sketches intake path. +pub(crate) const METRICS_SKETCHES_V3_PATH: &str = "/api/intake/metrics/v3/sketches"; + /// Metric intake paths emitted by the encoder and matched by OPW routing. /// /// Keep these paths in one place so metric encoding and OPW routing don't drift. -pub(crate) const METRIC_INTAKE_PATHS: [&str; 3] = - [METRICS_SERIES_V1_PATH, METRICS_SERIES_V2_PATH, METRICS_SKETCHES_PATH]; +pub(crate) const METRIC_INTAKE_PATHS: [&str; 6] = [ + METRICS_SERIES_V1_PATH, + METRICS_SERIES_V2_PATH, + METRICS_SERIES_V3_PATH, + METRICS_SERIES_V3_BETA_PATH, + METRICS_SKETCHES_PATH, + METRICS_SKETCHES_V3_PATH, +]; /// Metadata tag used to store the sampling decision maker (`_dd.p.dm`). pub const TAG_DECISION_MAKER: &str = "_dd.p.dm"; diff --git a/lib/saluki-components/src/common/datadog/protocol.rs b/lib/saluki-components/src/common/datadog/protocol.rs new file mode 100644 index 00000000000..394f78ceaa7 --- /dev/null +++ b/lib/saluki-components/src/common/datadog/protocol.rs @@ -0,0 +1,170 @@ +//! 
Protocol version types for Datadog payloads. + +use facet::Facet; +use serde::{Deserialize, Serialize}; + +use super::METRICS_SERIES_V3_BETA_PATH; + +fn default_v3_beta_series_route() -> String { + METRICS_SERIES_V3_BETA_PATH.to_owned() +} + +/// The type of metrics payload. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub enum MetricsPayloadType { + /// Series metrics (counters, gauges, rates, sets). + Series, + + /// Sketch metrics (histograms, distributions). + Sketches, +} + +/// Protocol version for metrics payloads. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub enum MetricsProtocolVersion { + /// V2 protocol (legacy format). + V2, + + /// V3 protocol (columnar format). + V3, +} + +/// Combined payload info for metrics, encoding both protocol version and metric type. +/// +/// This is stored in `PayloadMetadata` and used by the I/O layer to filter payloads +/// based on endpoint V3 settings. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub struct MetricsPayloadInfo { + /// The protocol version (V2 or V3). + pub version: MetricsProtocolVersion, + + /// The type of metrics (series or sketches). + pub payload_type: MetricsPayloadType, +} + +impl MetricsPayloadInfo { + /// Creates a new V2 series payload info. + pub const fn v2_series() -> Self { + Self { + version: MetricsProtocolVersion::V2, + payload_type: MetricsPayloadType::Series, + } + } + + /// Creates a new V2 sketches payload info. + pub const fn v2_sketches() -> Self { + Self { + version: MetricsProtocolVersion::V2, + payload_type: MetricsPayloadType::Sketches, + } + } + + /// Creates a new V3 series payload info. + pub const fn v3_series() -> Self { + Self { + version: MetricsProtocolVersion::V3, + payload_type: MetricsPayloadType::Series, + } + } + + /// Creates a new V3 sketches payload info. 
+ pub const fn v3_sketches() -> Self { + Self { + version: MetricsProtocolVersion::V3, + payload_type: MetricsPayloadType::Sketches, + } + } + + /// Returns true if this is a sketch payload. + pub const fn is_sketch(&self) -> bool { + matches!(self.payload_type, MetricsPayloadType::Sketches) + } +} + +/// V3 API settings for a specific metric type (series or sketches). +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize, Facet)] +pub struct V3ApiSettings { + /// Endpoints that should receive V3 payloads for this metric type. + /// + /// Each entry should be a configured endpoint name, such as `https://app.datadoghq.com`. + /// If empty, no V3 payloads are generated for this metric type. + #[serde(default)] + pub endpoints: Vec<String>, + + /// Whether to also send V2 payloads to V3-enabled endpoints (validation mode). + /// + /// When true, endpoints in the `endpoints` list receive both V2 and V3 payloads. + /// When false, endpoints in the `endpoints` list receive only V3 payloads. + #[serde(default)] + pub validate: bool, + + /// Whether to use the beta V3 route for this metric type. + /// + /// This only applies to series metrics. Sketches always use the standard V3 sketches route. + #[serde(default)] + pub use_beta: bool, + + /// Beta V3 route to use when `use_beta` is enabled for series metrics. + /// + /// Defaults to `/api/intake/metrics/v3beta/series`. + #[serde(default = "default_v3_beta_series_route")] + pub beta_route: String, +} + +impl Default for V3ApiSettings { + fn default() -> Self { + Self { + endpoints: Vec::new(), + validate: false, + use_beta: false, + beta_route: default_v3_beta_series_route(), + } + } +} + +impl V3ApiSettings { + /// Returns true if V3 is enabled for any endpoint. + pub fn is_enabled(&self) -> bool { + !self.endpoints.is_empty() + } +} + +/// V3 API configuration for per-endpoint V3 support. 
+#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize, Facet)] +pub struct V3ApiConfig { + /// V3 settings for series metrics (counters, gauges, rates, sets). + #[serde(default)] + pub series: V3ApiSettings, + + /// V3 settings for sketch metrics (histograms, distributions). + #[serde(default)] + pub sketches: V3ApiSettings, + + /// Override compression level for V3 payloads. + /// + /// Defaults to `0`; any value of `0` or less uses the normal serializer compression level. + #[serde(default)] + pub compression_level: i32, +} + +impl V3ApiConfig { + /// Returns true if V3 is enabled for series metrics. + pub fn use_v3_series(&self) -> bool { + self.series.is_enabled() + } + + /// Returns true if V3 is enabled for sketch metrics. + pub fn use_v3_sketches(&self) -> bool { + self.sketches.is_enabled() + } + + /// Returns true if validation mode is enabled for series metrics. + pub fn use_v3_series_validate(&self) -> bool { + self.series.is_enabled() && self.series.validate + } + + /// Returns true if validation mode is enabled for sketch metrics. + pub fn use_v3_sketches_validate(&self) -> bool { + self.sketches.is_enabled() && self.sketches.validate + } +} diff --git a/lib/saluki-components/src/common/datadog/transaction.rs b/lib/saluki-components/src/common/datadog/transaction.rs index 5c946b0b9a5..e2258aeccb6 100644 --- a/lib/saluki-components/src/common/datadog/transaction.rs +++ b/lib/saluki-components/src/common/datadog/transaction.rs @@ -10,6 +10,8 @@ use pin_project::pin_project; use saluki_io::net::util::retry::{EventContainer, Retryable}; use serde::{ser::SerializeSeq as _, Deserialize, Serialize, Serializer}; +use super::protocol::MetricsPayloadInfo; + /// Data type for the body of `TransactionBody`. pub enum TransactionBodyData where @@ -178,6 +180,12 @@ pub struct Metadata { /// Number of metric data points represented by this transaction. 
#[serde(default)] pub data_point_count: usize, + + /// Payload info containing protocol version and metric type, if applicable. + /// + /// This is `Some` for metrics payloads and `None` for non-metrics payloads. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub payload_info: Option<MetricsPayloadInfo>, } impl Metadata { @@ -186,6 +194,7 @@ impl Metadata { Self { event_count, data_point_count, + payload_info: None, } } } diff --git a/lib/saluki-components/src/config_registry/datadog/encoders.rs b/lib/saluki-components/src/config_registry/datadog/encoders.rs index 3a7da6ae683..5075813ce2f 100644 --- a/lib/saluki-components/src/config_registry/datadog/encoders.rs +++ b/lib/saluki-components/src/config_registry/datadog/encoders.rs @@ -184,4 +184,109 @@ crate::declare_annotations! { test_json: None, pipeline_affinity: PipelineAffinity::Pipelines(&[Pipeline::Traces]), }; + + /// `serializer_experimental_use_v3_api.compression_level`—compression level for V3 payloads. + SERIALIZER_EXPERIMENTAL_USE_V3_API_COMPRESSION_LEVEL = SalukiAnnotation { + schema: &schema::SERIALIZER_EXPERIMENTAL_USE_V3_API_COMPRESSION_LEVEL, + support_level: SupportLevel::Full, + additional_yaml_paths: &[], + env_var_override: None, + used_by: &[ + structs::DATADOG_METRICS_CONFIGURATION, + structs::FORWARDER_CONFIGURATION, + ], + value_type_override: None, + test_json: None, + pipeline_affinity: PipelineAffinity::Pipelines(&[Pipeline::DogStatsD, Pipeline::Checks, Pipeline::Traces]), + }; + + /// `serializer_experimental_use_v3_api.series.beta_route`—intake route for V3 beta series payloads. 
+ SERIALIZER_EXPERIMENTAL_USE_V3_API_SERIES_BETA_ROUTE = SalukiAnnotation { + schema: &schema::SERIALIZER_EXPERIMENTAL_USE_V3_API_SERIES_BETA_ROUTE, + support_level: SupportLevel::Full, + additional_yaml_paths: &[], + env_var_override: None, + used_by: &[ + structs::DATADOG_METRICS_CONFIGURATION, + structs::FORWARDER_CONFIGURATION, + ], + value_type_override: None, + test_json: None, + pipeline_affinity: PipelineAffinity::Pipelines(&[Pipeline::DogStatsD, Pipeline::Checks, Pipeline::Traces]), + }; + + /// `serializer_experimental_use_v3_api.series.endpoints`—endpoints that receive V3 series payloads. + SERIALIZER_EXPERIMENTAL_USE_V3_API_SERIES_ENDPOINTS = SalukiAnnotation { + schema: &schema::SERIALIZER_EXPERIMENTAL_USE_V3_API_SERIES_ENDPOINTS, + support_level: SupportLevel::Full, + additional_yaml_paths: &[], + env_var_override: None, + used_by: &[ + structs::DATADOG_METRICS_CONFIGURATION, + structs::FORWARDER_CONFIGURATION, + ], + value_type_override: None, + test_json: None, + pipeline_affinity: PipelineAffinity::Pipelines(&[Pipeline::DogStatsD, Pipeline::Checks, Pipeline::Traces]), + }; + + /// `serializer_experimental_use_v3_api.series.use_beta`—whether to send V3 series payloads to the beta route. + SERIALIZER_EXPERIMENTAL_USE_V3_API_SERIES_USE_BETA = SalukiAnnotation { + schema: &schema::SERIALIZER_EXPERIMENTAL_USE_V3_API_SERIES_USE_BETA, + support_level: SupportLevel::Full, + additional_yaml_paths: &[], + env_var_override: None, + used_by: &[ + structs::DATADOG_METRICS_CONFIGURATION, + structs::FORWARDER_CONFIGURATION, + ], + value_type_override: None, + test_json: None, + pipeline_affinity: PipelineAffinity::Pipelines(&[Pipeline::DogStatsD, Pipeline::Checks, Pipeline::Traces]), + }; + + /// `serializer_experimental_use_v3_api.series.validate`—enable V2/V3 validation mode for series payloads. 
+ SERIALIZER_EXPERIMENTAL_USE_V3_API_SERIES_VALIDATE = SalukiAnnotation { + schema: &schema::SERIALIZER_EXPERIMENTAL_USE_V3_API_SERIES_VALIDATE, + support_level: SupportLevel::Full, + additional_yaml_paths: &[], + env_var_override: None, + used_by: &[ + structs::DATADOG_METRICS_CONFIGURATION, + structs::FORWARDER_CONFIGURATION, + ], + value_type_override: None, + test_json: None, + pipeline_affinity: PipelineAffinity::Pipelines(&[Pipeline::DogStatsD, Pipeline::Checks, Pipeline::Traces]), + }; + + /// `serializer_experimental_use_v3_api.sketches.endpoints`—endpoints that receive V3 sketch payloads. + SERIALIZER_EXPERIMENTAL_USE_V3_API_SKETCHES_ENDPOINTS = SalukiAnnotation { + schema: &schema::SERIALIZER_EXPERIMENTAL_USE_V3_API_SKETCHES_ENDPOINTS, + support_level: SupportLevel::Full, + additional_yaml_paths: &[], + env_var_override: None, + used_by: &[ + structs::DATADOG_METRICS_CONFIGURATION, + structs::FORWARDER_CONFIGURATION, + ], + value_type_override: None, + test_json: None, + pipeline_affinity: PipelineAffinity::Pipelines(&[Pipeline::DogStatsD, Pipeline::Checks, Pipeline::Traces]), + }; + + /// `serializer_experimental_use_v3_api.sketches.validate`—enable V2/V3 validation mode for sketch payloads. 
+ SERIALIZER_EXPERIMENTAL_USE_V3_API_SKETCHES_VALIDATE = SalukiAnnotation { + schema: &schema::SERIALIZER_EXPERIMENTAL_USE_V3_API_SKETCHES_VALIDATE, + support_level: SupportLevel::Full, + additional_yaml_paths: &[], + env_var_override: None, + used_by: &[ + structs::DATADOG_METRICS_CONFIGURATION, + structs::FORWARDER_CONFIGURATION, + ], + value_type_override: None, + test_json: None, + pipeline_affinity: PipelineAffinity::Pipelines(&[Pipeline::DogStatsD, Pipeline::Checks, Pipeline::Traces]), + }; } diff --git a/lib/saluki-components/src/config_registry/datadog/unsupported.rs b/lib/saluki-components/src/config_registry/datadog/unsupported.rs index 2cb1e030a4f..77f132a5d44 100644 --- a/lib/saluki-components/src/config_registry/datadog/unsupported.rs +++ b/lib/saluki-components/src/config_registry/datadog/unsupported.rs @@ -177,10 +177,10 @@ crate::declare_annotations! { pipeline_affinity: PipelineAffinity::CrossCutting, }; - /// `serializer_experimental_use_v3_api.compression_level` - V3 API compression level. - SERIALIZER_EXPERIMENTAL_USE_V3_API_COMPRESSION_LEVEL = SalukiAnnotation { - schema: &schema::SERIALIZER_EXPERIMENTAL_USE_V3_API_COMPRESSION_LEVEL, - // V3 metrics API not implemented. #1468 + /// `serializer_experimental_use_v3_api.series.shadow_sample_rate` - V3 API series shadow traffic sample rate. + SERIALIZER_EXPERIMENTAL_USE_V3_API_SERIES_SHADOW_SAMPLE_RATE = SalukiAnnotation { + schema: &schema::SERIALIZER_EXPERIMENTAL_USE_V3_API_SERIES_SHADOW_SAMPLE_RATE, + // V3 shadow traffic not implemented. support_level: SupportLevel::Incompatible(Severity::Low), additional_yaml_paths: &[], env_var_override: None, @@ -191,52 +191,10 @@ crate::declare_annotations! { pipeline_affinity: PipelineAffinity::Pipelines(&[Pipeline::DogStatsD, Pipeline::Checks, Pipeline::Traces]), }; - /// `serializer_experimental_use_v3_api.series.endpoints` - V3 API series endpoints. 
- SERIALIZER_EXPERIMENTAL_USE_V3_API_SERIES_ENDPOINTS = SalukiAnnotation { - schema: &schema::SERIALIZER_EXPERIMENTAL_USE_V3_API_SERIES_ENDPOINTS, - // V3 metrics API not implemented. #1468 - support_level: SupportLevel::Incompatible(Severity::Low), - additional_yaml_paths: &[], - env_var_override: None, - used_by: &[], - value_type_override: None, - test_json: None, - // Metrics encoder (dd_metrics_encode) is used by DogStatsD, Checks, and OTLP native (Traces active); APM traces use a separate encoder. - pipeline_affinity: PipelineAffinity::Pipelines(&[Pipeline::DogStatsD, Pipeline::Checks, Pipeline::Traces]), - }; - - /// `serializer_experimental_use_v3_api.series.validate` - V3 API series validation. - SERIALIZER_EXPERIMENTAL_USE_V3_API_SERIES_VALIDATE = SalukiAnnotation { - schema: &schema::SERIALIZER_EXPERIMENTAL_USE_V3_API_SERIES_VALIDATE, - // V3 metrics API not implemented. #1468 - support_level: SupportLevel::Incompatible(Severity::Low), - additional_yaml_paths: &[], - env_var_override: None, - used_by: &[], - value_type_override: None, - test_json: None, - // Metrics encoder (dd_metrics_encode) is used by DogStatsD, Checks, and OTLP native (Traces active); APM traces use a separate encoder. - pipeline_affinity: PipelineAffinity::Pipelines(&[Pipeline::DogStatsD, Pipeline::Checks, Pipeline::Traces]), - }; - - /// `serializer_experimental_use_v3_api.sketches.endpoints` - V3 API sketches endpoints. - SERIALIZER_EXPERIMENTAL_USE_V3_API_SKETCHES_ENDPOINTS = SalukiAnnotation { - schema: &schema::SERIALIZER_EXPERIMENTAL_USE_V3_API_SKETCHES_ENDPOINTS, - // V3 metrics API not implemented. #1468 - support_level: SupportLevel::Incompatible(Severity::Low), - additional_yaml_paths: &[], - env_var_override: None, - used_by: &[], - value_type_override: None, - test_json: None, - // Metrics encoder (dd_metrics_encode) is used by DogStatsD, Checks, and OTLP native (Traces active); APM traces use a separate encoder. 
- pipeline_affinity: PipelineAffinity::Pipelines(&[Pipeline::DogStatsD, Pipeline::Checks, Pipeline::Traces]), - }; - - /// `serializer_experimental_use_v3_api.sketches.validate` - V3 API sketches validation. - SERIALIZER_EXPERIMENTAL_USE_V3_API_SKETCHES_VALIDATE = SalukiAnnotation { - schema: &schema::SERIALIZER_EXPERIMENTAL_USE_V3_API_SKETCHES_VALIDATE, - // V3 metrics API not implemented. #1468 + /// `serializer_experimental_use_v3_api.series.shadow_sites` - V3 API series shadow traffic sites. + SERIALIZER_EXPERIMENTAL_USE_V3_API_SERIES_SHADOW_SITES = SalukiAnnotation { + schema: &schema::SERIALIZER_EXPERIMENTAL_USE_V3_API_SERIES_SHADOW_SITES, + // V3 shadow traffic not implemented. support_level: SupportLevel::Incompatible(Severity::Low), additional_yaml_paths: &[], env_var_override: None, diff --git a/lib/saluki-components/src/encoders/datadog/metrics/endpoint.rs b/lib/saluki-components/src/encoders/datadog/metrics/endpoint.rs new file mode 100644 index 00000000000..a567848ec03 --- /dev/null +++ b/lib/saluki-components/src/encoders/datadog/metrics/endpoint.rs @@ -0,0 +1,64 @@ +use saluki_context::tags::SharedTagSet; +use saluki_core::data_model::event::metric::{Metric, MetricValues}; +use saluki_io::compression::CompressionScheme; + +/// Metrics intake endpoint. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum MetricsEndpoint { + /// V1 series metrics, encoded as JSON and sent to `/api/v1/series`. + /// + /// Includes counters, gauges, rates, and sets. Selected when `use_v2_api_series` is `false`. + SeriesV1, + + /// V2 series metrics, encoded as Protocol Buffers and sent to `/api/v2/series`. + /// + /// Includes counters, gauges, rates, and sets. The default series encoding. + SeriesV2, + + /// Sketch metrics, encoded as Protocol Buffers and sent to `/api/beta/sketches`. + /// + /// Includes histograms and distributions. Always uses the V2 endpoint regardless of `use_v2_api_series`. 
+ Sketches, +} + +impl MetricsEndpoint { + /// Creates a new `MetricsEndpoint` from the given metric. + pub fn from_metric(metric: &Metric) -> Self { + match metric.values() { + MetricValues::Counter(..) | MetricValues::Rate(..) | MetricValues::Gauge(..) | MetricValues::Set(..) => { + Self::SeriesV2 + } + MetricValues::Histogram(..) | MetricValues::Distribution(..) => Self::Sketches, + } + } +} + +pub struct EndpointConfiguration { + compression_scheme: CompressionScheme, + max_metrics_per_payload: usize, + additional_tags: SharedTagSet, +} + +impl EndpointConfiguration { + pub fn new( + compression_scheme: CompressionScheme, max_metrics_per_payload: usize, additional_tags: Option<SharedTagSet>, + ) -> Self { + Self { + compression_scheme, + max_metrics_per_payload, + additional_tags: additional_tags.unwrap_or_default(), + } + } + + pub fn compression_scheme(&self) -> CompressionScheme { + self.compression_scheme + } + + pub fn max_metrics_per_payload(&self) -> usize { + self.max_metrics_per_payload + } + + pub fn additional_tags(&self) -> &SharedTagSet { + &self.additional_tags + } +} diff --git a/lib/saluki-components/src/encoders/datadog/metrics/mod.rs b/lib/saluki-components/src/encoders/datadog/metrics/mod.rs index 80864151619..4b64edff5ad 100644 --- a/lib/saluki-components/src/encoders/datadog/metrics/mod.rs +++ b/lib/saluki-components/src/encoders/datadog/metrics/mod.rs @@ -1,13 +1,15 @@ -use std::{fmt, num::NonZeroU64, time::Duration}; +use std::{collections::VecDeque, ops::Range, time::Duration}; use async_trait::async_trait; -use datadog_protos::metrics as proto; use ddsketch::DDSketch; use facet::Facet; -use http::{uri::PathAndQuery, HeaderValue, Method, Uri}; -use protobuf::{rt::WireType, CodedOutputStream, Enum as _}; +use http::{HeaderValue, Method, Request}; +use protobuf::{rt::WireType, CodedOutputStream}; use resource_accounting::{MemoryBounds, MemoryBoundsBuilder}; -use saluki_common::{iter::ReusableDeduplicator, task::HandleExt as _}; +use saluki_common::{ +
buf::{ChunkedBytesBuffer, FrozenChunkedBytesBuffer}, + task::HandleExt as _, +}; use saluki_config::GenericConfiguration; use saluki_context::tags::{SharedTagSet, Tag}; use saluki_core::{ @@ -23,78 +25,40 @@ use saluki_core::{ topology::{EventsBuffer, PayloadsBuffer}, }; use saluki_error::{generic_error, ErrorContext as _, GenericError}; -use saluki_io::compression::CompressionScheme; +use saluki_io::compression::{CompressionScheme, Compressor}; use saluki_metrics::MetricsBuilder; use serde::Deserialize; -use serde_json::{Map as JsonMap, Number as JsonNumber, Value as JsonValue}; -use tokio::{select, sync::mpsc, time::sleep}; +use tokio::{io::AsyncWriteExt as _, select, sync::mpsc, time::sleep}; use tracing::{debug, error, warn}; - -use crate::common::datadog::{ - clamp_payload_limits, - io::RB_BUFFER_CHUNK_SIZE, - request_builder::{EndpointEncoder, RequestBuilder}, - telemetry::ComponentTelemetry, - DEFAULT_SERIALIZER_COMPRESSED_SIZE_LIMIT, DEFAULT_SERIALIZER_UNCOMPRESSED_SIZE_LIMIT, METRICS_SERIES_V1_PATH, - METRICS_SERIES_V2_PATH, METRICS_SKETCHES_PATH, +use uuid::Uuid; + +use self::v3::{V3EncodedRequest, V3PayloadLimits, V3PayloadRequest}; +use crate::{ + common::datadog::{ + clamp_payload_limits, + io::RB_BUFFER_CHUNK_SIZE, + protocol::{MetricsPayloadInfo, V3ApiConfig}, + request_builder::RequestBuilder, + telemetry::ComponentTelemetry, + DEFAULT_SERIALIZER_COMPRESSED_SIZE_LIMIT, DEFAULT_SERIALIZER_UNCOMPRESSED_SIZE_LIMIT, METRICS_SERIES_V3_PATH, + METRICS_SKETCHES_V3_PATH, + }, + encoders::datadog::metrics::v2::MetricsEndpointEncoder, }; -const SERIES_V2_COMPRESSED_SIZE_LIMIT: usize = 512_000; // 500 KiB -const SERIES_V2_UNCOMPRESSED_SIZE_LIMIT: usize = 5_242_880; // 5 MiB +mod endpoint; +use self::endpoint::{EndpointConfiguration, MetricsEndpoint}; -// V1 series JSON endpoint limits match the Datadog Agent's generic serializer defaults. 
-const SERIES_V1_COMPRESSED_SIZE_LIMIT: usize = DEFAULT_SERIALIZER_COMPRESSED_SIZE_LIMIT; -const SERIES_V1_UNCOMPRESSED_SIZE_LIMIT: usize = DEFAULT_SERIALIZER_UNCOMPRESSED_SIZE_LIMIT; +mod v1; +mod v2; +mod v3; const DEFAULT_SERIALIZER_COMPRESSOR_KIND: &str = "zstd"; +const V3_SERIES_ENDPOINT_URI: &str = METRICS_SERIES_V3_PATH; +const V3_SKETCHES_ENDPOINT_URI: &str = METRICS_SKETCHES_V3_PATH; -// Protocol Buffers field numbers for series and sketch payload messages. -// -// These field numbers come from the Protocol Buffers definitions in `lib/datadog-protos/proto/agent_payload.proto`. -const RESOURCES_TYPE_FIELD_NUMBER: u32 = 1; -const RESOURCES_NAME_FIELD_NUMBER: u32 = 2; - -const METADATA_ORIGIN_FIELD_NUMBER: u32 = 1; - -const ORIGIN_ORIGIN_PRODUCT_FIELD_NUMBER: u32 = 4; -const ORIGIN_ORIGIN_CATEGORY_FIELD_NUMBER: u32 = 5; -const ORIGIN_ORIGIN_SERVICE_FIELD_NUMBER: u32 = 6; - -const METRIC_POINT_VALUE_FIELD_NUMBER: u32 = 1; -const METRIC_POINT_TIMESTAMP_FIELD_NUMBER: u32 = 2; - -const DOGSKETCH_TS_FIELD_NUMBER: u32 = 1; -const DOGSKETCH_CNT_FIELD_NUMBER: u32 = 2; -const DOGSKETCH_MIN_FIELD_NUMBER: u32 = 3; -const DOGSKETCH_MAX_FIELD_NUMBER: u32 = 4; -const DOGSKETCH_AVG_FIELD_NUMBER: u32 = 5; -const DOGSKETCH_SUM_FIELD_NUMBER: u32 = 6; -const DOGSKETCH_K_FIELD_NUMBER: u32 = 7; -const DOGSKETCH_N_FIELD_NUMBER: u32 = 8; - -const SERIES_RESOURCES_FIELD_NUMBER: u32 = 1; -const SERIES_METRIC_FIELD_NUMBER: u32 = 2; -const SERIES_TAGS_FIELD_NUMBER: u32 = 3; -const SERIES_POINTS_FIELD_NUMBER: u32 = 4; -const SERIES_TYPE_FIELD_NUMBER: u32 = 5; -const SERIES_UNIT_FIELD_NUMBER: u32 = 6; -const SERIES_SOURCE_TYPE_NAME_FIELD_NUMBER: u32 = 7; -const SERIES_INTERVAL_FIELD_NUMBER: u32 = 8; -const SERIES_METADATA_FIELD_NUMBER: u32 = 9; - -const SKETCH_METRIC_FIELD_NUMBER: u32 = 1; -const SKETCH_HOST_FIELD_NUMBER: u32 = 2; -const SKETCH_TAGS_FIELD_NUMBER: u32 = 4; -const SKETCH_DOGSKETCHES_FIELD_NUMBER: u32 = 7; -const SKETCH_METADATA_FIELD_NUMBER: u32 = 8; - -static 
CONTENT_TYPE_PROTOBUF: HeaderValue = HeaderValue::from_static("application/x-protobuf"); -static CONTENT_TYPE_JSON: HeaderValue = HeaderValue::from_static("application/json"); - -// JSON framing for the V1 series payload, which wraps the array of `Serie` objects in a top-level object. -const SERIES_V1_PAYLOAD_PREFIX: &[u8] = b"{\"series\":["; -const SERIES_V1_PAYLOAD_SUFFIX: &[u8] = b"]}"; -const SERIES_V1_INPUT_SEPARATOR: &[u8] = b","; +// V3 keeps the Datadog Agent's point-count limit as an internal bound, not user-facing ADP configuration. +const SERIES_V3_POINTS_PER_PAYLOAD_LIMIT: usize = 10_000; const fn default_max_metrics_per_payload() -> usize { 10_000 @@ -109,11 +73,11 @@ const fn default_max_uncompressed_payload_size() -> usize { } const fn default_max_series_payload_size() -> usize { - SERIES_V2_COMPRESSED_SIZE_LIMIT + v2::SERIES_V2_COMPRESSED_SIZE_LIMIT } const fn default_max_series_uncompressed_payload_size() -> usize { - SERIES_V2_UNCOMPRESSED_SIZE_LIMIT + v2::SERIES_V2_UNCOMPRESSED_SIZE_LIMIT } const fn default_flush_timeout_secs() -> u64 { @@ -136,12 +100,45 @@ const fn default_log_payloads() -> bool { false } +/// Encoding mode for a metrics endpoint. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum MetricsEncoderMode { + /// Send V2 payloads only. + V2Only, + /// V3 is enabled for at least one endpoint; generate tagged V2 and V3 payloads so each endpoint + /// receives the protocol version configured for it. + V3Enabled, + /// Send both V2 and V3 payloads simultaneously with a shared batch ID for backend validation. 
+ Validation, +} + +impl MetricsEncoderMode { + fn from_config(use_v3: bool, validate: bool) -> Self { + match (use_v3, validate) { + (false, _) => Self::V2Only, + (true, false) => Self::V3Enabled, + (true, true) => Self::Validation, + } + } + + fn needs_v3(self) -> bool { + matches!(self, Self::V3Enabled | Self::Validation) + } + + fn needs_batch_id(self) -> bool { + matches!(self, Self::Validation) + } + + fn needs_tagging(self) -> bool { + matches!(self, Self::V3Enabled | Self::Validation) + } +} + /// Datadog Metrics encoder. /// /// Generates Datadog metrics payloads for the Datadog platform. #[derive(Clone, Deserialize, Facet)] #[cfg_attr(test, derive(Debug, PartialEq, serde::Serialize))] -#[allow(dead_code)] pub struct DatadogMetricsConfiguration { /// Maximum number of input metrics to encode into a single request payload. /// @@ -213,7 +210,7 @@ pub struct DatadogMetricsConfiguration { /// Flush timeout for pending requests, in seconds. /// - /// When the destination has written metrics to the in-flight request payload, but it hasn't yet reached the + /// When the destination has written metrics to the in-flight request payload, but it has not yet reached the /// payload size limits that would force the payload to be flushed, the destination will wait for a period of time /// before flushing the in-flight request payload. This allows for the possibility of other events to be processed /// and written into the request payload, thereby maximizing the payload size and reducing the number of requests @@ -263,6 +260,12 @@ pub struct DatadogMetricsConfiguration { #[serde(default, skip)] #[facet(opaque)] additional_tags: Option, + + /// V3 API configuration for per-endpoint V3 support. + /// + /// Configures which endpoints receive V3 payloads and whether validation mode is enabled. 
+ #[serde(rename = "serializer_experimental_use_v3_api", default)] + v3_api: V3ApiConfig, } impl DatadogMetricsConfiguration { @@ -273,7 +276,6 @@ impl DatadogMetricsConfiguration { /// Sets additional tags to be applied uniformly to all metrics forwarded by this destination. pub fn with_additional_tags(mut self, additional_tags: SharedTagSet) -> Self { - // Add the additional tags to the forwarder configuration. self.additional_tags = Some(additional_tags); self } @@ -292,25 +294,47 @@ impl EncoderBuilder for DatadogMetricsConfiguration { async fn build(&self, context: ComponentContext) -> Result, GenericError> { let metrics_builder = MetricsBuilder::from_component_context(&context); let telemetry = ComponentTelemetry::from_builder(&metrics_builder); - let compression_scheme = CompressionScheme::new(&self.compressor_kind, self.zstd_compressor_level); - // Create our request builders. - let series_endpoint = if self.use_v2_api_series { - MetricsEndpoint::SeriesV2 + let v2_compression_scheme = CompressionScheme::new(&self.compressor_kind, self.zstd_compressor_level); + let v3_compression_scheme = if self.v3_api.compression_level > 0 { + CompressionScheme::new(&self.compressor_kind, self.v3_api.compression_level) } else { - MetricsEndpoint::SeriesV1 + v2_compression_scheme }; - let mut series_encoder = MetricsEndpointEncoder::from_endpoint(series_endpoint); - let mut sketches_encoder = MetricsEndpointEncoder::from_endpoint(MetricsEndpoint::Sketches); + let v3_series_endpoint_uri = if self.v3_api.series.use_beta { + self.v3_api.series.beta_route.clone() + } else { + V3_SERIES_ENDPOINT_URI.to_string() + }; + let v3_payload_limits = V3PayloadLimits::new( + self.max_series_payload_size, + self.max_series_uncompressed_payload_size, + self.max_metrics_per_payload, + SERIES_V3_POINTS_PER_PAYLOAD_LIMIT, + ); - if let Some(additional_tags) = self.additional_tags.as_ref() { - series_encoder = series_encoder.with_additional_tags(additional_tags.clone()); - sketches_encoder = 
sketches_encoder.with_additional_tags(additional_tags.clone()); - } + let v2_endpoint_config = EndpointConfiguration::new( + v2_compression_scheme, + self.max_metrics_per_payload, + self.additional_tags.clone(), + ); + let v3_endpoint_config = EndpointConfiguration::new( + v3_compression_scheme, + self.max_metrics_per_payload, + self.additional_tags.clone(), + ); - let mut series_rb = RequestBuilder::new(series_encoder, compression_scheme, RB_BUFFER_CHUNK_SIZE).await?; - series_rb.with_max_inputs_per_payload(self.max_metrics_per_payload); + // Derive the encoding mode for each metric type from the configuration. + let series_mode = + MetricsEncoderMode::from_config(self.v3_api.use_v3_series(), self.v3_api.use_v3_series_validate()); + let sketches_mode = + MetricsEncoderMode::from_config(self.v3_api.use_v3_sketches(), self.v3_api.use_v3_sketches_validate()); + let series_endpoint = if self.use_v2_api_series { + MetricsEndpoint::SeriesV2 + } else { + MetricsEndpoint::SeriesV1 + }; let generic_payload_limits = clamp_payload_limits( self.max_uncompressed_payload_size, self.max_payload_size, @@ -321,18 +345,24 @@ impl EncoderBuilder for DatadogMetricsConfiguration { clamp_payload_limits( self.max_series_uncompressed_payload_size, self.max_series_payload_size, - SERIES_V2_UNCOMPRESSED_SIZE_LIMIT, - SERIES_V2_COMPRESSED_SIZE_LIMIT, + v2::SERIES_V2_UNCOMPRESSED_SIZE_LIMIT, + v2::SERIES_V2_COMPRESSED_SIZE_LIMIT, ) } else { generic_payload_limits }; - series_rb.with_len_limits(series_uncompressed_limit, series_compressed_limit)?; + let mut v2_series_builder = v2::create_v2_request_builder(series_endpoint, &v2_endpoint_config) + .await + .error_context("Failed to create V2 series request builder.")?; + v2_series_builder.with_len_limits(series_uncompressed_limit, series_compressed_limit)?; + let v2_series_builder = Some(v2_series_builder); - let mut sketches_rb = RequestBuilder::new(sketches_encoder, compression_scheme, RB_BUFFER_CHUNK_SIZE).await?; - 
sketches_rb.with_max_inputs_per_payload(self.max_metrics_per_payload); let (sketches_uncompressed_limit, sketches_compressed_limit) = generic_payload_limits; - sketches_rb.with_len_limits(sketches_uncompressed_limit, sketches_compressed_limit)?; + let mut v2_sketch_builder = v2::create_v2_request_builder(MetricsEndpoint::Sketches, &v2_endpoint_config) + .await + .error_context("Failed to create V2 sketches request builder.")?; + v2_sketch_builder.with_len_limits(sketches_uncompressed_limit, sketches_compressed_limit)?; + let v2_sketch_builder = Some(v2_sketch_builder); let flush_timeout = match self.flush_timeout_secs { // We always give ourselves a minimum flush timeout of 10ms to allow for some very minimal amount of @@ -341,9 +371,24 @@ impl EncoderBuilder for DatadogMetricsConfiguration { secs => Duration::from_secs(secs), }; + if series_mode.needs_v3() || sketches_mode.needs_v3() { + debug!( + ?series_mode, + ?sketches_mode, + v3_series_endpoints = ?self.v3_api.series.endpoints, + v3_sketches_endpoints = ?self.v3_api.sketches.endpoints, + "V3 encoding support is enabled." 
+ ); + } + Ok(Box::new(DatadogMetrics { - series_rb, - sketches_rb, + v2_series_builder, + v2_sketch_builder, + series_mode, + sketches_mode, + v3_endpoint_config, + v3_payload_limits, + v3_series_endpoint_uri, telemetry, flush_timeout, log_payloads: self.log_payloads, @@ -376,8 +421,13 @@ impl MemoryBounds for DatadogMetricsConfiguration { } pub struct DatadogMetrics { - series_rb: RequestBuilder, - sketches_rb: RequestBuilder, + v2_series_builder: Option>, + v2_sketch_builder: Option>, + series_mode: MetricsEncoderMode, + sketches_mode: MetricsEncoderMode, + v3_endpoint_config: EndpointConfiguration, + v3_payload_limits: V3PayloadLimits, + v3_series_endpoint_uri: String, telemetry: ComponentTelemetry, flush_timeout: Duration, log_payloads: bool, @@ -387,8 +437,13 @@ pub struct DatadogMetrics { impl Encoder for DatadogMetrics { async fn run(mut self: Box, mut context: EncoderContext) -> Result<(), GenericError> { let Self { - series_rb, - sketches_rb, + v2_series_builder, + v2_sketch_builder, + series_mode, + sketches_mode, + v3_endpoint_config, + v3_payload_limits, + v3_series_endpoint_uri, telemetry, flush_timeout, log_payloads, @@ -400,8 +455,13 @@ impl Encoder for DatadogMetrics { let (events_tx, events_rx) = mpsc::channel(8); let (payloads_tx, mut payloads_rx) = mpsc::channel(8); let request_builder_fut = run_request_builder( - series_rb, - sketches_rb, + v2_series_builder, + v2_sketch_builder, + series_mode, + sketches_mode, + v3_endpoint_config, + v3_payload_limits, + v3_series_endpoint_uri, telemetry, events_rx, payloads_tx, @@ -460,16 +520,47 @@ impl Encoder for DatadogMetrics { } } +/// Logs the decoded contents of a metric prior to encoding. +/// +/// This logs the metric object itself, not the encoded JSON/protobuf HTTP body. +fn log_metric_payload(metric: &Metric) { + match metric.values() { + MetricValues::Counter(..) | MetricValues::Rate(..) | MetricValues::Gauge(..) | MetricValues::Set(..) 
=> { + debug!(?metric, "Flushing series metric.") + } + MetricValues::Histogram(..) | MetricValues::Distribution(..) => { + debug!(?metric, "Flushing sketch metric.") + } + } +} + +#[allow(clippy::too_many_arguments)] async fn run_request_builder( - mut series_request_builder: RequestBuilder, - mut sketches_request_builder: RequestBuilder, telemetry: ComponentTelemetry, - mut events_rx: mpsc::Receiver, payloads_tx: mpsc::Sender, flush_timeout: Duration, - log_payloads: bool, + mut v2_series_builder: Option>, + mut v2_sketch_builder: Option>, series_mode: MetricsEncoderMode, + sketches_mode: MetricsEncoderMode, v3_endpoint_config: EndpointConfiguration, v3_payload_limits: V3PayloadLimits, + v3_series_endpoint_uri: String, telemetry: ComponentTelemetry, mut events_rx: mpsc::Receiver, + mut payloads_tx: mpsc::Sender, flush_timeout: Duration, log_payloads: bool, ) -> Result<(), GenericError> { let mut pending_flush = false; let pending_flush_timeout = sleep(flush_timeout); tokio::pin!(pending_flush_timeout); + let mut v3_series_metrics = series_mode.needs_v3().then(Vec::::new); + let mut v3_sketch_metrics = sketches_mode.needs_v3().then(Vec::::new); + + let mut series_batch_id = None; + let mut sketches_batch_id = None; + + let tag_series = series_mode.needs_tagging(); + let tag_sketches = sketches_mode.needs_tagging(); + let v3_flush_context = V3FlushContext { + endpoint_config: &v3_endpoint_config, + payload_limits: v3_payload_limits, + series_endpoint_uri: &v3_series_endpoint_uri, + telemetry: &telemetry, + }; + loop { select! { Some(event_buffer) = events_rx.recv() => { @@ -483,62 +574,103 @@ async fn run_request_builder( log_metric_payload(&metric); } - // Series metrics (counters, gauges, rates, sets) and sketch metrics (histograms, distributions) - // route to their respective request builders. Whether the series builder targets the V1 or V2 - // intake is decided once at builder time based on `use_v2_api_series`. 
- let request_builder = match metric.values() { - MetricValues::Counter(..) - | MetricValues::Rate(..) - | MetricValues::Gauge(..) - | MetricValues::Set(..) => &mut series_request_builder, - MetricValues::Histogram(..) | MetricValues::Distribution(..) => &mut sketches_request_builder, + // Figure out which endpoint the metric belongs to, and grab the relevant V2 builder/V3 storage. + let endpoint = MetricsEndpoint::from_metric(&metric); + let (endpoint_mode, maybe_v2_builder, maybe_v3_metrics, batch_id) = match endpoint { + MetricsEndpoint::SeriesV1 | MetricsEndpoint::SeriesV2 => ( + series_mode, + &mut v2_series_builder, + &mut v3_series_metrics, + &mut series_batch_id, + ), + MetricsEndpoint::Sketches => ( + sketches_mode, + &mut v2_sketch_builder, + &mut v3_sketch_metrics, + &mut sketches_batch_id, + ), }; + if endpoint_mode.needs_batch_id() && batch_id.is_none() { + *batch_id = Some(Uuid::now_v7()); + } + let active_batch_id = endpoint_mode.needs_batch_id().then_some(batch_id.as_ref()).flatten(); + + // Store a copy of the metric in `maybe_v3_metrics` if it's present. + // + // We have to do this before encoding because `RequestBuilder::encode` consumes the metric. This also means we'll + // need to _remove_ the metric if encoding fails. + if let Some(metrics) = maybe_v3_metrics { + metrics.push(metric.clone()); + } - // Encode the metric. If we get it back, that means the current request is full, and we need to - // flush it before we can try to encode the metric again... so we'll hold on to it in that case - // before flushing and trying to encode it again. - let metric_to_retry = match request_builder.encode(metric).await { - Ok(None) => continue, - Ok(Some(metric)) => metric, - Err(e) => { - error!(error = %e, "Failed to encode metric."); - telemetry.events_dropped_encoder().increment(1); - continue; - } + // Attempt encoding the metric for V2 if configured. 
+ // + // If the metric couldn't be encoded (too big, some other issue), the call returns `false` which is + // our signal to remove the metric from `maybe_v3_metrics` (if we added it), since we know now that + // the metric wasn't encoded for V2 and we want our V2/V3 payload batches to be consistent in + // validation mode. + let v2_payload_info = match endpoint { + MetricsEndpoint::SeriesV1 | MetricsEndpoint::SeriesV2 => tag_series.then(MetricsPayloadInfo::v2_series), + MetricsEndpoint::Sketches => tag_sketches.then(MetricsPayloadInfo::v2_sketches), }; + let v2_flushed = if let Some(builder) = maybe_v2_builder { + let result = encode_v2_metrics(builder, metric, &telemetry, &mut payloads_tx, active_batch_id, v2_payload_info).await?; + if !result.encoded() { + if let Some(metrics) = maybe_v3_metrics { + let _ = metrics.pop(); + } + } - let maybe_requests = request_builder.flush().await; - if maybe_requests.is_empty() { - panic!("builder told us to flush, but gave us nothing"); - } + result.flushed() + } else { + false + }; - for maybe_request in maybe_requests { - match maybe_request { - Ok((events, data_points, request)) => { - let payload_meta = PayloadMetadata::from_event_and_data_point_count(events, data_points); - let http_payload = HttpPayload::new(payload_meta, request); - let payload = Payload::Http(http_payload); - - payloads_tx.send(payload).await - .map_err(|_| generic_error!("Failed to send payload to encoder."))?; - }, - - // TODO: Increment a counter here that metrics were dropped due to a flush failure. - Err(e) => if e.is_recoverable() { - // If the error is recoverable, we'll hold on to the metric to retry it later. - continue; - } else { - return Err(GenericError::from(e).context("Failed to flush request.")); + // If we flushed via V2, or we've hit our max metrics per payload limit in pure V3 mode, we need to flush our V3 metrics + // as well. 
+ let v3_payload_info = match endpoint { + MetricsEndpoint::SeriesV1 | MetricsEndpoint::SeriesV2 => tag_series.then(MetricsPayloadInfo::v3_series), + MetricsEndpoint::Sketches => tag_sketches.then(MetricsPayloadInfo::v3_sketches), + }; + let mut carried_metric_into_next_batch = false; + let v3_flushed = if let Some(v3_metrics) = maybe_v3_metrics { + let should_flush_v3 = match endpoint_mode { + MetricsEncoderMode::V2Only => false, + MetricsEncoderMode::V3Enabled => { + v2_flushed || v3_flush_context.payload_limits.should_flush_metric_count_limit(v3_metrics) + } + MetricsEncoderMode::Validation => v2_flushed, + }; + if should_flush_v3 { + // V2 flushes the previous batch without the current metric (the metric + // that triggered the flush is re-encoded into the next V2 batch). Pop it + // from V3 before flushing so both batches cover the same set of metrics. + let split_metric = if v2_flushed { v3_metrics.pop() } else { None }; + encode_and_flush_v3_metrics( + endpoint, + v3_flush_context, + v3_metrics, + &mut payloads_tx, + active_batch_id, + v3_payload_info, + ) + .await?; + if let Some(m) = split_metric { + carried_metric_into_next_batch = true; + v3_metrics.push(m); } + true + } else { + false } - } + } else { + false + }; - // Now try to encode the metric again. If it fails again, we'll just log it because it shouldn't - // be possible to fail at this point, otherwise we would have already caught that the first - // time. - if let Err(e) = request_builder.encode(metric_to_retry).await { - error!(error = %e, "Failed to encode metric."); - telemetry.events_dropped_encoder().increment(1); + // If a V2-triggered split leaves the current metric pending in the next batch, assign that pending + // V2/V3 pair a fresh validation ID. Otherwise, the next timeout flush would omit validation headers. 
+ if endpoint_mode.needs_batch_id() && (v2_flushed || v3_flushed) { + *batch_id = carried_metric_into_next_batch.then(Uuid::now_v7); } } @@ -555,51 +687,73 @@ async fn run_request_builder( pending_flush = false; - // Once we've encoded and written all metrics, we flush the request builders to generate a request with - // anything left over. Again, we'll enqueue those requests to be sent immediately. - let maybe_series_requests = series_request_builder.flush().await; - for maybe_request in maybe_series_requests { - match maybe_request { - Ok((events, data_points, request)) => { - let payload_meta = PayloadMetadata::from_event_and_data_point_count(events, data_points); - let http_payload = HttpPayload::new(payload_meta, request); - let payload = Payload::Http(http_payload); - - payloads_tx.send(payload).await - .map_err(|_| generic_error!("Failed to send payload to encoder."))?; - }, - - // TODO: Increment a counter here that metrics were dropped due to a flush failure. - Err(e) => if e.is_recoverable() { - // If the error is recoverable, we'll hold on to the metric to retry it later. - continue; - } else { - return Err(GenericError::from(e).context("Failed to flush request.")); + // Flush any pending series metrics. 
+ let v2_series_payload_info = tag_series.then(MetricsPayloadInfo::v2_series); + let series_active_batch_id = series_mode.needs_batch_id().then_some(series_batch_id.as_ref()).flatten(); + let mut v2_series_flush_succeeded = true; + if let Some(builder) = &mut v2_series_builder { + if let Err(e) = flush_v2_metrics(builder, &mut payloads_tx, series_active_batch_id, v2_series_payload_info).await { + error!(error = %e, "Failed to flush V2 series metrics: {}", e); + v2_series_flush_succeeded = false; + } + } + + let v3_series_payload_info = tag_series.then(MetricsPayloadInfo::v3_series); + if let Some(metrics) = &mut v3_series_metrics { + if v2_series_flush_succeeded { + if let Err(e) = encode_and_flush_v3_series_metrics( + v3_flush_context, + metrics, + &mut payloads_tx, + series_active_batch_id, + v3_series_payload_info, + ) + .await + { + error!(error = %e, "Failed to flush V3 series metrics: {}", e); } + } else { + warn!("Failed to flush V2 series metrics, skipping V3 series flush."); + metrics.clear(); } } + if series_mode.needs_batch_id() { + series_batch_id = None; + } - let maybe_sketches_requests = sketches_request_builder.flush().await; - for maybe_request in maybe_sketches_requests { - match maybe_request { - Ok((events, data_points, request)) => { - let payload_meta = PayloadMetadata::from_event_and_data_point_count(events, data_points); - let http_payload = HttpPayload::new(payload_meta, request); - let payload = Payload::Http(http_payload); - - payloads_tx.send(payload).await - .map_err(|_| generic_error!("Failed to send payload to encoder."))?; - }, - - // TODO: Increment a counter here that metrics were dropped due to a flush failure. - Err(e) => if e.is_recoverable() { - // If the error is recoverable, we'll hold on to the metric to retry it later. - continue; - } else { - return Err(GenericError::from(e).context("Failed to flush request.")); + // Flush any pending sketch metrics. 
+ let v2_sketches_payload_info = tag_sketches.then(MetricsPayloadInfo::v2_sketches); + let sketches_active_batch_id = sketches_mode.needs_batch_id().then_some(sketches_batch_id.as_ref()).flatten(); + let mut v2_sketches_flush_succeeded = true; + if let Some(builder) = &mut v2_sketch_builder { + if let Err(e) = flush_v2_metrics(builder, &mut payloads_tx, sketches_active_batch_id, v2_sketches_payload_info).await { + error!(error = %e, "Failed to flush V2 sketch metrics: {}", e); + v2_sketches_flush_succeeded = false; + } + } + + let v3_sketches_payload_info = tag_sketches.then(MetricsPayloadInfo::v3_sketches); + if let Some(metrics) = &mut v3_sketch_metrics { + if v2_sketches_flush_succeeded { + if let Err(e) = encode_and_flush_v3_sketch_metrics( + v3_flush_context, + metrics, + &mut payloads_tx, + sketches_active_batch_id, + v3_sketches_payload_info, + ) + .await + { + error!(error = %e, "Failed to flush V3 sketch metrics: {}", e); } + } else { + warn!("Failed to flush V2 sketch metrics, skipping V3 sketch flush."); + metrics.clear(); } } + if sketches_mode.needs_batch_id() { + sketches_batch_id = None; + } debug!("All flushed requests sent to I/O task. Waiting for next event buffer..."); }, @@ -612,1010 +766,1119 @@ async fn run_request_builder( Ok(()) } -fn log_metric_payload(metric: &Metric) { - match metric.values() { - MetricValues::Counter(..) | MetricValues::Rate(..) | MetricValues::Gauge(..) | MetricValues::Set(..) => { - debug!(?metric, "Flushing series metric.") - } - MetricValues::Histogram(..) | MetricValues::Distribution(..) => { - debug!(?metric, "Flushing sketch metric.") - } - } +struct EncodeResult { + encoded: bool, + flushed: bool, } -/// Metrics intake endpoint. -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -enum MetricsEndpoint { - /// V1 series metrics, encoded as JSON and sent to `/api/v1/series`. - /// - /// Includes counters, gauges, rates, and sets. Selected when `use_v2_api.series` is `false`. 
- SeriesV1, - - /// V2 series metrics, encoded as Protocol Buffers and sent to `/api/v2/series`. - /// - /// Includes counters, gauges, rates, and sets. The default series encoding. - SeriesV2, - - /// Sketch metrics, encoded as Protocol Buffers and sent to `/api/beta/sketches`. - /// - /// Includes histograms and distributions. Always uses the V2 endpoint regardless of `use_v2_api.series`. - Sketches, -} +impl EncodeResult { + pub const fn new(encoded: bool, flushed: bool) -> Self { + Self { encoded, flushed } + } -/// Error returned when a metric fails to encode for either the V1 JSON or V2 protobuf intake. -#[derive(Debug)] -pub enum MetricsEncodeError { - /// Protobuf encoding failed. - Protobuf(protobuf::Error), + pub const fn encoded(&self) -> bool { + self.encoded + } - /// JSON encoding failed. - Json(serde_json::Error), + pub const fn flushed(&self) -> bool { + self.flushed + } } -impl fmt::Display for MetricsEncodeError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Self::Protobuf(e) => write!(f, "protobuf encode error: {}", e), - Self::Json(e) => write!(f, "json encode error: {}", e), +async fn encode_v2_metrics( + request_builder: &mut RequestBuilder, metric: Metric, telemetry: &ComponentTelemetry, + payloads_tx: &mut mpsc::Sender, batch_id: Option<&Uuid>, payload_info: Option, +) -> Result { + // Encode the metric. If we get it back, that means the current request is full, and we need to + // flush it before we can try to encode the metric again... so we'll hold on to it in that case + // before flushing and trying to encode it again. 
+ let metric_to_retry = match request_builder.encode(metric).await { + Ok(None) => return Ok(EncodeResult::new(true, false)), + Ok(Some(metric)) => metric, + Err(e) => { + error!(error = %e, "Failed to encode metric."); + telemetry.events_dropped_encoder().increment(1); + return Ok(EncodeResult::new(false, false)); } - } -} + }; -impl std::error::Error for MetricsEncodeError { - fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { - match self { - Self::Protobuf(e) => Some(e), - Self::Json(e) => Some(e), + flush_v2_metrics(request_builder, payloads_tx, batch_id, payload_info).await?; + + // Now try to encode the metric again. If it fails again, we'll just log it because it shouldn't + // be possible to fail at this point, otherwise we would have already caught that the first + // time. + match request_builder.encode(metric_to_retry).await { + Ok(None) => Ok(EncodeResult::new(true, true)), + Ok(Some(_)) => unreachable!( + "failure to encode due to size should never occur after flush for metrics which aren't unencodable" + ), + Err(e) => { + error!(error = %e, "Failed to encode metric."); + telemetry.events_dropped_encoder().increment(1); + Ok(EncodeResult::new(false, true)) } } } -impl From for MetricsEncodeError { - fn from(value: protobuf::Error) -> Self { - Self::Protobuf(value) - } -} +async fn flush_v2_metrics( + request_builder: &mut RequestBuilder, payloads_tx: &mut mpsc::Sender, + batch_id: Option<&Uuid>, payload_info: Option, +) -> Result { + let mut requests_flushed = 0; + + let maybe_requests = request_builder.flush().await; + let batch_len = maybe_requests.len(); + for (batch_seq, maybe_request) in maybe_requests.into_iter().enumerate() { + match maybe_request { + Ok((events, data_points, request)) => { + requests_flushed += 1; + + flush_payload( + request, + events, + data_points, + payloads_tx, + batch_id, + batch_seq, + batch_len, + payload_info, + ) + .await?; + } -impl From for MetricsEncodeError { - fn from(value: serde_json::Error) -> 
Self { - Self::Json(value) + // TODO: Increment a counter here that metrics were dropped due to a flush failure. + Err(e) => { + if !e.is_recoverable() { + return Err(GenericError::from(e).context("Failed to flush request.")); + } + } + } } + + Ok(requests_flushed) } -#[derive(Debug)] -struct MetricsEndpointEncoder { - endpoint: MetricsEndpoint, - primary_scratch_buf: Vec, - secondary_scratch_buf: Vec, - packed_scratch_buf: Vec, - additional_tags: SharedTagSet, - tags_deduplicator: ReusableDeduplicator, +#[derive(Clone, Copy)] +struct V3FlushContext<'a> { + endpoint_config: &'a EndpointConfiguration, + payload_limits: V3PayloadLimits, + series_endpoint_uri: &'a str, + telemetry: &'a ComponentTelemetry, } -impl MetricsEndpointEncoder { - /// Creates a new `MetricsEndpointEncoder` for the given endpoint. - pub fn from_endpoint(endpoint: MetricsEndpoint) -> Self { - Self { - endpoint, - primary_scratch_buf: Vec::new(), - secondary_scratch_buf: Vec::new(), - packed_scratch_buf: Vec::new(), - additional_tags: SharedTagSet::default(), - tags_deduplicator: ReusableDeduplicator::new(), +async fn encode_and_flush_v3_metrics( + endpoint: MetricsEndpoint, context: V3FlushContext<'_>, metrics: &mut Vec, + payloads_tx: &mut mpsc::Sender, batch_id: Option<&Uuid>, payload_info: Option, +) -> Result<(), GenericError> { + match endpoint { + MetricsEndpoint::SeriesV1 | MetricsEndpoint::SeriesV2 => { + encode_and_flush_v3_series_metrics(context, metrics, payloads_tx, batch_id, payload_info).await + } + MetricsEndpoint::Sketches => { + encode_and_flush_v3_sketch_metrics(context, metrics, payloads_tx, batch_id, payload_info).await } } +} - /// Sets the additional tags to be included with every metric encoded by this encoder. - /// - /// These tags are added in a deduplicated fashion, the same as instrumented tags and origin tags. 
This is an - /// optimized codepath for tag inclusion in high-volume scenarios, where creating new additional contexts - /// through the traditional means (for example, `ContextResolver`) would be too expensive. - pub fn with_additional_tags(mut self, additional_tags: SharedTagSet) -> Self { - self.additional_tags = additional_tags; - self +async fn encode_and_flush_v3_series_metrics( + context: V3FlushContext<'_>, metrics: &mut Vec, payloads_tx: &mut mpsc::Sender, + batch_id: Option<&Uuid>, payload_info: Option, +) -> Result<(), GenericError> { + if metrics.is_empty() { + return Ok(()); + } + let metrics_to_flush = std::mem::take(metrics); + + let requests = encode_v3_payload_requests(context.series_endpoint_uri, &metrics_to_flush, context, "series").await; + let batch_len = requests.len(); + for (batch_seq, payload_request) in requests.into_iter().enumerate() { + flush_payload( + payload_request.request, + payload_request.event_count, + payload_request.data_point_count, + payloads_tx, + batch_id, + batch_seq, + batch_len, + payload_info, + ) + .await?; + debug!( + events = payload_request.event_count, + data_points = payload_request.data_point_count, + "Sent V3 series payload." 
+ ); } -} -impl EndpointEncoder for MetricsEndpointEncoder { - type Input = Metric; - type EncodeError = MetricsEncodeError; + Ok(()) +} - fn encoder_name() -> &'static str { - "metrics" +async fn encode_and_flush_v3_sketch_metrics( + context: V3FlushContext<'_>, metrics: &mut Vec, payloads_tx: &mut mpsc::Sender, + batch_id: Option<&Uuid>, payload_info: Option, +) -> Result<(), GenericError> { + if metrics.is_empty() { + return Ok(()); } - - fn compressed_size_limit(&self) -> usize { - match self.endpoint { - MetricsEndpoint::SeriesV1 => SERIES_V1_COMPRESSED_SIZE_LIMIT, - MetricsEndpoint::SeriesV2 => SERIES_V2_COMPRESSED_SIZE_LIMIT, - MetricsEndpoint::Sketches => DEFAULT_SERIALIZER_COMPRESSED_SIZE_LIMIT, - } + let metrics_to_flush = std::mem::take(metrics); + + let requests = encode_v3_payload_requests(V3_SKETCHES_ENDPOINT_URI, &metrics_to_flush, context, "sketches").await; + let batch_len = requests.len(); + for (batch_seq, payload_request) in requests.into_iter().enumerate() { + flush_payload( + payload_request.request, + payload_request.event_count, + payload_request.data_point_count, + payloads_tx, + batch_id, + batch_seq, + batch_len, + payload_info, + ) + .await?; + debug!( + events = payload_request.event_count, + data_points = payload_request.data_point_count, + "Sent V3 sketches payload." 
+ ); } - fn uncompressed_size_limit(&self) -> usize { - match self.endpoint { - MetricsEndpoint::SeriesV1 => SERIES_V1_UNCOMPRESSED_SIZE_LIMIT, - MetricsEndpoint::SeriesV2 => SERIES_V2_UNCOMPRESSED_SIZE_LIMIT, - MetricsEndpoint::Sketches => DEFAULT_SERIALIZER_UNCOMPRESSED_SIZE_LIMIT, + Ok(()) +} + +async fn encode_v3_payload_requests( + endpoint_uri: &str, metrics: &[Metric], context: V3FlushContext<'_>, payload_kind: &'static str, +) -> Vec { + let mut requests = Vec::new(); + let mut pending_ranges = split_v3_metric_ranges_by_point_limit(metrics, context, payload_kind); + + while let Some(range) = pending_ranges.pop_front() { + if range.is_empty() { + continue; } - } - fn input_data_point_count(&self, input: &Self::Input) -> usize { - input.values().len() - } + let metrics_in_range = &metrics[range.clone()]; + let event_count = metrics_in_range.len(); + let data_point_count = metrics_in_range.iter().map(|metric| metric.values().len()).sum(); - fn is_valid_input(&self, input: &Self::Input) -> bool { - let is_series_input = matches!( - input.values(), - MetricValues::Counter(..) | MetricValues::Rate(..) | MetricValues::Gauge(..) | MetricValues::Set(..) 
- ); + let encoded = match encode_v3_metrics_batch(metrics_in_range, context.endpoint_config.additional_tags()) { + Ok(encoded) => encoded, + Err(e) => { + error!(error = %e, payload_kind, events = event_count, "Failed to encode V3 metrics payload request."); + context.telemetry.events_dropped_encoder().increment(event_count as u64); + continue; + } + }; - match self.endpoint { - MetricsEndpoint::SeriesV1 | MetricsEndpoint::SeriesV2 => is_series_input, - MetricsEndpoint::Sketches => !is_series_input, + let encoded_request = + match create_v3_request(endpoint_uri, encoded, context.endpoint_config.compression_scheme()).await { + Ok(request) => request, + Err(e) => { + error!(error = %e, payload_kind, events = event_count, "Failed to create V3 metrics request."); + context.telemetry.events_dropped_encoder().increment(event_count as u64); + continue; + } + }; + + if context.payload_limits.request_fits(&encoded_request) { + requests.push(V3PayloadRequest { + request: encoded_request.request, + event_count, + data_point_count, + }); + continue; } - } - fn get_payload_prefix(&self) -> Option<&'static [u8]> { - match self.endpoint { - MetricsEndpoint::SeriesV1 => Some(SERIES_V1_PAYLOAD_PREFIX), - _ => None, + if range.len() == 1 { + // The encoded request is too large and this range cannot be split any further. + warn!( + payload_kind, + compressed_len = encoded_request.compressed_len, + compressed_limit = context.payload_limits.max_compressed_size, + uncompressed_len = encoded_request.uncompressed_len, + uncompressed_limit = context.payload_limits.max_uncompressed_size, + "Dropping oversized V3 metric that cannot be split further." + ); + context.telemetry.events_dropped_encoder().increment(1); + continue; } - } - fn get_payload_suffix(&self) -> Option<&'static [u8]> { - match self.endpoint { - MetricsEndpoint::SeriesV1 => Some(SERIES_V1_PAYLOAD_SUFFIX), - _ => None, - } + // Retry this oversized range as two smaller ranges, preserving the original metric order. 
+ let pivot = range.start + range.len() / 2; + pending_ranges.push_front(pivot..range.end); + pending_ranges.push_front(range.start..pivot); } - fn get_input_separator(&self) -> Option<&'static [u8]> { - match self.endpoint { - MetricsEndpoint::SeriesV1 => Some(SERIES_V1_INPUT_SEPARATOR), - _ => None, - } - } + requests +} - fn encode(&mut self, input: &Self::Input, buffer: &mut Vec) -> Result<(), Self::EncodeError> { - match self.endpoint { - MetricsEndpoint::SeriesV1 => { - encode_series_v1_metric(input, &self.additional_tags, buffer, &mut self.tags_deduplicator)?; - Ok(()) +fn split_v3_metric_ranges_by_point_limit( + metrics: &[Metric], context: V3FlushContext<'_>, payload_kind: &'static str, +) -> VecDeque> { + let mut ranges = VecDeque::new(); + let mut current_start = None; + let mut current_points = 0usize; + + for (idx, metric) in metrics.iter().enumerate() { + let metric_points = metric.values().len(); + if metric_points == 0 { + // The Agent drops zero-point V3 metrics before writing them. + if let Some(start) = current_start.take() { + if start < idx { + ranges.push_back(start..idx); + } } - MetricsEndpoint::SeriesV2 | MetricsEndpoint::Sketches => { - // NOTE: We're passing _four_ buffers to `encode_single_metric`, which is a lot, but with good reason. - // - // The first buffer, `buffer`, is the overall output buffer: the caller expects us to put the full - // encoded metric payload into this buffer. - // - // The second and third buffers, `primary_scratch_buf` and `secondary_scratch_buf`, are used for - // roughly the same thing but deal with _nesting_. When writing a "message" in Protocol Buffers, the - // message data itself is prefixed with the field number and a length delimiter that specifies how - // long the message is. 
We can't write that length delimiter until we know the full size of the - // message, so we write the message to a scratch buffer, calculate its size, and then write the field - // number and length delimiter to the output buffer followed by the message data from the scratch - // buffer. - // - // We have _two_ scratch buffers because you need a dedicated buffer for each level of nested message. - // We have to be able to nest up to two levels deep in our metrics payload, so we need two scratch - // buffers to handle that. - // - // The fourth buffer, `packed_scratch_buf`, is used for writing out packed repeated fields. This is - // similar to the situation describe above, except it's not _exactly_ the same as an additional level - // of nesting.. so I just decided to give it a somewhat more descriptive name. - encode_single_metric( - input, - &self.additional_tags, - buffer, - &mut self.primary_scratch_buf, - &mut self.secondary_scratch_buf, - &mut self.packed_scratch_buf, - &mut self.tags_deduplicator, - )?; - Ok(()) + context.telemetry.events_dropped_encoder().increment(1); + current_points = 0; + continue; + } + + if !context.payload_limits.point_count_fits(metric_points) { + // This metric exceeds the point limit by itself, so it cannot fit in any V3 payload request. + // Close the current range before dropping this oversized metric. + if let Some(start) = current_start.take() { + if start < idx { + ranges.push_back(start..idx); + } } + warn!( + payload_kind, + data_points = metric_points, + point_limit = context.payload_limits.max_points_per_payload, + "Dropping oversized V3 metric that exceeds the point-count limit." 
+ ); + context.telemetry.events_dropped_encoder().increment(1); + current_points = 0; + continue; } - } - fn endpoint_uri(&self) -> Uri { - match self.endpoint { - MetricsEndpoint::SeriesV1 => PathAndQuery::from_static(METRICS_SERIES_V1_PATH).into(), - MetricsEndpoint::SeriesV2 => PathAndQuery::from_static(METRICS_SERIES_V2_PATH).into(), - MetricsEndpoint::Sketches => PathAndQuery::from_static(METRICS_SKETCHES_PATH).into(), + let would_exceed_point_limit = + current_points > 0 && !context.payload_limits.point_count_fits(current_points + metric_points); + if would_exceed_point_limit { + // This metric fits by itself, but not together with the current range. + // Adding this metric would overflow the current range, so start a new range at this metric. + if let Some(start) = current_start { + ranges.push_back(start..idx); + } + current_start = Some(idx); + current_points = 0; + } else if current_start.is_none() { + current_start = Some(idx); } - } - fn endpoint_method(&self) -> Method { - // All endpoints use POST. - Method::POST + current_points += metric_points; } - fn content_type(&self) -> HeaderValue { - match self.endpoint { - MetricsEndpoint::SeriesV1 => CONTENT_TYPE_JSON.clone(), - MetricsEndpoint::SeriesV2 | MetricsEndpoint::Sketches => CONTENT_TYPE_PROTOBUF.clone(), + if let Some(start) = current_start { + if start < metrics.len() { + ranges.push_back(start..metrics.len()); } } + + ranges } -fn field_number_for_metric_type(metric: &Metric) -> u32 { - match metric.values() { - MetricValues::Counter(..) | MetricValues::Rate(..) | MetricValues::Gauge(..) | MetricValues::Set(..) => 1, - MetricValues::Histogram(..) | MetricValues::Distribution(..) => 1, - } +/// Converts a `Uuid` to a `HeaderValue`. +fn uuid_to_header_value(uuid: &Uuid) -> HeaderValue { + let s = uuid.as_hyphenated().to_string(); + // SAFETY: UUID hyphenated format only contains [0-9a-f-], all valid ASCII header chars. 
+ unsafe { HeaderValue::from_maybe_shared_unchecked(s) } } -fn get_message_size(raw_msg_size: usize) -> Result { - const MAX_MESSAGE_SIZE: u64 = i32::MAX as u64; +/// Converts a `usize` to a `HeaderValue`. +fn usize_to_header_value(value: usize) -> HeaderValue { + let s = value.to_string(); + // SAFETY: Integer strings only contain ASCII digits [0-9], all valid header chars. + unsafe { HeaderValue::from_maybe_shared_unchecked(s) } +} - // Individual messages cannot be larger than `i32::MAX`, so check that here before proceeding. - if raw_msg_size as u64 > MAX_MESSAGE_SIZE { - return Err(std::io::Error::other("message size exceeds limit (2147483648 bytes)").into()); +async fn flush_payload( + mut request: Request, event_count: usize, data_point_count: usize, + payloads_tx: &mut mpsc::Sender, batch_id: Option<&Uuid>, batch_seq: usize, batch_len: usize, + payload_info: Option, +) -> Result<(), GenericError> { + // Attach the validation batch UUID and sequence headers if present. + if let Some(batch_id) = batch_id { + let headers = request.headers_mut(); + headers.insert("X-Metrics-Request-ID", uuid_to_header_value(batch_id)); + headers.insert("X-Metrics-Request-Seq", usize_to_header_value(batch_seq)); + headers.insert("X-Metrics-Request-Len", usize_to_header_value(batch_len)); } - Ok(raw_msg_size as u32) + let mut payload_meta = PayloadMetadata::from_event_and_data_point_count(event_count, data_point_count); + if let Some(info) = payload_info { + payload_meta = payload_meta.with(info); + } + let http_payload = HttpPayload::new(payload_meta, request); + let payload = Payload::Http(http_payload); + + payloads_tx + .send(payload) + .await + .error_context("Failed to send payload.")?; + + Ok(()) } -fn get_message_size_from_buffer(buf: &[u8]) -> Result { - get_message_size(buf.len()) +// Encodes a batch of metrics to V3 columnar format. 
+fn encode_v3_metrics_batch(metrics: &[Metric], additional_tags: &SharedTagSet) -> Result, GenericError> { + let mut writer = v3::V3Writer::new(); + + for metric in metrics { + write_metric_to_v3(&mut writer, metric, additional_tags); + } + + let mut output = Vec::new(); + writer + .finalize(&mut output) + .map_err(|e| generic_error!("Failed to serialize V3 payload: {}", e))?; + + Ok(output) } -fn encode_single_metric( - metric: &Metric, additional_tags: &SharedTagSet, output_buf: &mut Vec, primary_scratch_buf: &mut Vec, - secondary_scratch_buf: &mut Vec, packed_scratch_buf: &mut Vec, - tags_deduplicator: &mut ReusableDeduplicator, -) -> Result<(), protobuf::Error> { - let mut output_stream = CodedOutputStream::vec(output_buf); - let field_number = field_number_for_metric_type(metric); - - write_nested_message(&mut output_stream, primary_scratch_buf, field_number, |os| { - // Depending on the metric type, we write out the appropriate fields. - match metric.values() { - MetricValues::Counter(..) | MetricValues::Rate(..) | MetricValues::Gauge(..) | MetricValues::Set(..) => { - encode_series_v2_metric(metric, additional_tags, os, secondary_scratch_buf, tags_deduplicator) +/// Writes a single metric to the V3 writer. +fn write_metric_to_v3(writer: &mut v3::V3Writer, metric: &Metric, additional_tags: &SharedTagSet) { + let metric_type = match metric.values() { + MetricValues::Counter(..) => v3::V3MetricType::Count, + MetricValues::Rate(..) => v3::V3MetricType::Rate, + MetricValues::Gauge(..) | MetricValues::Set(..) => v3::V3MetricType::Gauge, + MetricValues::Histogram(..) | MetricValues::Distribution(..) 
=> v3::V3MetricType::Sketch, + }; + let is_sketch = metric_type == v3::V3MetricType::Sketch; + + let mut builder = writer.write(metric_type, metric.context().name()); + + // Tags - chain instrumented + additional + origin tags + let all_tags = metric + .context() + .tags() + .into_iter() + .chain(additional_tags) + .chain(metric.context().origin_tags()) + .filter(|t| is_sketch || !is_v3_series_resource_tag(t) && !is_v3_series_device_tag(t)) + .map(|t| t.as_str()); + builder.set_tags(all_tags); + + // Resources - extract host and, for series, promoted resource tags. + let mut resources = Vec::new(); + if let Some(host) = metric.metadata().hostname().filter(|host| !host.is_empty()) { + resources.push(("host", host)); + } + if !is_sketch { + let mut device_resource = None; + for tag in metric + .context() + .origin_tags() + .into_iter() + .chain(metric.context().tags()) + .chain(additional_tags) + { + if is_v3_series_device_tag(tag) { + device_resource = tag.value().filter(|device| !device.is_empty()); + } else if is_v3_series_resource_tag(tag) { + if let Some((rtype, rname)) = tag.value().and_then(|value| value.split_once(':')) { + if !rtype.is_empty() && !rname.is_empty() { + resources.push((rtype, rname)); + } + } } - MetricValues::Histogram(..) | MetricValues::Distribution(..) => encode_sketch_metric( - metric, - additional_tags, - os, - secondary_scratch_buf, - packed_scratch_buf, - tags_deduplicator, - ), } - }) -} + if let Some(device) = device_resource { + let device_idx = usize::from(metric.metadata().hostname().is_some_and(|host| !host.is_empty())); + resources.insert(device_idx, ("device", device)); + } + } + builder.set_resources(&resources); -fn encode_series_v2_metric( - metric: &Metric, additional_tags: &SharedTagSet, output_stream: &mut CodedOutputStream<'_>, - scratch_buf: &mut Vec, tags_deduplicator: &mut ReusableDeduplicator, -) -> Result<(), protobuf::Error> { - // Write the metric name and tags. 
- output_stream.write_string(SERIES_METRIC_FIELD_NUMBER, metric.context().name())?; - - let deduplicated_tags = get_deduplicated_tags(metric, additional_tags, tags_deduplicator); - write_series_tags(deduplicated_tags, output_stream, scratch_buf)?; - - // Set the host resource. - write_resource( - output_stream, - scratch_buf, - "host", - metric.metadata().hostname().unwrap_or_default(), - )?; - - // Write the origin metadata, if it exists. + // Origin metadata if let Some(origin) = metric.metadata().origin() { match origin { MetricOrigin::SourceType(source_type) => { - output_stream.write_string(SERIES_SOURCE_TYPE_NAME_FIELD_NUMBER, source_type.as_ref())?; + builder.set_source_type(source_type.as_ref()); } MetricOrigin::OriginMetadata { product, subproduct, product_detail, } => { - write_origin_metadata( - output_stream, - scratch_buf, - SERIES_METADATA_FIELD_NUMBER, - *product, - *subproduct, - *product_detail, - )?; + builder.set_origin(*product, *subproduct, *product_detail, false); } } } - // Now write out our metric type, points, and interval (if applicable). - let (metric_type, points, maybe_interval) = match metric.values() { - MetricValues::Counter(points) => (proto::MetricType::COUNT, points.into_iter(), None), - MetricValues::Rate(points, interval) => (proto::MetricType::RATE, points.into_iter(), Some(interval)), - MetricValues::Gauge(points) => (proto::MetricType::GAUGE, points.into_iter(), None), - MetricValues::Set(points) => (proto::MetricType::GAUGE, points.into_iter(), None), - _ => unreachable!("encode_series_v2_metric called with non-series metric"), - }; - - output_stream.write_enum(SERIES_TYPE_FIELD_NUMBER, metric_type.value())?; - - if let Some(unit) = metric.metadata().unit() { - output_stream.write_string(SERIES_UNIT_FIELD_NUMBER, unit)?; - } - - for (timestamp, value) in points { - // If this is a rate metric, scale our value by the interval, in seconds. 
- let value = maybe_interval - .map(|interval| value / interval.as_secs_f64()) - .unwrap_or(value); - let timestamp = timestamp.map(|ts| ts.get()).unwrap_or(0) as i64; - - write_point(output_stream, scratch_buf, value, timestamp)?; - } - - if let Some(interval) = maybe_interval { - output_stream.write_int64(SERIES_INTERVAL_FIELD_NUMBER, interval.as_secs() as i64)?; + if metric_type != v3::V3MetricType::Sketch { + if let Some(unit) = metric.metadata().unit() { + builder.set_unit(unit); + } } - Ok(()) -} - -fn encode_series_v1_metric( - metric: &Metric, additional_tags: &SharedTagSet, buffer: &mut Vec, - tags_deduplicator: &mut ReusableDeduplicator, -) -> Result<(), serde_json::Error> { - let mut obj = JsonMap::new(); - - obj.insert("metric".into(), JsonValue::String(metric.context().name().to_string())); - - let (type_str, points_iter, maybe_interval) = match metric.values() { - MetricValues::Counter(points) => ("count", points.into_iter(), None), - MetricValues::Rate(points, interval) => ("rate", points.into_iter(), Some(*interval)), - MetricValues::Gauge(points) => ("gauge", points.into_iter(), None), - MetricValues::Set(points) => ("gauge", points.into_iter(), None), - _ => unreachable!("encode_series_v1_metric called with non-series metric"), - }; - - let mut points = Vec::new(); - for (timestamp, value) in points_iter { - // For rates, value is scaled by interval seconds — same as the V2 encoder. - let value = maybe_interval - .map(|interval| value / interval.as_secs_f64()) - .unwrap_or(value); - let timestamp = timestamp.map(|ts| ts.get()).unwrap_or(0) as i64; - - // V1 emits each point as a [timestamp, value] tuple — not a nested object. 
- let value_json = JsonNumber::from_f64(value) - .map(JsonValue::Number) - .unwrap_or_else(|| JsonValue::from(0)); - points.push(JsonValue::Array(vec![JsonValue::from(timestamp), value_json])); - } - obj.insert("points".into(), JsonValue::Array(points)); - - // Walk the deduplicated tag set once, extracting the first `device:` tag into the device JSON field while - // dropping `dd.internal.resource` (which is a V2-protobuf-only concept with no V1 representation). - let deduplicated = get_deduplicated_tags(metric, additional_tags, tags_deduplicator); - let mut tags_out = Vec::new(); - let mut device: Option = None; - for tag in deduplicated { - if tag.name() == "dd.internal.resource" { - continue; + // Points based on metric type + match metric.values() { + MetricValues::Counter(points) | MetricValues::Gauge(points) => { + for (ts, val) in points { + let timestamp = ts.map(|t| t.get() as i64).unwrap_or(0); + builder.add_point(timestamp, val); + } } - if device.is_none() && tag.name() == "device" { - if let Some(v) = tag.value() { - device = Some(v.to_string()); - continue; + MetricValues::Rate(points, interval) => { + builder.set_interval(interval.as_secs()); + for (ts, val) in points { + let timestamp = ts.map(|t| t.get() as i64).unwrap_or(0); + // Scale by interval as done in V2 + let scaled = val / interval.as_secs_f64(); + builder.add_point(timestamp, scaled); } } - tags_out.push(JsonValue::String(tag.as_str().to_string())); - } - obj.insert("tags".into(), JsonValue::Array(tags_out)); - - // V1 always emits `host` and `interval`, even when empty/zero — matches the Agent encoder. 
- obj.insert( - "host".into(), - JsonValue::String(metric.metadata().hostname().unwrap_or_default().to_string()), - ); - - if let Some(d) = device.filter(|s| !s.is_empty()) { - obj.insert("device".into(), JsonValue::String(d)); - } - - obj.insert("type".into(), JsonValue::String(type_str.into())); - - let interval_secs = maybe_interval.map(|iv| iv.as_secs() as i64).unwrap_or(0); - obj.insert("interval".into(), JsonValue::from(interval_secs)); - - // V1 only emits `source_type_name` from `MetricOrigin::SourceType`. - if let Some(MetricOrigin::SourceType(s)) = metric.metadata().origin() { - obj.insert("source_type_name".into(), JsonValue::String(s.as_ref().to_string())); - } - - if let Some(unit) = metric.metadata().unit() { - if !unit.is_empty() { - obj.insert("unit".into(), JsonValue::String(unit.to_string())); + MetricValues::Set(points) => { + // Set values are already converted to count in the iterator + for (ts, count) in points { + let timestamp = ts.map(|t| t.get() as i64).unwrap_or(0); + builder.add_point(timestamp, count); + } } - } - - serde_json::to_writer(buffer, &JsonValue::Object(obj)) -} - -fn encode_sketch_metric( - metric: &Metric, additional_tags: &SharedTagSet, output_stream: &mut CodedOutputStream<'_>, - scratch_buf: &mut Vec, packed_scratch_buf: &mut Vec, tags_deduplicator: &mut ReusableDeduplicator, -) -> Result<(), protobuf::Error> { - // Write the metric name and tags. - output_stream.write_string(SKETCH_METRIC_FIELD_NUMBER, metric.context().name())?; - - let deduplicated_tags = get_deduplicated_tags(metric, additional_tags, tags_deduplicator); - write_sketch_tags(deduplicated_tags, output_stream, scratch_buf)?; - - // Write the host. - output_stream.write_string( - SKETCH_HOST_FIELD_NUMBER, - metric.metadata().hostname().unwrap_or_default(), - )?; - - // Set the origin metadata, if it exists. 
- if let Some(MetricOrigin::OriginMetadata { - product, - subproduct, - product_detail, - }) = metric.metadata().origin() - { - write_origin_metadata( - output_stream, - scratch_buf, - SKETCH_METADATA_FIELD_NUMBER, - *product, - *subproduct, - *product_detail, - )?; - } - - // TODO: emit `metric.metadata().unit()` in the sketch payload once the upstream `agent-payload` proto defines a - // unit field on `SketchPayload.Sketch`. - - // Write out our sketches. - match metric.values() { MetricValues::Distribution(sketches) => { - for (timestamp, value) in sketches { - write_dogsketch(output_stream, scratch_buf, packed_scratch_buf, timestamp, value)?; + for (ts, sketch) in sketches { + let timestamp = ts.map(|t| t.get() as i64).unwrap_or(0); + if !sketch.is_empty() { + let bin_keys: Vec = sketch.bins().iter().map(|b| b.key()).collect(); + let bin_counts: Vec = sketch.bins().iter().map(|b| b.count()).collect(); + builder.add_sketch( + timestamp, + sketch.count() as i64, + sketch.sum().unwrap_or(0.0), + sketch.min().unwrap_or(0.0), + sketch.max().unwrap_or(0.0), + &bin_keys, + &bin_counts, + ); + } } } - MetricValues::Histogram(points) => { - for (timestamp, histogram) in points { - // We convert histograms to sketches to be able to write them out in the payload. 
- let mut ddsketch = DDSketch::default(); + MetricValues::Histogram(histograms) => { + for (ts, histogram) in histograms { + let timestamp = ts.map(|t| t.get() as i64).unwrap_or(0); + // Convert histogram to DDSketch + let mut sketch = DDSketch::default(); for sample in histogram.samples() { - ddsketch.insert_n(sample.value.into_inner(), sample.weight.0 as u64); + sketch.insert_n(sample.value.into_inner(), sample.weight.0 as u64); + } + if !sketch.is_empty() { + let bin_keys: Vec = sketch.bins().iter().map(|b| b.key()).collect(); + let bin_counts: Vec = sketch.bins().iter().map(|b| b.count()).collect(); + builder.add_sketch( + timestamp, + sketch.count() as i64, + sketch.sum().unwrap_or(0.0), + sketch.min().unwrap_or(0.0), + sketch.max().unwrap_or(0.0), + &bin_keys, + &bin_counts, + ); } - - write_dogsketch(output_stream, scratch_buf, packed_scratch_buf, timestamp, &ddsketch)?; } } - _ => unreachable!("encode_sketch_metric called with non-sketch metric"), } - Ok(()) + builder.close(); } -fn write_resource( - output_stream: &mut CodedOutputStream<'_>, scratch_buf: &mut Vec, resource_type: &str, resource_name: &str, -) -> Result<(), protobuf::Error> { - write_nested_message(output_stream, scratch_buf, SERIES_RESOURCES_FIELD_NUMBER, |os| { - os.write_string(RESOURCES_TYPE_FIELD_NUMBER, resource_type)?; - os.write_string(RESOURCES_NAME_FIELD_NUMBER, resource_name) - }) +fn is_v3_series_device_tag(tag: &Tag) -> bool { + tag.name() == "device" && tag.value().is_some() } -fn write_origin_metadata( - output_stream: &mut CodedOutputStream<'_>, scratch_buf: &mut Vec, field_number: u32, origin_product: u32, - origin_category: u32, origin_service: u32, -) -> Result<(), protobuf::Error> { - // TODO: Figure out how to cleanly use `write_nested_message` here. 
- - scratch_buf.clear(); - - { - let mut origin_output_stream = CodedOutputStream::vec(scratch_buf); - origin_output_stream.write_uint32(ORIGIN_ORIGIN_PRODUCT_FIELD_NUMBER, origin_product)?; - origin_output_stream.write_uint32(ORIGIN_ORIGIN_CATEGORY_FIELD_NUMBER, origin_category)?; - origin_output_stream.write_uint32(ORIGIN_ORIGIN_SERVICE_FIELD_NUMBER, origin_service)?; - origin_output_stream.flush()?; - } +fn is_v3_series_resource_tag(tag: &Tag) -> bool { + tag.name() == "dd.internal.resource" && tag.value().is_some() +} - // We do a little song and dance here because the `Origin` message is embedded inside of `Metadata`, so we need to - // write out field numbers/length delimiters in order: `Metadata`, and then `Origin`... but we write out origin - // message to the scratch buffer first... so we write out our `Metadata` preamble stuff to get its length, and then - // use that in conjunction with the `Origin` message size to write out the full `Metadata` message. - let origin_message_size = get_message_size_from_buffer(scratch_buf)?; - - let mut metadata_preamble_buf = [0; 64]; - let metadata_preamble_len = { - let mut metadata_output_stream = CodedOutputStream::bytes(&mut metadata_preamble_buf[..]); - metadata_output_stream.write_tag(METADATA_ORIGIN_FIELD_NUMBER, WireType::LengthDelimited)?; - metadata_output_stream.write_raw_varint32(origin_message_size)?; - metadata_output_stream.flush()?; - metadata_output_stream.total_bytes_written() as usize +/// Creates a V3 HTTP request from encoded payload data. +async fn create_v3_request( + endpoint_uri: &str, payload: Vec, compression_scheme: CompressionScheme, +) -> Result { + // Our `payload` is the inner `MetricData` message structure at this point, so we just manually write out the + // `Payload` message framing before writing the metric data. 
+ let mut header_buf = [0; 16]; + let header_len = { + let mut header_writer = CodedOutputStream::bytes(&mut header_buf); + header_writer.write_tag(3, WireType::LengthDelimited)?; + header_writer.write_uint64_no_tag(payload.len() as u64)?; + header_writer.flush()?; + header_writer.total_bytes_written() as usize }; - let metadata_message_size = get_message_size(scratch_buf.len() + metadata_preamble_len)?; + let uncompressed_len = header_len + payload.len(); + let buffer = ChunkedBytesBuffer::new(RB_BUFFER_CHUNK_SIZE); + let mut compressor = Compressor::from_scheme(compression_scheme, buffer); + compressor + .write_all(&header_buf[..header_len]) + .await + .error_context("Failed to compress V3 payload.")?; + compressor + .write_all(&payload) + .await + .error_context("Failed to compress V3 payload.")?; + compressor + .flush() + .await + .error_context("Failed to flush V3 compressor.")?; + compressor + .shutdown() + .await + .error_context("Failed to shutdown V3 compressor.")?; - output_stream.write_tag(field_number, WireType::LengthDelimited)?; - output_stream.write_raw_varint32(metadata_message_size)?; - output_stream.write_raw_bytes(&metadata_preamble_buf[..metadata_preamble_len])?; - output_stream.write_raw_bytes(scratch_buf) -} + let content_encoding = compressor.content_encoding(); + let compressed_buf = compressor.into_inner().freeze(); + let compressed_len = compressed_buf.len(); -fn write_point( - output_stream: &mut CodedOutputStream<'_>, scratch_buf: &mut Vec, value: f64, timestamp: i64, -) -> Result<(), protobuf::Error> { - write_nested_message(output_stream, scratch_buf, SERIES_POINTS_FIELD_NUMBER, |os| { - os.write_double(METRIC_POINT_VALUE_FIELD_NUMBER, value)?; - os.write_int64(METRIC_POINT_TIMESTAMP_FIELD_NUMBER, timestamp) - }) -} + let mut builder = Request::builder() + .method(Method::POST) + .uri(endpoint_uri) + .header(http::header::CONTENT_TYPE, "application/x-protobuf"); -fn write_dogsketch( - output_stream: &mut CodedOutputStream<'_>, 
scratch_buf: &mut Vec, packed_scratch_buf: &mut Vec, - timestamp: Option, sketch: &DDSketch, -) -> Result<(), protobuf::Error> { - // If the sketch is empty, we don't write it out. - if sketch.is_empty() { - warn!("Attempted to write an empty sketch to sketches payload, skipping."); - return Ok(()); + if let Some(encoding) = content_encoding { + builder = builder.header(http::header::CONTENT_ENCODING, encoding); } - write_nested_message(output_stream, scratch_buf, SKETCH_DOGSKETCHES_FIELD_NUMBER, |os| { - os.write_int64(DOGSKETCH_TS_FIELD_NUMBER, timestamp.map_or(0, |ts| ts.get() as i64))?; - os.write_int64(DOGSKETCH_CNT_FIELD_NUMBER, sketch.count() as i64)?; - os.write_double(DOGSKETCH_MIN_FIELD_NUMBER, sketch.min().unwrap())?; - os.write_double(DOGSKETCH_MAX_FIELD_NUMBER, sketch.max().unwrap())?; - os.write_double(DOGSKETCH_AVG_FIELD_NUMBER, sketch.avg().unwrap())?; - os.write_double(DOGSKETCH_SUM_FIELD_NUMBER, sketch.sum().unwrap())?; - - let bin_keys = sketch.bins().iter().map(|bin| bin.key()); - write_repeated_packed_from_iter( - os, - packed_scratch_buf, - DOGSKETCH_K_FIELD_NUMBER, - bin_keys, - |inner_os, value| inner_os.write_sint32_no_tag(value), - )?; - - let bin_counts = sketch.bins().iter().map(|bin| bin.count()); - write_repeated_packed_from_iter( - os, - packed_scratch_buf, - DOGSKETCH_N_FIELD_NUMBER, - bin_counts, - |inner_os, value| inner_os.write_uint32_no_tag(value), - ) + let request = builder + .body(compressed_buf) + .map_err(|e| generic_error!("Failed to build V3 request: {}", e))?; + + Ok(V3EncodedRequest { + request, + compressed_len, + uncompressed_len, }) } -fn get_deduplicated_tags<'a>( - metric: &'a Metric, additional_tags: &'a SharedTagSet, tags_deduplicator: &'a mut ReusableDeduplicator, -) -> impl Iterator { - let chained_tags = metric - .context() - .tags() - .into_iter() - .chain(additional_tags) - .chain(metric.context().origin_tags()); - - tags_deduplicator.deduplicated(chained_tags) -} +#[cfg(test)] +mod tests { + use 
std::sync::Arc; -fn write_tags<'a, I, F>( - tags: I, output_stream: &mut CodedOutputStream<'_>, scratch_buf: &mut Vec, tag_encoder: F, -) -> Result<(), protobuf::Error> -where - I: Iterator, - F: Fn(&Tag, &mut CodedOutputStream<'_>, &mut Vec) -> Result<(), protobuf::Error>, -{ - for tag in tags { - tag_encoder(tag, output_stream, scratch_buf)?; - } + use saluki_context::{ + tags::{Tag, TagSet}, + Context, + }; + use saluki_core::data_model::{ + event::{metric::MetricMetadata, Event}, + payload::Payload, + }; + use stringtheory::MetaString; + use tokio::time::timeout; - Ok(()) -} + use super::*; -fn write_series_tags<'a, I>( - tags: I, output_stream: &mut CodedOutputStream<'_>, scratch_buf: &mut Vec, -) -> Result<(), protobuf::Error> -where - I: Iterator, -{ - write_tags(tags, output_stream, scratch_buf, |tag, os, buf| { - // If this is a resource tag, we'll convert it directly to a resource entry. - if tag.name() == "dd.internal.resource" { - if let Some((resource_type, resource_name)) = tag.value().and_then(|s| s.split_once(':')) { - write_resource(os, buf, resource_type, resource_name) - } else { - Ok(()) - } - } else { - // We're dealing with a normal tag. 
- os.write_string(SERIES_TAGS_FIELD_NUMBER, tag.as_str()) - } - }) -} + #[test] + fn deser_agent_v3_api_nested_settings() { + let raw = r#" +serializer_experimental_use_v3_api: + compression_level: 7 + series: + endpoints: + - https://app.datadoghq.com + validate: true + use_beta: true + beta_route: /api/intake/metrics/custom/series + sketches: + endpoints: + - https://app.datadoghq.eu +"#; + + let config = + serde_yaml::from_str::(raw).expect("configuration should deserialize"); + + assert_eq!(7, config.v3_api.compression_level); + assert_eq!( + Some("https://app.datadoghq.com"), + config.v3_api.series.endpoints.first().map(String::as_str) + ); + assert!(config.v3_api.series.validate); + assert!(config.v3_api.series.use_beta); + assert_eq!("/api/intake/metrics/custom/series", config.v3_api.series.beta_route); + assert_eq!( + Some("https://app.datadoghq.eu"), + config.v3_api.sketches.endpoints.first().map(String::as_str) + ); + } -fn write_sketch_tags<'a, I>( - tags: I, output_stream: &mut CodedOutputStream<'_>, scratch_buf: &mut Vec, -) -> Result<(), protobuf::Error> -where - I: Iterator, -{ - write_tags(tags, output_stream, scratch_buf, |tag, os, _buf| { - // We always write the tags as-is, without any special handling for resource tags. 
- os.write_string(SKETCH_TAGS_FIELD_NUMBER, tag.as_str()) - }) -} + #[tokio::test] + async fn create_v3_request_uses_configured_endpoint_uri() { + let request = create_v3_request( + "/api/intake/metrics/custom/series", + Vec::new(), + CompressionScheme::noop(), + ) + .await + .expect("request should be created"); -fn write_nested_message( - output_stream: &mut CodedOutputStream<'_>, scratch_buf: &mut Vec, field_number: u32, writer: F, -) -> Result<(), protobuf::Error> -where - F: FnOnce(&mut CodedOutputStream<'_>) -> Result<(), protobuf::Error>, -{ - scratch_buf.clear(); - - { - let mut nested_output_stream = CodedOutputStream::vec(scratch_buf); - writer(&mut nested_output_stream)?; - nested_output_stream.flush()?; + assert_eq!("/api/intake/metrics/custom/series", request.request.uri()); } - output_stream.write_tag(field_number, WireType::LengthDelimited)?; - - let nested_message_size = get_message_size_from_buffer(scratch_buf)?; - output_stream.write_raw_varint32(nested_message_size)?; - output_stream.write_raw_bytes(scratch_buf) -} + async fn create_v3_test_request(metrics: &[Metric]) -> V3EncodedRequest { + let encoded = encode_v3_metrics_batch(metrics, &SharedTagSet::default()).expect("metrics should encode to V3"); + create_v3_request(V3_SERIES_ENDPOINT_URI, encoded, CompressionScheme::noop()) + .await + .expect("request should be created") + } -fn write_repeated_packed_from_iter( - output_stream: &mut CodedOutputStream<'_>, scratch_buf: &mut Vec, field_number: u32, values: I, writer: F, -) -> Result<(), protobuf::Error> -where - I: Iterator, - F: Fn(&mut CodedOutputStream<'_>, T) -> Result<(), protobuf::Error>, -{ - // This is a helper function that lets us write out a packed repeated field from an iterator of values. 
- // `CodedOutputStream` has similar functions to handle this, but they require a slice of values, which would mean we - // need to either allocate a new vector each time to hold the values, or thread through two additional vectors (one - // for `i32`, one for `u32`) to reuse the allocation... both of which are not great options. - // - // We've simply opted to pass through a _single_ vector that we can reuse, and write the packed values directly to - // that, almost identically to how `CodedOutputStream::write_repeated_packed_*` methods would do it. - - scratch_buf.clear(); - - { - let mut packed_output_stream = CodedOutputStream::vec(scratch_buf); - for value in values { - writer(&mut packed_output_stream, value)?; + fn test_v3_flush_context<'a>( + ep_config: &'a EndpointConfiguration, payload_limits: V3PayloadLimits, telemetry: &'a ComponentTelemetry, + ) -> V3FlushContext<'a> { + V3FlushContext { + endpoint_config: ep_config, + payload_limits, + series_endpoint_uri: V3_SERIES_ENDPOINT_URI, + telemetry, } - packed_output_stream.flush()?; } - let data_size = get_message_size_from_buffer(scratch_buf)?; - - output_stream.write_tag(field_number, WireType::LengthDelimited)?; - output_stream.write_raw_varint32(data_size)?; - output_stream.write_raw_bytes(scratch_buf) -} - -#[cfg(test)] -mod tests { - use std::{sync::Arc, time::Duration}; - - use protobuf::CodedOutputStream; - use saluki_common::iter::ReusableDeduplicator; - use saluki_context::{tags::SharedTagSet, Context}; - use saluki_core::data_model::event::metric::{Metric, MetricMetadata, MetricOrigin, MetricValues}; - use serde_json::Value as JsonValue; - use stringtheory::MetaString; - - use super::{ - encode_series_v1_metric, encode_series_v2_metric, encode_sketch_metric, MetricsEndpoint, - MetricsEndpointEncoder, SERIES_V1_INPUT_SEPARATOR, SERIES_V1_PAYLOAD_PREFIX, SERIES_V1_PAYLOAD_SUFFIX, - }; - use crate::common::datadog::{ - request_builder::EndpointEncoder as _, DEFAULT_SERIALIZER_COMPRESSED_SIZE_LIMIT, 
- DEFAULT_SERIALIZER_UNCOMPRESSED_SIZE_LIMIT, - }; + #[tokio::test] + async fn v3_payload_requests_split_by_compressed_size_limit() { + let metrics = vec![ + Metric::counter("v3.compressed.split.one", 1.0), + Metric::counter("v3.compressed.split.two", 2.0), + ]; + let single_request = create_v3_test_request(&metrics[..1]).await; + let combined_request = create_v3_test_request(&metrics).await; + assert!(combined_request.compressed_len > single_request.compressed_len); + + let limits = V3PayloadLimits::new(single_request.compressed_len, usize::MAX, 10_000, 10_000); + let ep_config = EndpointConfiguration::new(CompressionScheme::noop(), 10_000, None); + let telemetry = ComponentTelemetry::from_builder(&MetricsBuilder::default()); + + let context = test_v3_flush_context(&ep_config, limits, &telemetry); + let requests = encode_v3_payload_requests(V3_SERIES_ENDPOINT_URI, &metrics, context, "series").await; + + assert_eq!(2, requests.len()); + assert_eq!( + vec![1, 1], + requests.iter().map(|request| request.event_count).collect::>() + ); + assert!(requests + .iter() + .all(|request| request.request.body().len() <= limits.max_compressed_size)); + } - fn encode_one_v1(metric: &Metric) -> JsonValue { - let mut buf = Vec::new(); - let host_tags = SharedTagSet::default(); - let mut tags_deduplicator = ReusableDeduplicator::new(); - encode_series_v1_metric(metric, &host_tags, &mut buf, &mut tags_deduplicator) - .expect("encode_series_v1_metric should succeed"); - serde_json::from_slice(&buf).expect("encoder produced invalid JSON") + #[tokio::test] + async fn v3_payload_requests_split_by_uncompressed_size_limit() { + let metrics = vec![ + Metric::counter("v3.uncompressed.split.one", 1.0), + Metric::counter("v3.uncompressed.split.two", 2.0), + ]; + let single_request = create_v3_test_request(&metrics[..1]).await; + let combined_request = create_v3_test_request(&metrics).await; + assert!(combined_request.uncompressed_len > single_request.uncompressed_len); + + let limits = 
V3PayloadLimits::new(usize::MAX, single_request.uncompressed_len, 10_000, 10_000); + let ep_config = EndpointConfiguration::new(CompressionScheme::noop(), 10_000, None); + let telemetry = ComponentTelemetry::from_builder(&MetricsBuilder::default()); + + let context = test_v3_flush_context(&ep_config, limits, &telemetry); + let requests = encode_v3_payload_requests(V3_SERIES_ENDPOINT_URI, &metrics, context, "series").await; + + assert_eq!(2, requests.len()); + assert_eq!( + vec![1, 1], + requests.iter().map(|request| request.event_count).collect::>() + ); } #[test] - fn histogram_vs_sketch_identical_payload() { - // For the same exact set of points, we should be able to construct either a histogram or distribution from - // those points, and when encoded as a sketch payload, end up with the same exact payload. - // - // They should be identical because the goal is that we convert histograms into sketches in the same way we - // would have originally constructed a sketch based on the same samples. 
- let samples = &[1.0, 2.0, 3.0, 4.0, 5.0]; - let histogram = Metric::histogram("simple_samples", samples); - let distribution = Metric::distribution("simple_samples", samples); - let host_tags = SharedTagSet::default(); - - let mut buf1 = Vec::new(); - let mut buf2 = Vec::new(); - let mut tags_deduplicator = ReusableDeduplicator::new(); - - let mut histogram_payload = Vec::new(); - { - let mut histogram_writer = CodedOutputStream::vec(&mut histogram_payload); - encode_sketch_metric( - &histogram, - &host_tags, - &mut histogram_writer, - &mut buf1, - &mut buf2, - &mut tags_deduplicator, - ) - .expect("Failed to encode histogram as sketch"); - } - - let mut distribution_payload = Vec::new(); - { - let mut distribution_writer = CodedOutputStream::vec(&mut distribution_payload); - encode_sketch_metric( - &distribution, - &host_tags, - &mut distribution_writer, - &mut buf1, - &mut buf2, - &mut tags_deduplicator, - ) - .expect("Failed to encode distribution as sketch"); - } - - assert_eq!(histogram_payload, distribution_payload); + fn v3_metric_ranges_split_by_point_limit() { + let metrics = vec![ + Metric::counter("v3.points.split.one", [(123, 1.0), (124, 2.0)]), + Metric::counter("v3.points.split.two", [(123, 3.0), (124, 4.0)]), + Metric::counter("v3.points.split.three", 5.0), + ]; + let limits = V3PayloadLimits::new(usize::MAX, usize::MAX, 10_000, 3); + let ep_config = EndpointConfiguration::new(CompressionScheme::noop(), 10_000, None); + let telemetry = ComponentTelemetry::from_builder(&MetricsBuilder::default()); + let context = test_v3_flush_context(&ep_config, limits, &telemetry); + + let ranges = split_v3_metric_ranges_by_point_limit(&metrics, context, "series") + .into_iter() + .collect::>(); + + assert_eq!(vec![0..1, 1..3], ranges); } #[test] - fn input_valid() { - // Our encoder should always consider series metrics valid when set to either series endpoint, and similarly - // for sketch metrics when set to the sketches endpoint. 
- let counter = Metric::counter("counter", 1.0); - let rate = Metric::rate("rate", 1.0, Duration::from_secs(1)); - let gauge = Metric::gauge("gauge", 1.0); - let set = Metric::set("set", "foo"); - let histogram = Metric::histogram("histogram", [1.0, 2.0, 3.0]); - let distribution = Metric::distribution("distribution", [1.0, 2.0, 3.0]); - - let series_v1 = MetricsEndpointEncoder::from_endpoint(MetricsEndpoint::SeriesV1); - let series_v2 = MetricsEndpointEncoder::from_endpoint(MetricsEndpoint::SeriesV2); - let sketches_endpoint = MetricsEndpointEncoder::from_endpoint(MetricsEndpoint::Sketches); - - for series_endpoint in [&series_v1, &series_v2] { - assert!(series_endpoint.is_valid_input(&counter)); - assert!(series_endpoint.is_valid_input(&rate)); - assert!(series_endpoint.is_valid_input(&gauge)); - assert!(series_endpoint.is_valid_input(&set)); - assert!(!series_endpoint.is_valid_input(&histogram)); - assert!(!series_endpoint.is_valid_input(&distribution)); + fn v3_metric_ranges_skip_zero_point_metrics() { + let metrics = vec![ + Metric::counter("v3.points.zero.before", 1.0), + Metric::counter("v3.points.zero.empty", &[] as &[f64]), + Metric::counter("v3.points.zero.after", 2.0), + ]; + let limits = V3PayloadLimits::new(usize::MAX, usize::MAX, 10_000, 10_000); + let ep_config = EndpointConfiguration::new(CompressionScheme::noop(), 10_000, None); + let telemetry = ComponentTelemetry::from_builder(&MetricsBuilder::default()); + let context = test_v3_flush_context(&ep_config, limits, &telemetry); + + let ranges = split_v3_metric_ranges_by_point_limit(&metrics, context, "series") + .into_iter() + .collect::>(); + + assert_eq!(vec![0..1, 2..3], ranges); + } + + #[tokio::test] + async fn v3_split_flush_uses_payload_request_batch_headers() { + let mut metrics = vec![ + Metric::counter("v3.headers.split.one", 1.0), + Metric::counter("v3.headers.split.two", 2.0), + ]; + let single_request = create_v3_test_request(&metrics[..1]).await; + let combined_request = 
create_v3_test_request(&metrics).await; + assert!(combined_request.compressed_len > single_request.compressed_len); + + let limits = V3PayloadLimits::new(single_request.compressed_len, usize::MAX, 10_000, 10_000); + let ep_config = EndpointConfiguration::new(CompressionScheme::noop(), 10_000, None); + let telemetry = ComponentTelemetry::from_builder(&MetricsBuilder::default()); + let batch_id = Uuid::now_v7(); + let (mut payloads_tx, mut payloads_rx) = tokio::sync::mpsc::channel(8); + + let context = test_v3_flush_context(&ep_config, limits, &telemetry); + encode_and_flush_v3_series_metrics( + context, + &mut metrics, + &mut payloads_tx, + Some(&batch_id), + Some(MetricsPayloadInfo::v3_series()), + ) + .await + .expect("V3 metrics should flush"); + + for expected_seq in 0..2 { + let payload = payloads_rx.recv().await.expect("payload should be emitted"); + let Payload::Http(http_payload) = payload else { + panic!("expected HTTP payload"); + }; + let (_, request) = http_payload.into_parts(); + assert_eq!( + batch_id.as_hyphenated().to_string(), + request + .headers() + .get("X-Metrics-Request-ID") + .expect("batch ID header should be present") + .to_str() + .expect("batch ID header should be valid") + ); + assert_eq!( + expected_seq.to_string(), + request + .headers() + .get("X-Metrics-Request-Seq") + .expect("batch sequence header should be present") + .to_str() + .expect("batch sequence header should be valid") + ); + assert_eq!( + "2", + request + .headers() + .get("X-Metrics-Request-Len") + .expect("batch length header should be present") + .to_str() + .expect("batch length header should be valid") + ); } - assert!(!sketches_endpoint.is_valid_input(&counter)); - assert!(!sketches_endpoint.is_valid_input(&rate)); - assert!(!sketches_endpoint.is_valid_input(&gauge)); - assert!(!sketches_endpoint.is_valid_input(&set)); - assert!(sketches_endpoint.is_valid_input(&histogram)); - assert!(sketches_endpoint.is_valid_input(&distribution)); + assert!(metrics.is_empty()); } 
- #[test] - fn input_data_point_count_tracks_metric_values() { - let counter = Metric::counter("counter", [(123, 1.0), (124, 2.0)]); - let histogram = Metric::histogram("histogram", [1.0, 2.0, 3.0]); - - let series_endpoint = MetricsEndpointEncoder::from_endpoint(MetricsEndpoint::SeriesV2); - let sketches_endpoint = MetricsEndpointEncoder::from_endpoint(MetricsEndpoint::Sketches); + #[tokio::test] + async fn v3_sketch_flush_uses_split_payload_requests() { + let mut metrics = vec![ + Metric::distribution("v3.sketch.split.one", [1.0, 2.0, 3.0]), + Metric::distribution("v3.sketch.split.two", [4.0, 5.0, 6.0]), + ]; + let single_request = create_v3_test_request(&metrics[..1]).await; + let combined_request = create_v3_test_request(&metrics).await; + assert!(combined_request.compressed_len > single_request.compressed_len); + + let limits = V3PayloadLimits::new(single_request.compressed_len, usize::MAX, 10_000, 10_000); + let ep_config = EndpointConfiguration::new(CompressionScheme::noop(), 10_000, None); + let telemetry = ComponentTelemetry::from_builder(&MetricsBuilder::default()); + let batch_id = Uuid::now_v7(); + let (mut payloads_tx, mut payloads_rx) = tokio::sync::mpsc::channel(8); + + let context = test_v3_flush_context(&ep_config, limits, &telemetry); + encode_and_flush_v3_sketch_metrics( + context, + &mut metrics, + &mut payloads_tx, + Some(&batch_id), + Some(MetricsPayloadInfo::v3_sketches()), + ) + .await + .expect("V3 sketches should flush"); + + for expected_seq in 0..2 { + let payload = payloads_rx.recv().await.expect("payload should be emitted"); + let Payload::Http(http_payload) = payload else { + panic!("expected HTTP payload"); + }; + let (_, request) = http_payload.into_parts(); + assert_eq!(V3_SKETCHES_ENDPOINT_URI, request.uri()); + assert_eq!( + expected_seq.to_string(), + request + .headers() + .get("X-Metrics-Request-Seq") + .expect("batch sequence header should be present") + .to_str() + .expect("batch sequence header should be valid") + ); + 
assert_eq!( + "2", + request + .headers() + .get("X-Metrics-Request-Len") + .expect("batch length header should be present") + .to_str() + .expect("batch length header should be valid") + ); + } - assert_eq!(series_endpoint.input_data_point_count(&counter), 2); - assert_eq!(sketches_endpoint.input_data_point_count(&histogram), 1); + assert!(metrics.is_empty()); } #[test] - fn series_metric_unit_encoded() { - // A gauge with a unit in its metadata must produce a series protobuf payload that contains the unit string - // in field 6 (MetricSeries.unit), which the Datadog backend already accepts. - // - // In production this state is reached when histogram aggregation flushes timer (`ms`) statistics as gauges, - // each carrying unit = "millisecond" propagated through MetricMetadata. + fn v3_series_metric_unit_refs_are_encoded_sparsely() { let context = Context::from_static_parts("my.timer.avg", &[]); let metadata = MetricMetadata::default().with_unit(MetaString::from_static("millisecond")); let gauge = Metric::from_parts(context, MetricValues::gauge([1.0_f64]), metadata); + let context = Context::from_static_parts("my.counter", &[]); + let no_unit = Metric::from_parts(context, MetricValues::gauge([2.0_f64]), MetricMetadata::default()); + let context = Context::from_static_parts("my.timer.max", &[]); + let metadata = MetricMetadata::default().with_unit(MetaString::from_static("millisecond")); + let same_unit = Metric::from_parts(context, MetricValues::gauge([3.0_f64]), metadata); - let host_tags = SharedTagSet::default(); - let mut scratch_buf = Vec::new(); - let mut tags_deduplicator = ReusableDeduplicator::new(); - - let mut payload = Vec::new(); - { - let mut writer = CodedOutputStream::vec(&mut payload); - encode_series_v2_metric( - &gauge, - &host_tags, - &mut writer, - &mut scratch_buf, - &mut tags_deduplicator, - ) - .expect("Failed to encode gauge as series metric"); - writer.flush().expect("Failed to flush"); - } - - // In the protobuf wire format, a string 
field with field number 6 has tag byte 0x32 ((6 << 3) | 2). - // The tag is followed by a varint length and then the UTF-8 bytes of the string. - let expected_tag: u8 = (6 << 3) | 2; // 0x32 - let expected_value = b"millisecond"; - - let tag_pos = payload - .windows(1 + 1 + expected_value.len()) - .position(|w| w[0] == expected_tag && w[1] == expected_value.len() as u8 && &w[2..] == expected_value); + let payload = encode_v3_metrics_batch(&[gauge, no_unit, same_unit], &SharedTagSet::default()) + .expect("V3 metric should encode successfully"); + let expected_unit_dict = [ + 0xca, 0x01, // field 25, length-delimited. + 0x0c, // field payload length: varint string length + string bytes. + 0x0b, b'm', b'i', b'l', b'l', b'i', b's', b'e', b'c', b'o', b'n', b'd', + ]; assert!( - tag_pos.is_some(), - "series payload should contain unit field (field 6 = 'millisecond'), got bytes: {:?}", + payload + .windows(expected_unit_dict.len()) + .any(|window| window == expected_unit_dict), + "V3 payload should contain DictUnitStr field for 'millisecond', got bytes: {:?}", payload ); - } - #[test] - fn series_v1_basic_payload_shape() { - // Each metric variant maps to the right `type` string, points are emitted as [ts, value] tuples, - // and `interval`/`host` are always present (zero/empty when not set). - let counter = Metric::counter("my.count", 5.0); - let counter_json = encode_one_v1(&counter); - assert_eq!(counter_json["metric"], "my.count"); - assert_eq!(counter_json["type"], "count"); - assert_eq!(counter_json["interval"], 0); - assert_eq!(counter_json["host"], ""); - assert_eq!(counter_json["tags"], JsonValue::Array(vec![])); - let points = counter_json["points"].as_array().expect("points is array"); - assert_eq!(points.len(), 1); - assert_eq!(points[0][0], 0); - assert_eq!(points[0][1], 5.0); - // Optional fields must be absent when not set. 
- assert!(counter_json.get("unit").is_none()); - assert!(counter_json.get("source_type_name").is_none()); - assert!(counter_json.get("device").is_none()); - - let rate = Metric::rate("my.rate", 30.0, Duration::from_secs(10)); - let rate_json = encode_one_v1(&rate); - assert_eq!(rate_json["type"], "rate"); - assert_eq!(rate_json["interval"], 10); - // Rate value scaled by interval seconds: 30 / 10 = 3. - let rate_points = rate_json["points"].as_array().expect("rate points is array"); - assert_eq!(rate_points[0][1], 3.0); - - let gauge = Metric::gauge("my.gauge", 42.0); - let gauge_json = encode_one_v1(&gauge); - assert_eq!(gauge_json["type"], "gauge"); - - // Sets are encoded as gauges with the set cardinality as the value (consistent with V2). - let set = Metric::set("my.set", "alpha"); - let set_json = encode_one_v1(&set); - assert_eq!(set_json["type"], "gauge"); - let set_points = set_json["points"].as_array().expect("set points is array"); - assert_eq!(set_points[0][1], 1.0); + let expected_unit_ref = [ + 0xd2, 0x01, // field 26, length-delimited. + 0x02, // packed field payload length. + 0x02, 0x00, // sparse unit refs for metrics 1 and 3 only: refs [1, 1] -> deltas [1, 0]. 
+ ]; + assert!( + payload + .windows(expected_unit_ref.len()) + .any(|window| window == expected_unit_ref), + "V3 payload should contain UnitRef field for 'millisecond', got bytes: {:?}", + payload + ); } #[test] - fn series_v1_unit_and_hostname_emitted() { - let context = Context::from_static_parts("my.timer.avg", &[]); - let metadata = MetricMetadata::default() - .with_unit(MetaString::from_static("millisecond")) - .with_hostname(Some(Arc::from("host-1"))); - let gauge = Metric::from_parts(context, MetricValues::gauge([1.0_f64]), metadata); + fn v3_sketch_metric_unit_not_encoded() { + let context = Context::from_static_parts("my.histogram", &[]); + let metadata = MetricMetadata::default().with_unit(MetaString::from_static("millisecond")); + let histogram = Metric::from_parts(context, MetricValues::histogram([1.0_f64]), metadata); - let json = encode_one_v1(&gauge); - assert_eq!(json["unit"], "millisecond"); - assert_eq!(json["host"], "host-1"); - } + let payload = encode_v3_metrics_batch(&[histogram], &SharedTagSet::default()) + .expect("V3 sketch metric should encode successfully"); - #[test] - fn series_v1_device_tag_extraction() { - // A `device:` tag is extracted into the `device` JSON field and dropped from `tags`. 
- let context = Context::from_static_parts("my.metric", &["device:eth0", "env:prod"]); - let counter = Metric::from_parts(context, MetricValues::counter([1.0_f64]), MetricMetadata::default()); - - let json = encode_one_v1(&counter); - assert_eq!(json["device"], "eth0"); - let tags = json["tags"].as_array().expect("tags is array"); - let tag_strs: Vec<&str> = tags.iter().filter_map(|v| v.as_str()).collect(); assert!( - !tag_strs.iter().any(|t| t.starts_with("device:")), - "device tag must be removed: {:?}", - tag_strs + !payload + .windows(b"millisecond".len()) + .any(|window| window == b"millisecond"), + "V3 sketch payload should not contain unit bytes, matching the Agent V3 sketch builder: {:?}", + payload ); - assert!(tag_strs.contains(&"env:prod")); } #[test] - fn series_v1_source_type_name_from_source_type_origin() { - let context = Context::from_static_parts("my.metric", &[]); - let metadata = MetricMetadata::default().with_source_type(Some(Arc::from("integration_x"))); - let counter = Metric::from_parts(context, MetricValues::counter([1.0_f64]), metadata); - - let json = encode_one_v1(&counter); - assert_eq!(json["source_type_name"], "integration_x"); + fn v3_series_promotes_device_and_internal_resource_tags_to_resources() { + let context = Context::from_static_parts( + "series.resources", + &[ + "env:prod", + "device:switch1", + "dd.internal.resource:pod:pod-a", + "dd.internal.resource:malformed", + ], + ); + let metadata = MetricMetadata::default().with_hostname(Some(Arc::from("host-a"))); + let metric = Metric::from_parts(context, MetricValues::gauge([1.0_f64]), metadata); + + let payload = + encode_v3_metrics_batch(&[metric], &SharedTagSet::default()).expect("V3 series should encode successfully"); + + assert_contains_bytes(&payload, b"env:prod"); + assert!(!contains_bytes(&payload, b"device:switch1")); + assert!(!contains_bytes(&payload, b"dd.internal.resource:pod:pod-a")); + assert!(!contains_bytes(&payload, b"dd.internal.resource:malformed")); + + let 
expected_resource_dict = [ + 0x22, // field 4, length-delimited. + 0x25, // field payload length. + 0x04, b'h', b'o', b's', b't', 0x06, b'h', b'o', b's', b't', b'-', b'a', 0x06, b'd', b'e', b'v', b'i', b'c', + b'e', 0x07, b's', b'w', b'i', b't', b'c', b'h', b'1', 0x03, b'p', b'o', b'd', 0x05, b'p', b'o', b'd', b'-', + b'a', + ]; + assert_contains_bytes(&payload, &expected_resource_dict); } #[test] - fn series_v1_origin_metadata_dropped() { - // OriginMetadata is V2-protobuf only; V1 must drop it. - let context = Context::from_static_parts("my.metric", &[]); - let metadata = MetricMetadata::default().with_origin(Some(MetricOrigin::dogstatsd())); - let counter = Metric::from_parts(context, MetricValues::counter([1.0_f64]), metadata); - - let json = encode_one_v1(&counter); - assert!(json.get("source_type_name").is_none()); + fn v3_series_promotes_additional_and_origin_resource_tags_without_empty_host() { + let context = Context::from_static_parts("series.additional_origin_resources", &["env:prod"]) + .with_origin_tags(tag_set(["dd.internal.resource:pod:pod-origin"])); + let additional_tags = SharedTagSet::from(tag_set([ + "team:core", + "device:switch1", + "dd.internal.resource:container:container-a", + ])); + let metadata = MetricMetadata::default().with_hostname(Some(Arc::from(""))); + let metric = Metric::from_parts(context, MetricValues::gauge([1.0_f64]), metadata); + + let payload = + encode_v3_metrics_batch(&[metric], &additional_tags).expect("V3 series should encode successfully"); + + assert_contains_bytes(&payload, b"env:prod"); + assert_contains_bytes(&payload, b"team:core"); + assert!(!contains_bytes(&payload, b"device:switch1")); + assert!(!contains_bytes(&payload, b"dd.internal.resource:container:container-a")); + assert!(!contains_bytes(&payload, b"dd.internal.resource:pod:pod-origin")); + + let expected_resource_dict = [ + 0x22, // field 4, length-delimited. + 0x34, // field payload length. 
+ 0x06, b'd', b'e', b'v', b'i', b'c', b'e', 0x07, b's', b'w', b'i', b't', b'c', b'h', b'1', 0x03, b'p', b'o', + b'd', 0x0a, b'p', b'o', b'd', b'-', b'o', b'r', b'i', b'g', b'i', b'n', 0x09, b'c', b'o', b'n', b't', b'a', + b'i', b'n', b'e', b'r', 0x0b, b'c', b'o', b'n', b't', b'a', b'i', b'n', b'e', b'r', b'-', b'a', + ]; + assert_contains_bytes(&payload, &expected_resource_dict); + assert!(!contains_bytes(&payload, b"host")); } #[test] - fn series_v1_dd_internal_resource_dropped() { - // `dd.internal.resource` is V2-protobuf-only; V1 must drop these tags silently. - let context = Context::from_static_parts("my.metric", &["dd.internal.resource:host:foo", "env:prod"]); - let counter = Metric::from_parts(context, MetricValues::counter([1.0_f64]), MetricMetadata::default()); - - let json = encode_one_v1(&counter); - let tags = json["tags"].as_array().expect("tags is array"); - let tag_strs: Vec<&str> = tags.iter().filter_map(|v| v.as_str()).collect(); - assert!( - !tag_strs.iter().any(|t| t.starts_with("dd.internal.resource:")), - "dd.internal.resource tag must be dropped: {:?}", - tag_strs + fn v3_sketch_keeps_device_and_internal_resource_tags_as_tags() { + let context = Context::from_static_parts( + "sketch.resources", + &["env:prod", "device:switch1", "dd.internal.resource:pod:pod-a"], ); - assert!(tag_strs.contains(&"env:prod")); + let metadata = MetricMetadata::default().with_hostname(Some(Arc::from("host-a"))); + let metric = Metric::from_parts(context, MetricValues::histogram([1.0_f64]), metadata); + + let payload = + encode_v3_metrics_batch(&[metric], &SharedTagSet::default()).expect("V3 sketch should encode successfully"); + + assert_contains_bytes(&payload, b"env:prod"); + assert_contains_bytes(&payload, b"device:switch1"); + assert_contains_bytes(&payload, b"dd.internal.resource:pod:pod-a"); + + let expected_resource_dict = [ + 0x22, // field 4, length-delimited. + 0x0c, // field payload length. 
+ 0x04, b'h', b'o', b's', b't', 0x06, b'h', b'o', b's', b't', b'-', b'a', + ]; + assert_contains_bytes(&payload, &expected_resource_dict); } - #[test] - fn series_v1_endpoint_routing() { - // SeriesV1 advertises the V1 URI, JSON content type, and the {"series":[...]} framing. - let encoder = MetricsEndpointEncoder::from_endpoint(MetricsEndpoint::SeriesV1); - assert_eq!(encoder.endpoint_uri().path(), "/api/v1/series"); - assert_eq!(encoder.content_type(), "application/json"); - assert_eq!(encoder.get_payload_prefix(), Some(SERIES_V1_PAYLOAD_PREFIX)); - assert_eq!(encoder.get_payload_suffix(), Some(SERIES_V1_PAYLOAD_SUFFIX)); - assert_eq!(encoder.get_input_separator(), Some(SERIES_V1_INPUT_SEPARATOR)); - assert_eq!( - encoder.compressed_size_limit(), - DEFAULT_SERIALIZER_COMPRESSED_SIZE_LIMIT - ); - assert_eq!( - encoder.uncompressed_size_limit(), - DEFAULT_SERIALIZER_UNCOMPRESSED_SIZE_LIMIT + #[tokio::test] + async fn validation_split_flush_assigns_batch_id_to_carried_metric() { + let v2_endpoint_config = EndpointConfiguration::new(CompressionScheme::noop(), 1, None); + let v2_series_builder = Some( + v2::create_v2_request_builder(MetricsEndpoint::SeriesV2, &v2_endpoint_config) + .await + .expect("V2 request builder should be created"), ); + let v3_endpoint_config = EndpointConfiguration::new(CompressionScheme::noop(), 10_000, None); + let telemetry = ComponentTelemetry::from_builder(&MetricsBuilder::default()); + let (events_tx, events_rx) = tokio::sync::mpsc::channel(1); + let (payloads_tx, mut payloads_rx) = tokio::sync::mpsc::channel(8); + + let request_builder_handle = tokio::spawn(run_request_builder( + v2_series_builder, + None, + MetricsEncoderMode::Validation, + MetricsEncoderMode::V2Only, + v3_endpoint_config, + V3PayloadLimits::new(usize::MAX, usize::MAX, 10_000, 10_000), + V3_SERIES_ENDPOINT_URI.to_string(), + telemetry, + events_rx, + payloads_tx, + Duration::from_millis(10), + false, + )); + + let mut events = EventsBuffer::default(); + assert!(events 
+ .try_push(Event::Metric(Metric::counter("validation.split.one", 1.0))) + .is_none()); + assert!(events + .try_push(Event::Metric(Metric::counter("validation.split.two", 2.0))) + .is_none()); + events_tx + .send(events) + .await + .expect("events should be sent to request builder"); + + let mut flushed_requests = Vec::new(); + for _ in 0..4 { + let payload = timeout(Duration::from_secs(1), payloads_rx.recv()) + .await + .expect("payload should arrive before timeout") + .expect("payload channel should remain open"); + let Payload::Http(http_payload) = payload else { + panic!("expected HTTP payload"); + }; + let (_, request) = http_payload.into_parts(); + let batch_id = request + .headers() + .get("X-Metrics-Request-ID") + .expect("validation batch ID header should be present") + .to_str() + .expect("validation batch ID should be valid header text") + .to_string(); + flushed_requests.push((request.uri().to_string(), batch_id)); + } - // Sketches use the generic serializer payload limits in the Datadog Agent. 
- let sketches = MetricsEndpointEncoder::from_endpoint(MetricsEndpoint::Sketches); - assert_eq!( - sketches.compressed_size_limit(), - DEFAULT_SERIALIZER_COMPRESSED_SIZE_LIMIT - ); - assert_eq!( - sketches.uncompressed_size_limit(), - DEFAULT_SERIALIZER_UNCOMPRESSED_SIZE_LIMIT + assert_eq!("/api/v2/series", flushed_requests[0].0); + assert_eq!(V3_SERIES_ENDPOINT_URI, flushed_requests[1].0); + assert_eq!("/api/v2/series", flushed_requests[2].0); + assert_eq!(V3_SERIES_ENDPOINT_URI, flushed_requests[3].0); + + assert_eq!(flushed_requests[0].1, flushed_requests[1].1); + assert_eq!(flushed_requests[2].1, flushed_requests[3].1); + assert_ne!(flushed_requests[0].1, flushed_requests[2].1); + + drop(events_tx); + request_builder_handle + .await + .expect("request builder task should complete") + .expect("request builder should stop cleanly"); + } + + fn contains_bytes(haystack: &[u8], needle: &[u8]) -> bool { + haystack.windows(needle.len()).any(|window| window == needle) + } + + fn assert_contains_bytes(haystack: &[u8], needle: &[u8]) { + assert!( + contains_bytes(haystack, needle), + "expected payload to contain bytes {:?}, got {:?}", + needle, + haystack ); + } - // V2 series stays on protobuf with no framing. 
- let v2 = MetricsEndpointEncoder::from_endpoint(MetricsEndpoint::SeriesV2); - assert_eq!(v2.endpoint_uri().path(), "/api/v2/series"); - assert_eq!(v2.content_type(), "application/x-protobuf"); - assert!(v2.get_payload_prefix().is_none()); + fn tag_set(tags: [&'static str; N]) -> TagSet { + tags.into_iter().map(Tag::from_static).collect() } } @@ -1629,10 +1892,18 @@ mod config_smoke { #[tokio::test] async fn smoke_test() { - run_config_smoke_tests(structs::DATADOG_METRICS_CONFIGURATION, &[], json!({}), |cfg| { - cfg.as_typed::() - .expect("DatadogMetricsConfiguration should deserialize") - }) + run_config_smoke_tests( + structs::DATADOG_METRICS_CONFIGURATION, + &[ + "serializer_experimental_use_v3_api.sketches.beta_route", + "serializer_experimental_use_v3_api.sketches.use_beta", + ], + json!({}), + |cfg| { + cfg.as_typed::() + .expect("DatadogMetricsConfiguration should deserialize") + }, + ) .await } } @@ -1642,12 +1913,10 @@ mod use_v2_api_series_default { use saluki_config::ConfigurationLoader; use serde_json::json; - use super::{DatadogMetricsConfiguration, SERIES_V2_COMPRESSED_SIZE_LIMIT, SERIES_V2_UNCOMPRESSED_SIZE_LIMIT}; + use super::{v2, DatadogMetricsConfiguration}; use crate::{common::datadog::clamp_payload_limits, config::KEY_ALIASES}; - /// `use_v2_api_series` defaults to `true` (preserves V2 protobuf behavior when the flag is absent). - /// The nested-form (`use_v2_api.series`) and env-var (`DD_USE_V2_API_SERIES`) paths to the flat key - /// are exercised end-to-end by the `config_smoke::smoke_test` runner via `KEY_ALIASES`. + /// `use_v2_api_series` defaults to `true`, preserving V2 protobuf behavior when the flag is absent. 
#[tokio::test] async fn defaults_to_true_when_absent() { let cfg = ConfigurationLoader::default() @@ -1684,19 +1953,19 @@ mod use_v2_api_series_default { #[test] fn clamps_series_payload_limit_keys_to_api_limits() { let (uncompressed_limit, compressed_limit) = clamp_payload_limits( - SERIES_V2_UNCOMPRESSED_SIZE_LIMIT + 1, - SERIES_V2_COMPRESSED_SIZE_LIMIT + 1, - SERIES_V2_UNCOMPRESSED_SIZE_LIMIT, - SERIES_V2_COMPRESSED_SIZE_LIMIT, + v2::SERIES_V2_UNCOMPRESSED_SIZE_LIMIT + 1, + v2::SERIES_V2_COMPRESSED_SIZE_LIMIT + 1, + v2::SERIES_V2_UNCOMPRESSED_SIZE_LIMIT, + v2::SERIES_V2_COMPRESSED_SIZE_LIMIT, ); - assert_eq!(uncompressed_limit, SERIES_V2_UNCOMPRESSED_SIZE_LIMIT); - assert_eq!(compressed_limit, SERIES_V2_COMPRESSED_SIZE_LIMIT); + assert_eq!(uncompressed_limit, v2::SERIES_V2_UNCOMPRESSED_SIZE_LIMIT); + assert_eq!(compressed_limit, v2::SERIES_V2_COMPRESSED_SIZE_LIMIT); let (uncompressed_limit, compressed_limit) = clamp_payload_limits( 5678, 1234, - SERIES_V2_UNCOMPRESSED_SIZE_LIMIT, - SERIES_V2_COMPRESSED_SIZE_LIMIT, + v2::SERIES_V2_UNCOMPRESSED_SIZE_LIMIT, + v2::SERIES_V2_COMPRESSED_SIZE_LIMIT, ); assert_eq!(uncompressed_limit, 5678); assert_eq!(compressed_limit, 1234); diff --git a/lib/saluki-components/src/encoders/datadog/metrics/v1/mod.rs b/lib/saluki-components/src/encoders/datadog/metrics/v1/mod.rs new file mode 100644 index 00000000000..e345c9c66e8 --- /dev/null +++ b/lib/saluki-components/src/encoders/datadog/metrics/v1/mod.rs @@ -0,0 +1,235 @@ +use http::HeaderValue; +use saluki_common::iter::ReusableDeduplicator; +use saluki_context::tags::{SharedTagSet, Tag}; +use saluki_core::data_model::event::metric::{Metric, MetricOrigin, MetricValues}; +use serde_json::{Map as JsonMap, Number as JsonNumber, Value as JsonValue}; + +pub(super) const SERIES_COMPRESSED_SIZE_LIMIT: usize = 2_000_000; // ~2 MiB +pub(super) const SERIES_UNCOMPRESSED_SIZE_LIMIT: usize = 4_000_000; // ~4 MiB + +pub(super) static CONTENT_TYPE: HeaderValue = 
HeaderValue::from_static("application/json"); + +// JSON framing for the V1 series payload, which wraps the array of `Serie` objects in a top-level object. +pub(super) const SERIES_PAYLOAD_PREFIX: &[u8] = b"{\"series\":["; +pub(super) const SERIES_PAYLOAD_SUFFIX: &[u8] = b"]}"; +pub(super) const SERIES_INPUT_SEPARATOR: &[u8] = b","; + +pub(super) fn encode_series_metric( + metric: &Metric, additional_tags: &SharedTagSet, buffer: &mut Vec, + tags_deduplicator: &mut ReusableDeduplicator, +) -> Result<(), serde_json::Error> { + let mut obj = JsonMap::new(); + + obj.insert("metric".into(), JsonValue::String(metric.context().name().to_string())); + + let (type_str, points_iter, maybe_interval) = match metric.values() { + MetricValues::Counter(points) => ("count", points.into_iter(), None), + MetricValues::Rate(points, interval) => ("rate", points.into_iter(), Some(*interval)), + MetricValues::Gauge(points) => ("gauge", points.into_iter(), None), + MetricValues::Set(points) => ("gauge", points.into_iter(), None), + _ => unreachable!("encode_series_metric called with non-series metric"), + }; + + let mut points = Vec::new(); + for (timestamp, value) in points_iter { + let value = maybe_interval + .map(|interval| value / interval.as_secs_f64()) + .unwrap_or(value); + let timestamp = timestamp.map(|ts| ts.get()).unwrap_or(0) as i64; + let value_json = JsonNumber::from_f64(value) + .map(JsonValue::Number) + .unwrap_or_else(|| JsonValue::from(0)); + points.push(JsonValue::Array(vec![JsonValue::from(timestamp), value_json])); + } + obj.insert("points".into(), JsonValue::Array(points)); + + let deduplicated = get_deduplicated_tags(metric, additional_tags, tags_deduplicator); + let mut tags_out = Vec::new(); + let mut device: Option = None; + for tag in deduplicated { + if tag.name() == "dd.internal.resource" { + continue; + } + if device.is_none() && tag.name() == "device" { + if let Some(v) = tag.value() { + device = Some(v.to_string()); + continue; + } + } + 
tags_out.push(JsonValue::String(tag.as_str().to_string())); + } + obj.insert("tags".into(), JsonValue::Array(tags_out)); + + obj.insert( + "host".into(), + JsonValue::String(metric.metadata().hostname().unwrap_or_default().to_string()), + ); + + if let Some(device) = device.filter(|device| !device.is_empty()) { + obj.insert("device".into(), JsonValue::String(device)); + } + + obj.insert("type".into(), JsonValue::String(type_str.into())); + + let interval_secs = maybe_interval.map(|interval| interval.as_secs() as i64).unwrap_or(0); + obj.insert("interval".into(), JsonValue::from(interval_secs)); + + if let Some(MetricOrigin::SourceType(source_type)) = metric.metadata().origin() { + obj.insert( + "source_type_name".into(), + JsonValue::String(source_type.as_ref().to_string()), + ); + } + + if let Some(unit) = metric.metadata().unit() { + if !unit.is_empty() { + obj.insert("unit".into(), JsonValue::String(unit.to_string())); + } + } + + serde_json::to_writer(buffer, &JsonValue::Object(obj)) +} + +fn get_deduplicated_tags<'a>( + metric: &'a Metric, additional_tags: &'a SharedTagSet, tags_deduplicator: &'a mut ReusableDeduplicator, +) -> impl Iterator { + let chained_tags = metric + .context() + .tags() + .into_iter() + .chain(additional_tags) + .chain(metric.context().origin_tags()); + + tags_deduplicator.deduplicated(chained_tags) +} + +#[cfg(test)] +mod tests { + use std::{sync::Arc, time::Duration}; + + use saluki_common::iter::ReusableDeduplicator; + use saluki_context::{tags::SharedTagSet, Context}; + use saluki_core::data_model::event::metric::{Metric, MetricMetadata, MetricOrigin, MetricValues}; + use serde_json::Value as JsonValue; + use stringtheory::MetaString; + + use super::encode_series_metric; + + fn encode_one(metric: &Metric) -> JsonValue { + let mut buf = Vec::new(); + let host_tags = SharedTagSet::default(); + let mut tags_deduplicator = ReusableDeduplicator::new(); + encode_series_metric(metric, &host_tags, &mut buf, &mut tags_deduplicator) + 
.expect("encode_series_metric should succeed"); + serde_json::from_slice(&buf).expect("encoder produced invalid JSON") + } + + #[test] + fn basic_payload_shape() { + // Each metric variant maps to the right `type` string, points are emitted as [ts, value] tuples, + // and `interval`/`host` are always present (zero/empty when not set). + let counter = Metric::counter("my.count", 5.0); + let counter_json = encode_one(&counter); + assert_eq!(counter_json["metric"], "my.count"); + assert_eq!(counter_json["type"], "count"); + assert_eq!(counter_json["interval"], 0); + assert_eq!(counter_json["host"], ""); + assert_eq!(counter_json["tags"], JsonValue::Array(vec![])); + let points = counter_json["points"].as_array().expect("points is array"); + assert_eq!(points.len(), 1); + assert_eq!(points[0][0], 0); + assert_eq!(points[0][1], 5.0); + // Optional fields must be absent when not set. + assert!(counter_json.get("unit").is_none()); + assert!(counter_json.get("source_type_name").is_none()); + assert!(counter_json.get("device").is_none()); + + let rate = Metric::rate("my.rate", 30.0, Duration::from_secs(10)); + let rate_json = encode_one(&rate); + assert_eq!(rate_json["type"], "rate"); + assert_eq!(rate_json["interval"], 10); + // Rate value scaled by interval seconds: 30 / 10 = 3. + let rate_points = rate_json["points"].as_array().expect("rate points is array"); + assert_eq!(rate_points[0][1], 3.0); + + let gauge = Metric::gauge("my.gauge", 42.0); + let gauge_json = encode_one(&gauge); + assert_eq!(gauge_json["type"], "gauge"); + + // Sets are encoded as gauges with the set cardinality as the value, consistent with V2. 
+ let set = Metric::set("my.set", "alpha"); + let set_json = encode_one(&set); + assert_eq!(set_json["type"], "gauge"); + let set_points = set_json["points"].as_array().expect("set points is array"); + assert_eq!(set_points[0][1], 1.0); + } + + #[test] + fn unit_and_hostname_emitted() { + let context = Context::from_static_parts("my.timer.avg", &[]); + let metadata = MetricMetadata::default() + .with_unit(MetaString::from_static("millisecond")) + .with_hostname(Some(Arc::from("host-1"))); + let gauge = Metric::from_parts(context, MetricValues::gauge([1.0_f64]), metadata); + + let json = encode_one(&gauge); + assert_eq!(json["unit"], "millisecond"); + assert_eq!(json["host"], "host-1"); + } + + #[test] + fn device_tag_extraction() { + // A `device:` tag is extracted into the `device` JSON field and dropped from `tags`. + let context = Context::from_static_parts("my.metric", &["device:eth0", "env:prod"]); + let counter = Metric::from_parts(context, MetricValues::counter([1.0_f64]), MetricMetadata::default()); + + let json = encode_one(&counter); + assert_eq!(json["device"], "eth0"); + let tags = json["tags"].as_array().expect("tags is array"); + let tag_strs: Vec<&str> = tags.iter().filter_map(|v| v.as_str()).collect(); + assert!( + !tag_strs.iter().any(|t| t.starts_with("device:")), + "device tag must be removed: {:?}", + tag_strs + ); + assert!(tag_strs.contains(&"env:prod")); + } + + #[test] + fn source_type_name_from_source_type_origin() { + let context = Context::from_static_parts("my.metric", &[]); + let metadata = MetricMetadata::default().with_source_type(Some(Arc::from("integration_x"))); + let counter = Metric::from_parts(context, MetricValues::counter([1.0_f64]), metadata); + + let json = encode_one(&counter); + assert_eq!(json["source_type_name"], "integration_x"); + } + + #[test] + fn origin_metadata_dropped() { + // OriginMetadata is V2-protobuf only; V1 must drop it. 
+ let context = Context::from_static_parts("my.metric", &[]); + let metadata = MetricMetadata::default().with_origin(Some(MetricOrigin::dogstatsd())); + let counter = Metric::from_parts(context, MetricValues::counter([1.0_f64]), metadata); + + let json = encode_one(&counter); + assert!(json.get("source_type_name").is_none()); + } + + #[test] + fn dd_internal_resource_dropped() { + // `dd.internal.resource` is V2-protobuf-only; V1 must drop these tags silently. + let context = Context::from_static_parts("my.metric", &["dd.internal.resource:host:foo", "env:prod"]); + let counter = Metric::from_parts(context, MetricValues::counter([1.0_f64]), MetricMetadata::default()); + + let json = encode_one(&counter); + let tags = json["tags"].as_array().expect("tags is array"); + let tag_strs: Vec<&str> = tags.iter().filter_map(|v| v.as_str()).collect(); + assert!( + !tag_strs.iter().any(|t| t.starts_with("dd.internal.resource:")), + "dd.internal.resource tag must be dropped: {:?}", + tag_strs + ); + assert!(tag_strs.contains(&"env:prod")); + } +} diff --git a/lib/saluki-components/src/encoders/datadog/metrics/v2/constants.rs b/lib/saluki-components/src/encoders/datadog/metrics/v2/constants.rs new file mode 100644 index 00000000000..4b9bdc85570 --- /dev/null +++ b/lib/saluki-components/src/encoders/datadog/metrics/v2/constants.rs @@ -0,0 +1,42 @@ +pub const SERIES_V2_COMPRESSED_SIZE_LIMIT: usize = 512_000; // 500 KiB +pub const SERIES_V2_UNCOMPRESSED_SIZE_LIMIT: usize = 5_242_880; // 5 MiB + +// Protocol Buffers field numbers for series and sketch payload messages in the V2 format. +// +// These field numbers come from the Protocol Buffers definitions in `lib/protos/datadog/proto/agent-payload/agent_payload.proto`. 
+pub const RESOURCES_TYPE_FIELD_NUMBER: u32 = 1; +pub const RESOURCES_NAME_FIELD_NUMBER: u32 = 2; + +pub const METADATA_ORIGIN_FIELD_NUMBER: u32 = 1; + +pub const ORIGIN_ORIGIN_PRODUCT_FIELD_NUMBER: u32 = 4; +pub const ORIGIN_ORIGIN_CATEGORY_FIELD_NUMBER: u32 = 5; +pub const ORIGIN_ORIGIN_SERVICE_FIELD_NUMBER: u32 = 6; + +pub const METRIC_POINT_VALUE_FIELD_NUMBER: u32 = 1; +pub const METRIC_POINT_TIMESTAMP_FIELD_NUMBER: u32 = 2; + +pub const DOGSKETCH_TS_FIELD_NUMBER: u32 = 1; +pub const DOGSKETCH_CNT_FIELD_NUMBER: u32 = 2; +pub const DOGSKETCH_MIN_FIELD_NUMBER: u32 = 3; +pub const DOGSKETCH_MAX_FIELD_NUMBER: u32 = 4; +pub const DOGSKETCH_AVG_FIELD_NUMBER: u32 = 5; +pub const DOGSKETCH_SUM_FIELD_NUMBER: u32 = 6; +pub const DOGSKETCH_K_FIELD_NUMBER: u32 = 7; +pub const DOGSKETCH_N_FIELD_NUMBER: u32 = 8; + +pub const SERIES_RESOURCES_FIELD_NUMBER: u32 = 1; +pub const SERIES_METRIC_FIELD_NUMBER: u32 = 2; +pub const SERIES_TAGS_FIELD_NUMBER: u32 = 3; +pub const SERIES_POINTS_FIELD_NUMBER: u32 = 4; +pub const SERIES_TYPE_FIELD_NUMBER: u32 = 5; +pub const SERIES_UNIT_FIELD_NUMBER: u32 = 6; +pub const SERIES_SOURCE_TYPE_NAME_FIELD_NUMBER: u32 = 7; +pub const SERIES_INTERVAL_FIELD_NUMBER: u32 = 8; +pub const SERIES_METADATA_FIELD_NUMBER: u32 = 9; + +pub const SKETCH_METRIC_FIELD_NUMBER: u32 = 1; +pub const SKETCH_HOST_FIELD_NUMBER: u32 = 2; +pub const SKETCH_TAGS_FIELD_NUMBER: u32 = 4; +pub const SKETCH_DOGSKETCHES_FIELD_NUMBER: u32 = 7; +pub const SKETCH_METADATA_FIELD_NUMBER: u32 = 8; diff --git a/lib/saluki-components/src/encoders/datadog/metrics/v2/mod.rs b/lib/saluki-components/src/encoders/datadog/metrics/v2/mod.rs new file mode 100644 index 00000000000..f50b4fc5ed4 --- /dev/null +++ b/lib/saluki-components/src/encoders/datadog/metrics/v2/mod.rs @@ -0,0 +1,793 @@ +use std::{fmt, num::NonZeroU64}; + +use datadog_protos::metrics as proto; +use ddsketch::DDSketch; +use http::{uri::PathAndQuery, HeaderValue, Method, Uri}; +use protobuf::{rt::WireType, CodedOutputStream, 
Enum}; +use saluki_common::iter::ReusableDeduplicator; +use saluki_context::tags::{SharedTagSet, Tag}; +use saluki_core::data_model::event::metric::{Metric, MetricOrigin, MetricValues}; +use saluki_error::GenericError; +use tracing::warn; + +use super::{ + endpoint::{EndpointConfiguration, MetricsEndpoint}, + v1, +}; +use crate::common::datadog::{ + io::RB_BUFFER_CHUNK_SIZE, + request_builder::{EndpointEncoder, RequestBuilder}, + DEFAULT_INTAKE_COMPRESSED_SIZE_LIMIT, DEFAULT_INTAKE_UNCOMPRESSED_SIZE_LIMIT, METRICS_SERIES_V1_PATH, + METRICS_SERIES_V2_PATH, METRICS_SKETCHES_PATH, +}; + +mod constants; +pub(super) use constants::{SERIES_V2_COMPRESSED_SIZE_LIMIT, SERIES_V2_UNCOMPRESSED_SIZE_LIMIT}; + +/// Creates a V2 request builder for the given endpoint. +/// +/// # Errors +/// +/// If the request builder cannot be created, an error is returned. +pub async fn create_v2_request_builder( + endpoint: MetricsEndpoint, endpoint_config: &EndpointConfiguration, +) -> Result, GenericError> { + let encoder = + MetricsEndpointEncoder::from_endpoint(endpoint).with_additional_tags(endpoint_config.additional_tags().clone()); + + let mut request_builder = + RequestBuilder::new(encoder, endpoint_config.compression_scheme(), RB_BUFFER_CHUNK_SIZE).await?; + request_builder.with_max_inputs_per_payload(endpoint_config.max_metrics_per_payload()); + + Ok(request_builder) +} + +/// An encoder for V2 metrics. +/// +/// This also handles the legacy V1 JSON series endpoint when `use_v2_api_series` is disabled. +#[derive(Debug)] +pub struct MetricsEndpointEncoder { + endpoint: MetricsEndpoint, + primary_scratch_buf: Vec, + secondary_scratch_buf: Vec, + packed_scratch_buf: Vec, + additional_tags: SharedTagSet, + tags_deduplicator: ReusableDeduplicator, +} + +impl MetricsEndpointEncoder { + /// Creates a new `MetricsEndpointEncoder` for the given endpoint. 
+ pub fn from_endpoint(endpoint: MetricsEndpoint) -> Self { + Self { + endpoint, + primary_scratch_buf: Vec::new(), + secondary_scratch_buf: Vec::new(), + packed_scratch_buf: Vec::new(), + additional_tags: SharedTagSet::default(), + tags_deduplicator: ReusableDeduplicator::new(), + } + } + + /// Sets the additional tags to be included with every metric encoded by this encoder. + /// + /// These tags are added in a deduplicated fashion, the same as instrumented tags and origin tags. This is an + /// optimized codepath for tag inclusion in high-volume scenarios, where creating new additional contexts + /// through the traditional means (for example, `ContextResolver`) would be too expensive. + pub fn with_additional_tags(mut self, additional_tags: SharedTagSet) -> Self { + self.additional_tags = additional_tags; + self + } +} + +/// Error returned when a metric fails to encode for either the V1 JSON or V2 protobuf intake. +#[derive(Debug)] +pub enum MetricsEncodeError { + /// Protobuf encoding failed. + Protobuf(protobuf::Error), + + /// JSON encoding failed. 
+ Json(serde_json::Error), +} + +impl fmt::Display for MetricsEncodeError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Protobuf(e) => write!(f, "protobuf encode error: {}", e), + Self::Json(e) => write!(f, "json encode error: {}", e), + } + } +} + +impl std::error::Error for MetricsEncodeError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self { + Self::Protobuf(e) => Some(e), + Self::Json(e) => Some(e), + } + } +} + +impl From for MetricsEncodeError { + fn from(value: protobuf::Error) -> Self { + Self::Protobuf(value) + } +} + +impl From for MetricsEncodeError { + fn from(value: serde_json::Error) -> Self { + Self::Json(value) + } +} + +impl EndpointEncoder for MetricsEndpointEncoder { + type Input = Metric; + type EncodeError = MetricsEncodeError; + + fn encoder_name() -> &'static str { + "metrics" + } + + fn compressed_size_limit(&self) -> usize { + match self.endpoint { + MetricsEndpoint::SeriesV1 => v1::SERIES_COMPRESSED_SIZE_LIMIT, + MetricsEndpoint::SeriesV2 => constants::SERIES_V2_COMPRESSED_SIZE_LIMIT, + MetricsEndpoint::Sketches => DEFAULT_INTAKE_COMPRESSED_SIZE_LIMIT, + } + } + + fn uncompressed_size_limit(&self) -> usize { + match self.endpoint { + MetricsEndpoint::SeriesV1 => v1::SERIES_UNCOMPRESSED_SIZE_LIMIT, + MetricsEndpoint::SeriesV2 => constants::SERIES_V2_UNCOMPRESSED_SIZE_LIMIT, + MetricsEndpoint::Sketches => DEFAULT_INTAKE_UNCOMPRESSED_SIZE_LIMIT, + } + } + + fn input_data_point_count(&self, input: &Self::Input) -> usize { + input.values().len() + } + + fn is_valid_input(&self, input: &Self::Input) -> bool { + let is_series_input = matches!( + input.values(), + MetricValues::Counter(..) | MetricValues::Rate(..) | MetricValues::Gauge(..) | MetricValues::Set(..) 
+ ); + + match self.endpoint { + MetricsEndpoint::SeriesV1 | MetricsEndpoint::SeriesV2 => is_series_input, + MetricsEndpoint::Sketches => !is_series_input, + } + } + + fn get_payload_prefix(&self) -> Option<&'static [u8]> { + match self.endpoint { + MetricsEndpoint::SeriesV1 => Some(v1::SERIES_PAYLOAD_PREFIX), + _ => None, + } + } + + fn get_payload_suffix(&self) -> Option<&'static [u8]> { + match self.endpoint { + MetricsEndpoint::SeriesV1 => Some(v1::SERIES_PAYLOAD_SUFFIX), + _ => None, + } + } + + fn get_input_separator(&self) -> Option<&'static [u8]> { + match self.endpoint { + MetricsEndpoint::SeriesV1 => Some(v1::SERIES_INPUT_SEPARATOR), + _ => None, + } + } + + fn encode(&mut self, input: &Self::Input, buffer: &mut Vec) -> Result<(), Self::EncodeError> { + match self.endpoint { + MetricsEndpoint::SeriesV1 => { + v1::encode_series_metric(input, &self.additional_tags, buffer, &mut self.tags_deduplicator)?; + Ok(()) + } + MetricsEndpoint::SeriesV2 | MetricsEndpoint::Sketches => { + // NOTE: We're passing _four_ buffers to `encode_single_metric`, which is a lot, but with good reason. + encode_single_metric( + input, + &self.additional_tags, + buffer, + &mut self.primary_scratch_buf, + &mut self.secondary_scratch_buf, + &mut self.packed_scratch_buf, + &mut self.tags_deduplicator, + )?; + + Ok(()) + } + } + } + + fn endpoint_uri(&self) -> Uri { + match self.endpoint { + MetricsEndpoint::SeriesV1 => PathAndQuery::from_static(METRICS_SERIES_V1_PATH).into(), + MetricsEndpoint::SeriesV2 => PathAndQuery::from_static(METRICS_SERIES_V2_PATH).into(), + MetricsEndpoint::Sketches => PathAndQuery::from_static(METRICS_SKETCHES_PATH).into(), + } + } + + fn endpoint_method(&self) -> Method { + // All endpoints use POST. 
+ Method::POST + } + + fn content_type(&self) -> HeaderValue { + match self.endpoint { + MetricsEndpoint::SeriesV1 => v1::CONTENT_TYPE.clone(), + MetricsEndpoint::SeriesV2 | MetricsEndpoint::Sketches => HeaderValue::from_static("application/x-protobuf"), + } + } +} + +fn field_number_for_metric_type(metric: &Metric) -> u32 { + match metric.values() { + MetricValues::Counter(..) | MetricValues::Rate(..) | MetricValues::Gauge(..) | MetricValues::Set(..) => 1, + MetricValues::Histogram(..) | MetricValues::Distribution(..) => 1, + } +} + +fn get_message_size(raw_msg_size: usize) -> Result { + const MAX_MESSAGE_SIZE: u64 = i32::MAX as u64; + + // Individual messages cannot be larger than `i32::MAX`, so check that here before proceeding. + if raw_msg_size as u64 > MAX_MESSAGE_SIZE { + return Err(std::io::Error::other("message size exceeds limit (2147483648 bytes)").into()); + } + + Ok(raw_msg_size as u32) +} + +fn get_message_size_from_buffer(buf: &[u8]) -> Result { + get_message_size(buf.len()) +} + +fn encode_single_metric( + metric: &Metric, additional_tags: &SharedTagSet, output_buf: &mut Vec, primary_scratch_buf: &mut Vec, + secondary_scratch_buf: &mut Vec, packed_scratch_buf: &mut Vec, + tags_deduplicator: &mut ReusableDeduplicator, +) -> Result<(), protobuf::Error> { + let mut output_stream = CodedOutputStream::vec(output_buf); + let field_number = field_number_for_metric_type(metric); + + write_nested_message(&mut output_stream, primary_scratch_buf, field_number, |os| { + // Depending on the metric type, we write out the appropriate fields. + match metric.values() { + MetricValues::Counter(..) | MetricValues::Rate(..) | MetricValues::Gauge(..) | MetricValues::Set(..) => { + encode_series_metric(metric, additional_tags, os, secondary_scratch_buf, tags_deduplicator) + } + MetricValues::Histogram(..) | MetricValues::Distribution(..) 
=> encode_sketch_metric( + metric, + additional_tags, + os, + secondary_scratch_buf, + packed_scratch_buf, + tags_deduplicator, + ), + } + }) +} + +fn encode_series_metric( + metric: &Metric, additional_tags: &SharedTagSet, output_stream: &mut CodedOutputStream<'_>, + scratch_buf: &mut Vec, tags_deduplicator: &mut ReusableDeduplicator, +) -> Result<(), protobuf::Error> { + // Write the metric name and tags. + output_stream.write_string(constants::SERIES_METRIC_FIELD_NUMBER, metric.context().name())?; + + let deduplicated_tags = get_deduplicated_tags(metric, additional_tags, tags_deduplicator); + write_series_tags(deduplicated_tags, output_stream, scratch_buf)?; + + // Set the host resource. + write_resource( + output_stream, + scratch_buf, + "host", + metric.metadata().hostname().unwrap_or_default(), + )?; + + // Write the origin metadata, if it exists. + if let Some(origin) = metric.metadata().origin() { + match origin { + MetricOrigin::SourceType(source_type) => { + output_stream.write_string(constants::SERIES_SOURCE_TYPE_NAME_FIELD_NUMBER, source_type.as_ref())?; + } + MetricOrigin::OriginMetadata { + product, + subproduct, + product_detail, + } => { + write_origin_metadata( + output_stream, + scratch_buf, + constants::SERIES_METADATA_FIELD_NUMBER, + *product, + *subproduct, + *product_detail, + )?; + } + } + } + + // Now write out our metric type, points, and interval (if applicable). 
+ let (metric_type, points, maybe_interval) = match metric.values() { + MetricValues::Counter(points) => (proto::MetricType::COUNT, points.into_iter(), None), + MetricValues::Rate(points, interval) => (proto::MetricType::RATE, points.into_iter(), Some(interval)), + MetricValues::Gauge(points) => (proto::MetricType::GAUGE, points.into_iter(), None), + MetricValues::Set(points) => (proto::MetricType::GAUGE, points.into_iter(), None), + _ => unreachable!(), + }; + + output_stream.write_enum(constants::SERIES_TYPE_FIELD_NUMBER, metric_type.value())?; + + if let Some(unit) = metric.metadata().unit() { + output_stream.write_string(constants::SERIES_UNIT_FIELD_NUMBER, unit)?; + } + + for (timestamp, value) in points { + // If this is a rate metric, scale our value by the interval, in seconds. + let value = maybe_interval + .map(|interval| value / interval.as_secs_f64()) + .unwrap_or(value); + let timestamp = timestamp.map(|ts| ts.get()).unwrap_or(0) as i64; + + write_point(output_stream, scratch_buf, value, timestamp)?; + } + + if let Some(interval) = maybe_interval { + output_stream.write_int64(constants::SERIES_INTERVAL_FIELD_NUMBER, interval.as_secs() as i64)?; + } + + Ok(()) +} + +fn encode_sketch_metric( + metric: &Metric, additional_tags: &SharedTagSet, output_stream: &mut CodedOutputStream<'_>, + scratch_buf: &mut Vec, packed_scratch_buf: &mut Vec, tags_deduplicator: &mut ReusableDeduplicator, +) -> Result<(), protobuf::Error> { + // Write the metric name and tags. + output_stream.write_string(constants::SKETCH_METRIC_FIELD_NUMBER, metric.context().name())?; + + let deduplicated_tags = get_deduplicated_tags(metric, additional_tags, tags_deduplicator); + write_sketch_tags(deduplicated_tags, output_stream, scratch_buf)?; + + // Write the host. + output_stream.write_string( + constants::SKETCH_HOST_FIELD_NUMBER, + metric.metadata().hostname().unwrap_or_default(), + )?; + + // Set the origin metadata, if it exists. 
+ if let Some(MetricOrigin::OriginMetadata { + product, + subproduct, + product_detail, + }) = metric.metadata().origin() + { + write_origin_metadata( + output_stream, + scratch_buf, + constants::SKETCH_METADATA_FIELD_NUMBER, + *product, + *subproduct, + *product_detail, + )?; + } + + // Write out our sketches. + match metric.values() { + MetricValues::Distribution(sketches) => { + for (timestamp, value) in sketches { + write_dogsketch(output_stream, scratch_buf, packed_scratch_buf, timestamp, value)?; + } + } + MetricValues::Histogram(points) => { + for (timestamp, histogram) in points { + // We convert histograms to sketches to be able to write them out in the payload. + let mut ddsketch = DDSketch::default(); + for sample in histogram.samples() { + ddsketch.insert_n(sample.value.into_inner(), sample.weight.0 as u64); + } + + write_dogsketch(output_stream, scratch_buf, packed_scratch_buf, timestamp, &ddsketch)?; + } + } + _ => unreachable!(), + } + + Ok(()) +} + +fn write_resource( + output_stream: &mut CodedOutputStream<'_>, scratch_buf: &mut Vec, resource_type: &str, resource_name: &str, +) -> Result<(), protobuf::Error> { + write_nested_message( + output_stream, + scratch_buf, + constants::SERIES_RESOURCES_FIELD_NUMBER, + |os| { + os.write_string(constants::RESOURCES_TYPE_FIELD_NUMBER, resource_type)?; + os.write_string(constants::RESOURCES_NAME_FIELD_NUMBER, resource_name) + }, + ) +} + +fn write_origin_metadata( + output_stream: &mut CodedOutputStream<'_>, scratch_buf: &mut Vec, field_number: u32, origin_product: u32, + origin_category: u32, origin_service: u32, +) -> Result<(), protobuf::Error> { + // TODO: Figure out how to cleanly use `write_nested_message` here. 
+ + scratch_buf.clear(); + + { + let mut origin_output_stream = CodedOutputStream::vec(scratch_buf); + origin_output_stream.write_uint32(constants::ORIGIN_ORIGIN_PRODUCT_FIELD_NUMBER, origin_product)?; + origin_output_stream.write_uint32(constants::ORIGIN_ORIGIN_CATEGORY_FIELD_NUMBER, origin_category)?; + origin_output_stream.write_uint32(constants::ORIGIN_ORIGIN_SERVICE_FIELD_NUMBER, origin_service)?; + origin_output_stream.flush()?; + } + + // We do a little song and dance here because the `Origin` message is embedded inside of `Metadata`, so we need to + // write out field numbers/length delimiters in order: `Metadata`, and then `Origin`... but we write out origin + // message to the scratch buffer first... so we write out our `Metadata` preamble stuff to get its length, and then + // use that in conjunction with the `Origin` message size to write out the full `Metadata` message. + let origin_message_size = get_message_size_from_buffer(scratch_buf)?; + + let mut metadata_preamble_buf = [0; 64]; + let metadata_preamble_len = { + let mut metadata_output_stream = CodedOutputStream::bytes(&mut metadata_preamble_buf[..]); + metadata_output_stream.write_tag(constants::METADATA_ORIGIN_FIELD_NUMBER, WireType::LengthDelimited)?; + metadata_output_stream.write_raw_varint32(origin_message_size)?; + metadata_output_stream.flush()?; + metadata_output_stream.total_bytes_written() as usize + }; + + let metadata_message_size = get_message_size(scratch_buf.len() + metadata_preamble_len)?; + + output_stream.write_tag(field_number, WireType::LengthDelimited)?; + output_stream.write_raw_varint32(metadata_message_size)?; + output_stream.write_raw_bytes(&metadata_preamble_buf[..metadata_preamble_len])?; + output_stream.write_raw_bytes(scratch_buf) +} + +fn write_point( + output_stream: &mut CodedOutputStream<'_>, scratch_buf: &mut Vec, value: f64, timestamp: i64, +) -> Result<(), protobuf::Error> { + write_nested_message( + output_stream, + scratch_buf, + 
constants::SERIES_POINTS_FIELD_NUMBER, + |os| { + os.write_double(constants::METRIC_POINT_VALUE_FIELD_NUMBER, value)?; + os.write_int64(constants::METRIC_POINT_TIMESTAMP_FIELD_NUMBER, timestamp) + }, + ) +} + +fn write_dogsketch( + output_stream: &mut CodedOutputStream<'_>, scratch_buf: &mut Vec, packed_scratch_buf: &mut Vec, + timestamp: Option, sketch: &DDSketch, +) -> Result<(), protobuf::Error> { + // If the sketch is empty, we don't write it out. + if sketch.is_empty() { + warn!("Attempted to write an empty sketch to sketches payload, skipping."); + return Ok(()); + } + + write_nested_message( + output_stream, + scratch_buf, + constants::SKETCH_DOGSKETCHES_FIELD_NUMBER, + |os| { + os.write_int64( + constants::DOGSKETCH_TS_FIELD_NUMBER, + timestamp.map_or(0, |ts| ts.get() as i64), + )?; + os.write_int64(constants::DOGSKETCH_CNT_FIELD_NUMBER, sketch.count() as i64)?; + os.write_double(constants::DOGSKETCH_MIN_FIELD_NUMBER, sketch.min().unwrap())?; + os.write_double(constants::DOGSKETCH_MAX_FIELD_NUMBER, sketch.max().unwrap())?; + os.write_double(constants::DOGSKETCH_AVG_FIELD_NUMBER, sketch.avg().unwrap())?; + os.write_double(constants::DOGSKETCH_SUM_FIELD_NUMBER, sketch.sum().unwrap())?; + + let bin_keys = sketch.bins().iter().map(|bin| bin.key()); + write_repeated_packed_from_iter( + os, + packed_scratch_buf, + constants::DOGSKETCH_K_FIELD_NUMBER, + bin_keys, + |inner_os, value| inner_os.write_sint32_no_tag(value), + )?; + + let bin_counts = sketch.bins().iter().map(|bin| bin.count()); + write_repeated_packed_from_iter( + os, + packed_scratch_buf, + constants::DOGSKETCH_N_FIELD_NUMBER, + bin_counts, + |inner_os, value| inner_os.write_uint32_no_tag(value), + ) + }, + ) +} + +fn get_deduplicated_tags<'a>( + metric: &'a Metric, additional_tags: &'a SharedTagSet, tags_deduplicator: &'a mut ReusableDeduplicator, +) -> impl Iterator { + let chained_tags = metric + .context() + .tags() + .into_iter() + .chain(additional_tags) + .chain(metric.context().origin_tags()); 
+ + tags_deduplicator.deduplicated(chained_tags) +} + +fn write_tags<'a, I, F>( + tags: I, output_stream: &mut CodedOutputStream<'_>, scratch_buf: &mut Vec, tag_encoder: F, +) -> Result<(), protobuf::Error> +where + I: Iterator, + F: Fn(&Tag, &mut CodedOutputStream<'_>, &mut Vec) -> Result<(), protobuf::Error>, +{ + for tag in tags { + tag_encoder(tag, output_stream, scratch_buf)?; + } + + Ok(()) +} + +fn write_series_tags<'a, I>( + tags: I, output_stream: &mut CodedOutputStream<'_>, scratch_buf: &mut Vec, +) -> Result<(), protobuf::Error> +where + I: Iterator, +{ + write_tags(tags, output_stream, scratch_buf, |tag, os, buf| { + // If this is a resource tag, we'll convert it directly to a resource entry. + if tag.name() == "dd.internal.resource" { + if let Some((resource_type, resource_name)) = tag.value().and_then(|s| s.split_once(':')) { + write_resource(os, buf, resource_type, resource_name) + } else { + Ok(()) + } + } else { + // We're dealing with a normal tag. + os.write_string(constants::SERIES_TAGS_FIELD_NUMBER, tag.as_str()) + } + }) +} + +fn write_sketch_tags<'a, I>( + tags: I, output_stream: &mut CodedOutputStream<'_>, scratch_buf: &mut Vec, +) -> Result<(), protobuf::Error> +where + I: Iterator, +{ + write_tags(tags, output_stream, scratch_buf, |tag, os, _buf| { + // We always write the tags as-is, without any special handling for resource tags. 
+ os.write_string(constants::SKETCH_TAGS_FIELD_NUMBER, tag.as_str()) + }) +} + +fn write_nested_message( + output_stream: &mut CodedOutputStream<'_>, scratch_buf: &mut Vec, field_number: u32, writer: F, +) -> Result<(), protobuf::Error> +where + F: FnOnce(&mut CodedOutputStream<'_>) -> Result<(), protobuf::Error>, +{ + scratch_buf.clear(); + + { + let mut nested_output_stream = CodedOutputStream::vec(scratch_buf); + writer(&mut nested_output_stream)?; + nested_output_stream.flush()?; + } + + output_stream.write_tag(field_number, WireType::LengthDelimited)?; + + let nested_message_size = get_message_size_from_buffer(scratch_buf)?; + output_stream.write_raw_varint32(nested_message_size)?; + output_stream.write_raw_bytes(scratch_buf) +} + +fn write_repeated_packed_from_iter( + output_stream: &mut CodedOutputStream<'_>, scratch_buf: &mut Vec, field_number: u32, values: I, writer: F, +) -> Result<(), protobuf::Error> +where + I: Iterator, + F: Fn(&mut CodedOutputStream<'_>, T) -> Result<(), protobuf::Error>, +{ + // This is a helper function that lets us write out a packed repeated field from an iterator of values. + // `CodedOutputStream` has similar functions to handle this, but they require a slice of values, which would mean we + // need to either allocate a new vector each time to hold the values, or thread through two additional vectors (one + // for `i32`, one for `u32`) to reuse the allocation... both of which are not great options. + // + // We've simply opted to pass through a _single_ vector that we can reuse, and write the packed values directly to + // that, almost identically to how `CodedOutputStream::write_repeated_packed_*` methods would do it. 
+ + scratch_buf.clear(); + + { + let mut packed_output_stream = CodedOutputStream::vec(scratch_buf); + for value in values { + writer(&mut packed_output_stream, value)?; + } + packed_output_stream.flush()?; + } + + let data_size = get_message_size_from_buffer(scratch_buf)?; + + output_stream.write_tag(field_number, WireType::LengthDelimited)?; + output_stream.write_raw_varint32(data_size)?; + output_stream.write_raw_bytes(scratch_buf) +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use protobuf::CodedOutputStream; + use saluki_common::iter::ReusableDeduplicator; + use saluki_context::{tags::SharedTagSet, Context}; + use saluki_core::data_model::event::metric::{Metric, MetricMetadata, MetricValues}; + use stringtheory::MetaString; + + use super::{encode_series_metric, encode_sketch_metric, v1, MetricsEndpoint, MetricsEndpointEncoder}; + use crate::common::datadog::request_builder::EndpointEncoder as _; + + #[test] + fn histogram_vs_sketch_identical_payload() { + // For the same exact set of points, we should be able to construct either a histogram or distribution from + // those points, and when encoded as a sketch payload, end up with the same exact payload. + // + // They should be identical because the goal is that we convert histograms into sketches in the same way we + // would have originally constructed a sketch based on the same samples. 
+ let samples = &[1.0, 2.0, 3.0, 4.0, 5.0]; + let histogram = Metric::histogram("simple_samples", samples); + let distribution = Metric::distribution("simple_samples", samples); + let host_tags = SharedTagSet::default(); + + let mut buf1 = Vec::new(); + let mut buf2 = Vec::new(); + let mut tags_deduplicator = ReusableDeduplicator::new(); + + let mut histogram_payload = Vec::new(); + { + let mut histogram_writer = CodedOutputStream::vec(&mut histogram_payload); + encode_sketch_metric( + &histogram, + &host_tags, + &mut histogram_writer, + &mut buf1, + &mut buf2, + &mut tags_deduplicator, + ) + .expect("Failed to encode histogram as sketch"); + } + + let mut distribution_payload = Vec::new(); + { + let mut distribution_writer = CodedOutputStream::vec(&mut distribution_payload); + encode_sketch_metric( + &distribution, + &host_tags, + &mut distribution_writer, + &mut buf1, + &mut buf2, + &mut tags_deduplicator, + ) + .expect("Failed to encode distribution as sketch"); + } + + assert_eq!(histogram_payload, distribution_payload); + } + + #[test] + fn input_valid() { + // Our encoder should always consider series metrics valid when set to either series endpoint, and similarly + // for sketch metrics when set to the sketches endpoint. 
+ let counter = Metric::counter("counter", 1.0); + let rate = Metric::rate("rate", 1.0, Duration::from_secs(1)); + let gauge = Metric::gauge("gauge", 1.0); + let set = Metric::set("set", "foo"); + let histogram = Metric::histogram("histogram", [1.0, 2.0, 3.0]); + let distribution = Metric::distribution("distribution", [1.0, 2.0, 3.0]); + + let series_v1 = MetricsEndpointEncoder::from_endpoint(MetricsEndpoint::SeriesV1); + let series_v2 = MetricsEndpointEncoder::from_endpoint(MetricsEndpoint::SeriesV2); + let sketches_endpoint = MetricsEndpointEncoder::from_endpoint(MetricsEndpoint::Sketches); + + for series_endpoint in [&series_v1, &series_v2] { + assert!(series_endpoint.is_valid_input(&counter)); + assert!(series_endpoint.is_valid_input(&rate)); + assert!(series_endpoint.is_valid_input(&gauge)); + assert!(series_endpoint.is_valid_input(&set)); + assert!(!series_endpoint.is_valid_input(&histogram)); + assert!(!series_endpoint.is_valid_input(&distribution)); + } + + assert!(!sketches_endpoint.is_valid_input(&counter)); + assert!(!sketches_endpoint.is_valid_input(&rate)); + assert!(!sketches_endpoint.is_valid_input(&gauge)); + assert!(!sketches_endpoint.is_valid_input(&set)); + assert!(sketches_endpoint.is_valid_input(&histogram)); + assert!(sketches_endpoint.is_valid_input(&distribution)); + } + + #[test] + fn input_data_point_count_tracks_metric_values() { + let counter = Metric::counter("counter", [(123, 1.0), (124, 2.0)]); + let histogram = Metric::histogram("histogram", [1.0, 2.0, 3.0]); + + let series_endpoint = MetricsEndpointEncoder::from_endpoint(MetricsEndpoint::SeriesV2); + let sketches_endpoint = MetricsEndpointEncoder::from_endpoint(MetricsEndpoint::Sketches); + + assert_eq!(series_endpoint.input_data_point_count(&counter), 2); + assert_eq!(sketches_endpoint.input_data_point_count(&histogram), 1); + } + + #[test] + fn series_metric_unit_encoded() { + // A gauge with a unit in its metadata must produce a series protobuf payload that contains the unit 
string + // in field 6 (MetricSeries.unit), which the Datadog backend already accepts. + let context = Context::from_static_parts("my.timer.avg", &[]); + let metadata = MetricMetadata::default().with_unit(MetaString::from_static("millisecond")); + let gauge = Metric::from_parts(context, MetricValues::gauge([1.0_f64]), metadata); + + let host_tags = SharedTagSet::default(); + let mut scratch_buf = Vec::new(); + let mut tags_deduplicator = ReusableDeduplicator::new(); + + let mut payload = Vec::new(); + { + let mut writer = CodedOutputStream::vec(&mut payload); + encode_series_metric( + &gauge, + &host_tags, + &mut writer, + &mut scratch_buf, + &mut tags_deduplicator, + ) + .expect("Failed to encode gauge as series metric"); + writer.flush().expect("Failed to flush"); + } + + // In the protobuf wire format, a string field with field number 6 has tag byte 0x32 ((6 << 3) | 2). + // The tag is followed by a varint length and then the UTF-8 bytes of the string. + let expected_tag: u8 = (6 << 3) | 2; // 0x32 + let expected_value = b"millisecond"; + + let tag_pos = payload + .windows(1 + 1 + expected_value.len()) + .position(|w| w[0] == expected_tag && w[1] == expected_value.len() as u8 && &w[2..] == expected_value); + + assert!( + tag_pos.is_some(), + "series payload should contain unit field (field 6 = 'millisecond'), got bytes: {:?}", + payload + ); + } + + #[test] + fn series_v1_endpoint_routing() { + // SeriesV1 advertises the V1 URI, JSON content type, and the {"series":[...]} framing. + let encoder = MetricsEndpointEncoder::from_endpoint(MetricsEndpoint::SeriesV1); + assert_eq!(encoder.endpoint_uri().path(), "/api/v1/series"); + assert_eq!(encoder.content_type(), "application/json"); + assert_eq!(encoder.get_payload_prefix(), Some(v1::SERIES_PAYLOAD_PREFIX)); + assert_eq!(encoder.get_payload_suffix(), Some(v1::SERIES_PAYLOAD_SUFFIX)); + assert_eq!(encoder.get_input_separator(), Some(v1::SERIES_INPUT_SEPARATOR)); + + // V2 series stays on protobuf with no framing. 
+ let v2 = MetricsEndpointEncoder::from_endpoint(MetricsEndpoint::SeriesV2); + assert_eq!(v2.endpoint_uri().path(), "/api/v2/series"); + assert_eq!(v2.content_type(), "application/x-protobuf"); + assert!(v2.get_payload_prefix().is_none()); + } +} diff --git a/lib/saluki-components/src/encoders/datadog/metrics/v3/constants.rs b/lib/saluki-components/src/encoders/datadog/metrics/v3/constants.rs new file mode 100644 index 00000000000..6f54ea38af4 --- /dev/null +++ b/lib/saluki-components/src/encoders/datadog/metrics/v3/constants.rs @@ -0,0 +1,29 @@ +// Protocol Buffers field numbers for series and sketch payload messages in the V3 format. +// +// These field numbers come from the Protocol Buffers definitions in `lib/protos/datadog/proto/agent-payload/intake_v3.proto`. +pub const DICT_NAME_STR_FIELD_NUMBER: u32 = 1; +pub const DICT_TAGS_STR_FIELD_NUMBER: u32 = 2; +pub const DICT_TAGSETS_FIELD_NUMBER: u32 = 3; +pub const DICT_RESOURCE_STR_FIELD_NUMBER: u32 = 4; +pub const DICT_RESOURCE_LEN_FIELD_NUMBER: u32 = 5; +pub const DICT_RESOURCE_TYPE_FIELD_NUMBER: u32 = 6; +pub const DICT_RESOURCE_NAME_FIELD_NUMBER: u32 = 7; +pub const DICT_SOURCE_TYPE_NAME_FIELD_NUMBER: u32 = 8; +pub const DICT_ORIGIN_INFO_FIELD_NUMBER: u32 = 9; +pub const TYPES_FIELD_NUMBER: u32 = 10; +pub const NAMES_FIELD_NUMBER: u32 = 11; +pub const TAGS_FIELD_NUMBER: u32 = 12; +pub const RESOURCES_FIELD_NUMBER: u32 = 13; +pub const INTERVALS_FIELD_NUMBER: u32 = 14; +pub const NUM_POINTS_FIELD_NUMBER: u32 = 15; +pub const TIMESTAMPS_FIELD_NUMBER: u32 = 16; +pub const VALS_SINT64_FIELD_NUMBER: u32 = 17; +pub const VALS_FLOAT32_FIELD_NUMBER: u32 = 18; +pub const VALS_FLOAT64_FIELD_NUMBER: u32 = 19; +pub const SKETCH_NUM_BINS_FIELD_NUMBER: u32 = 20; +pub const SKETCH_BIN_KEYS_FIELD_NUMBER: u32 = 21; +pub const SKETCH_BIN_CNTS_FIELD_NUMBER: u32 = 22; +pub const SOURCE_TYPE_NAME_FIELD_NUMBER: u32 = 23; +pub const ORIGIN_INFO_FIELD_NUMBER: u32 = 24; +pub const DICT_UNIT_STR_FIELD_NUMBER: u32 = 25; +pub const 
UNIT_REFS_FIELD_NUMBER: u32 = 26; diff --git a/lib/saluki-components/src/encoders/datadog/metrics/v3/interner.rs b/lib/saluki-components/src/encoders/datadog/metrics/v3/interner.rs new file mode 100644 index 00000000000..c0ed8219dd6 --- /dev/null +++ b/lib/saluki-components/src/encoders/datadog/metrics/v3/interner.rs @@ -0,0 +1,94 @@ +//! Generic interning for dictionary deduplication. + +use std::{borrow::Borrow, hash::Hash}; + +use saluki_common::collections::FastHashMap; + +/// Generic interning structure for dictionary deduplication. +/// +/// Assigns unique 1-based IDs to values, returning the same ID for duplicate values. +/// ID 0 is reserved for "empty/none" in the V3 format. +#[derive(Debug)] +pub struct Interner { + index: FastHashMap, + last_id: i64, +} + +impl Default for Interner { + fn default() -> Self { + Self::new() + } +} + +impl Interner { + /// Creates a new empty interner. + pub fn new() -> Self { + Self { + index: FastHashMap::default(), + last_id: 0, + } + } + + /// Gets the ID for a key, inserting it if not present. + /// + /// Returns `(id, is_new)` where `is_new` is true if the key was newly inserted. + /// IDs are 1-based (0 is reserved for empty/none values). + pub fn get_or_insert(&mut self, key: &Q) -> (i64, bool) + where + K: Borrow, + Q: ToOwned + Hash + Eq + ?Sized, + { + if let Some(&id) = self.index.get(key) { + (id, false) + } else { + self.last_id += 1; + self.index.insert(key.to_owned(), self.last_id); + (self.last_id, true) + } + } + + /// Returns the number of interned values. 
+ #[cfg(test)] + pub fn len(&self) -> usize { + self.index.len() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_interner_basic() { + let mut interner: Interner = Interner::new(); + + // First insertion returns ID 1 and is_new=true + let (id1, is_new1) = interner.get_or_insert("hello"); + assert_eq!(id1, 1); + assert!(is_new1); + + // Second insertion of same value returns same ID and is_new=false + let (id2, is_new2) = interner.get_or_insert("hello"); + assert_eq!(id2, 1); + assert!(!is_new2); + + // New value gets next ID + let (id3, is_new3) = interner.get_or_insert("world"); + assert_eq!(id3, 2); + assert!(is_new3); + + assert_eq!(interner.len(), 2); + } + + #[test] + fn test_interner_tuples() { + let mut interner: Interner<(i32, i32, i32)> = Interner::new(); + + let (id1, _) = interner.get_or_insert(&(1, 2, 3)); + let (id2, _) = interner.get_or_insert(&(1, 2, 3)); + let (id3, _) = interner.get_or_insert(&(4, 5, 6)); + + assert_eq!(id1, id2); + assert_ne!(id1, id3); + } +} diff --git a/lib/saluki-components/src/encoders/datadog/metrics/v3/mod.rs b/lib/saluki-components/src/encoders/datadog/metrics/v3/mod.rs new file mode 100644 index 00000000000..47cb2c4dc1f --- /dev/null +++ b/lib/saluki-components/src/encoders/datadog/metrics/v3/mod.rs @@ -0,0 +1,26 @@ +//! V3 columnar metrics payload encoder. +//! +//! This module implements the V3 columnar format for Datadog metrics payloads. Unlike the V2 +//! row-based protobuf format where each metric is a complete message, V3 uses a columnar layout +//! with dictionary-based string deduplication for efficient encoding. +//! +//! The key differences from V2: +//! - Dictionary deduplication for metric names, tags, resources, and origin info +//! - Delta encoding for index arrays to reduce payload size +//! - Batch encoding - all metrics must be collected before serialization +//! - Separate value columns for different numeric types (sint64, float32, float64) +//! +//! # Missing +//! +//! 
- Incrementally compressed blocks. This is a centerpiece of the implementation on the Agent side, +//! but we do this in a single shot as part of this initial implementation. + +mod constants; +mod interner; +mod payload; +mod types; +mod writer; + +pub(super) use payload::{V3EncodedRequest, V3PayloadLimits, V3PayloadRequest}; +pub use types::V3MetricType; +pub use writer::V3Writer; diff --git a/lib/saluki-components/src/encoders/datadog/metrics/v3/payload.rs b/lib/saluki-components/src/encoders/datadog/metrics/v3/payload.rs new file mode 100644 index 00000000000..9dbfce7a607 --- /dev/null +++ b/lib/saluki-components/src/encoders/datadog/metrics/v3/payload.rs @@ -0,0 +1,52 @@ +use http::Request; +use saluki_common::buf::FrozenChunkedBytesBuffer; +use saluki_core::data_model::event::metric::Metric; + +/// Limits used when building V3 metrics payloads. +#[derive(Clone, Copy, Debug)] +pub(crate) struct V3PayloadLimits { + pub(crate) max_compressed_size: usize, + pub(crate) max_uncompressed_size: usize, + max_metrics_per_payload: usize, + pub(crate) max_points_per_payload: usize, +} + +impl V3PayloadLimits { + pub(crate) const fn new( + max_compressed_size: usize, max_uncompressed_size: usize, max_metrics_per_payload: usize, + max_points_per_payload: usize, + ) -> Self { + Self { + max_compressed_size, + max_uncompressed_size, + max_metrics_per_payload, + max_points_per_payload, + } + } + + pub(crate) fn request_fits(self, request: &V3EncodedRequest) -> bool { + request.compressed_len <= self.max_compressed_size && request.uncompressed_len <= self.max_uncompressed_size + } + + pub(crate) fn point_count_fits(self, count: usize) -> bool { + count <= self.max_points_per_payload + } + + pub(crate) fn should_flush_metric_count_limit(self, metrics: &[Metric]) -> bool { + metrics.len() >= self.max_metrics_per_payload + } +} + +/// Encoded V3 request with measured payload sizes. 
+pub(crate) struct V3EncodedRequest {
+    pub(crate) request: Request<FrozenChunkedBytesBuffer>,
+    pub(crate) compressed_len: usize,
+    pub(crate) uncompressed_len: usize,
+}
+
+/// V3 payload request ready to send with telemetry counts.
+pub(crate) struct V3PayloadRequest {
+    pub(crate) request: Request<FrozenChunkedBytesBuffer>,
+    pub(crate) event_count: usize,
+    pub(crate) data_point_count: usize,
+}
diff --git a/lib/saluki-components/src/encoders/datadog/metrics/v3/types.rs b/lib/saluki-components/src/encoders/datadog/metrics/v3/types.rs
new file mode 100644
index 00000000000..be332b44a22
--- /dev/null
+++ b/lib/saluki-components/src/encoders/datadog/metrics/v3/types.rs
@@ -0,0 +1,215 @@
+//! V3 payload metric/value type definitions and value-type selection for point compaction.
+
+/// V3 metric type values.
+///
+/// These match the `metricType` enum in `intake_v3.proto`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[repr(u8)]
+pub enum V3MetricType {
+    Count = 1,
+    Rate = 2,
+    Gauge = 3,
+    Sketch = 4,
+}
+
+impl V3MetricType {
+    /// Returns the numeric value for encoding in the types column.
+    pub const fn as_u64(self) -> u64 {
+        self as u64
+    }
+}
+
+/// V3 value type values.
+///
+/// These are encoded in bits 4-7 of the types column and indicate which
+/// value array contains the metric's points.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[repr(u8)]
+pub enum V3ValueType {
+    /// Value is zero, not stored explicitly.
+    Zero = 0x00,
+
+    /// Value is stored in vals_sint64.
+    Sint64 = 0x10,
+
+    /// Value is stored in vals_float32.
+    Float32 = 0x20,
+
+    /// Value is stored in vals_float64.
+    Float64 = 0x30,
+}
+
+impl V3ValueType {
+    /// Returns the numeric value for encoding in the types column.
+    pub const fn as_u64(self) -> u64 {
+        self as u64
+    }
+}
+
+/// Intermediate point classification for value type compaction.
+///
+/// This provides finer-grained classification than [`V3ValueType`] to avoid
+/// precision loss when combining different value types. In particular, it
+/// distinguishes small integers (that fit losslessly in f32) from large integers
+/// (that don't), so that mixing a large integer with a Float32 value correctly
+/// escalates to Float64 rather than silently truncating the integer.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+#[repr(u8)]
+enum PointKind {
+    /// Value is zero.
+    Zero = 0,
+    /// Integer with |v| <= 2^24, fits losslessly in both sint64 and f32.
+    Int24 = 1,
+    /// Integer with |v| > 2^24, fits in sint64 varint but NOT losslessly in f32.
+    Int48 = 2,
+    /// Non-integer (or out-of-varint-range) value exactly representable as f32.
+    Float32 = 3,
+    /// Everything else - requires full f64 precision.
+    Float64 = 4,
+}
+
+/// Maximum integer magnitude that fits losslessly in f32 (2^24).
+const F32_INT_MAX: i64 = 1 << 24;
+
+impl PointKind {
+    /// Classifies a single f64 value.
+    ///
+    /// `-0.0` compares equal to zero and is classified as [`PointKind::Zero`]; NaN fails every
+    /// equality check and falls through to [`PointKind::Float64`]; infinities round-trip through
+    /// f32 unchanged and are classified as [`PointKind::Float32`].
+    fn for_value(v: f64) -> Self {
+        if v == 0.0 {
+            return Self::Zero;
+        }
+
+        // Varint range that fits in 7 bytes or less (49 bits).
+        const VARINT_WIDTH: i32 = 7 * 7 - 1;
+        const MAX_INT: i64 = 1 << VARINT_WIDTH;
+        const MIN_INT: i64 = -MAX_INT;
+
+        let i = v as i64;
+        if (MIN_INT..MAX_INT).contains(&i) && (i as f64) == v {
+            if (-F32_INT_MAX..=F32_INT_MAX).contains(&i) {
+                return Self::Int24;
+            }
+            return Self::Int48;
+        }
+
+        if (v as f32 as f64) == v {
+            return Self::Float32;
+        }
+
+        Self::Float64
+    }
+
+    /// Combines two point kinds into the smallest kind that can represent both.
+    ///
+    /// This is `max(self, other)` in all cases **except**:
+    /// - `Int48 + Float32 = Float64` (and vice versa), because large integers
+    ///   lose precision in f32, and fractional values can't be stored as sint64.
+    fn union(self, other: Self) -> Self {
+        match (self, other) {
+            (Self::Int48, Self::Float32) | (Self::Float32, Self::Int48) => Self::Float64,
+            _ => self.max(other),
+        }
+    }
+
+    /// Converts to the wire-format value type.
+    fn to_value_type(self) -> V3ValueType {
+        match self {
+            Self::Zero => V3ValueType::Zero,
+            Self::Int24 | Self::Int48 => V3ValueType::Sint64,
+            Self::Float32 => V3ValueType::Float32,
+            Self::Float64 => V3ValueType::Float64,
+        }
+    }
+}
+
+/// Determines the best [`V3ValueType`] for a set of f64 values.
+///
+/// Uses [`PointKind`] internally to avoid precision loss when mixing
+/// large integers with fractional float32 values.
+pub(super) fn value_type_for_values(values: impl Iterator<Item = f64>) -> V3ValueType {
+    let mut kind = PointKind::Zero;
+    for v in values {
+        kind = kind.union(PointKind::for_value(v));
+    }
+    kind.to_value_type()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_point_kind_classification() {
+        // Zero
+        assert_eq!(PointKind::for_value(0.0), PointKind::Zero);
+
+        // Small integers (fit in f32)
+        assert_eq!(PointKind::for_value(100.0), PointKind::Int24);
+        assert_eq!(PointKind::for_value(-100.0), PointKind::Int24);
+        assert_eq!(PointKind::for_value((1 << 24) as f64), PointKind::Int24);
+        assert_eq!(PointKind::for_value(-((1 << 24) as f64)), PointKind::Int24);
+
+        // Large integers (|v| > 2^24, so not guaranteed to fit losslessly in f32)
+        assert_eq!(PointKind::for_value(((1 << 24) + 1) as f64), PointKind::Int48);
+        assert_eq!(PointKind::for_value((1i64 << 30) as f64), PointKind::Int48);
+
+        // Float32
+        assert_eq!(PointKind::for_value(1.5), PointKind::Float32);
+        assert_eq!(PointKind::for_value(2.75), PointKind::Float32);
+
+        // Float64
+        assert_eq!(PointKind::for_value(std::f64::consts::PI), PointKind::Float64);
+        let large = ((1i64 << 50) + 1) as f64;
+        assert_eq!(PointKind::for_value(large), PointKind::Float64);
+    }
+
+    #[test]
+    fn test_point_kind_union() {
+        // Standard widening (max)
+        assert_eq!(PointKind::Zero.union(PointKind::Int24), PointKind::Int24);
+        assert_eq!(PointKind::Int24.union(PointKind::Int48), PointKind::Int48);
+        assert_eq!(PointKind::Int24.union(PointKind::Float32), PointKind::Float32);
+        assert_eq!(PointKind::Float32.union(PointKind::Float64), PointKind::Float64);
+        assert_eq!(PointKind::Float64.union(PointKind::Zero), PointKind::Float64);
+
+        // The critical case: large integer + float32 must escalate to float64
+        assert_eq!(PointKind::Int48.union(PointKind::Float32), PointKind::Float64);
+        assert_eq!(PointKind::Float32.union(PointKind::Int48), PointKind::Float64);
+    }
+
+    #[test]
+    fn test_value_type_for_values() {
+        // All zeros
+        assert_eq!(value_type_for_values([0.0, 0.0].into_iter()), V3ValueType::Zero);
+
+        // Small integers
+        assert_eq!(value_type_for_values([100.0, 200.0].into_iter()), V3ValueType::Sint64);
+
+        // Large integers
+        assert_eq!(
+            value_type_for_values([(1i64 << 30) as f64, 200.0].into_iter()),
+            V3ValueType::Sint64
+        );
+
+        // Small integer + float32 → Float32 (safe, small int fits in f32)
+        assert_eq!(value_type_for_values([100.0, 1.5].into_iter()), V3ValueType::Float32);
+
+        // Large integer + float32 → Float64 (the bug fix!)
+        assert_eq!(
+            value_type_for_values([(1i64 << 30) as f64, 1.5].into_iter()),
+            V3ValueType::Float64
+        );
+
+        // Float64 value forces Float64
+        assert_eq!(
+            value_type_for_values([100.0, std::f64::consts::PI].into_iter()),
+            V3ValueType::Float64
+        );
+
+        // Empty iterator
+        assert_eq!(value_type_for_values(std::iter::empty()), V3ValueType::Zero);
+    }
+}
diff --git a/lib/saluki-components/src/encoders/datadog/metrics/v3/writer.rs b/lib/saluki-components/src/encoders/datadog/metrics/v3/writer.rs
new file mode 100644
index 00000000000..71ccd96e8f8
--- /dev/null
+++ b/lib/saluki-components/src/encoders/datadog/metrics/v3/writer.rs
@@ -0,0 +1,820 @@
+//! V3 columnar metrics writer.
+//!
+//! The [`V3Writer`] accumulates metrics in columnar format with dictionary deduplication,
+//! then produces [`V3EncodedData`] ready for protobuf serialization.
+ +use protobuf::CodedOutputStream; +use saluki_error::GenericError; + +use super::constants::*; +use super::interner::Interner; +use super::types::{value_type_for_values, V3MetricType, V3ValueType}; + +const FLAG_NO_INDEX: u64 = 0x100; +const FLAG_HAS_UNIT: u64 = 0x200; + +/// Encoded V3 payload data ready for protobuf serialization. +/// +/// Used primarily as a helper for testing. +#[derive(Debug, Default)] +struct V3EncodedData { + // Dictionary encoded bytes (varint-length-prefixed strings) + pub dict_name_bytes: Vec, + pub dict_tags_bytes: Vec, + pub dict_tagsets: Vec, + pub dict_resource_str_bytes: Vec, + pub dict_resource_len: Vec, + pub dict_resource_type: Vec, + pub dict_resource_name: Vec, + pub dict_source_type_bytes: Vec, + pub dict_origin_info: Vec, + pub dict_unit_bytes: Vec, + + // Per-metric columns (one entry per metric, except conditional columns) + pub types: Vec, + pub names: Vec, + pub tags: Vec, + pub resources: Vec, + pub intervals: Vec, + pub num_points: Vec, + pub source_type_names: Vec, + pub origin_infos: Vec, + pub unit_refs: Vec, // Present only for metrics with FLAG_HAS_UNIT set. + + // Point data (varies per metric based on num_points) + pub timestamps: Vec, + pub vals_sint64: Vec, + pub vals_float32: Vec, + pub vals_float64: Vec, + + // Sketch data + pub sketch_num_bins: Vec, + pub sketch_bin_keys: Vec, + pub sketch_bin_cnts: Vec, +} + +/// V3 columnar metrics writer. +/// +/// Accumulates metrics in columnar format with dictionary deduplication. +/// Call [`V3Writer::write`] for each metric (closing each returned builder), then +/// [`V3Writer::finalize`] to delta-encode the index columns and serialize the payload. 
+#[derive(Debug, Default)] +pub struct V3Writer { + // Interners for dictionary deduplication + name_interner: Interner, + tag_interner: Interner, + tagset_interner: Interner>, + resource_str_interner: Interner, + resource_interner: Interner>, + source_type_interner: Interner, + origin_interner: Interner<(i32, i32, i32)>, + unit_interner: Interner, + + // Dictionary encoded bytes + dict_name_bytes: Vec, + dict_tags_bytes: Vec, + dict_tagsets: Vec, + dict_resource_str_bytes: Vec, + dict_resource_len: Vec, + dict_resource_type: Vec, + dict_resource_name: Vec, + dict_source_type_bytes: Vec, + dict_origin_info: Vec, + dict_unit_bytes: Vec, + + // Per-metric columns (one entry per metric, except conditional columns) + types: Vec, + names: Vec, + tags: Vec, + resources: Vec, + intervals: Vec, + num_points: Vec, + source_type_names: Vec, + origin_infos: Vec, + unit_refs: Vec, // Present only for metrics with FLAG_HAS_UNIT set. + + // Point data + timestamps: Vec, + vals_sint64: Vec, + vals_float32: Vec, + vals_float64: Vec, + + // Sketch data + sketch_num_bins: Vec, + sketch_bin_keys: Vec, + sketch_bin_cnts: Vec, + + // Scratch data + tag_ids: Vec, + resource_ids: Vec<(i64, i64)>, +} + +impl V3Writer { + /// Creates a new V3 writer. + pub fn new() -> Self { + Self::default() + } + + /// Begins writing a new metric. + /// + /// Returns a [`V3MetricBuilder`] that must be used to set the metric's + /// properties and add points, then closed with [`V3MetricBuilder::close`]. 
+ pub fn write(&mut self, metric_type: V3MetricType, name: &str) -> V3MetricBuilder<'_> { + let name_id = self.intern_name(name); + let metric_idx = self.types.len(); + let point_start_idx = self.vals_float64.len(); + let sint64_start_idx = self.vals_sint64.len(); + + // Initialize the per-metric columns with default values + self.types.push(metric_type.as_u64()); + self.names.push(name_id); + self.tags.push(0); + self.resources.push(0); + self.intervals.push(0); + self.num_points.push(0); + self.source_type_names.push(0); + self.origin_infos.push(0); + + V3MetricBuilder { + writer: self, + point_start_idx, + sint64_start_idx, + metric_idx, + unit_ref_idx: None, + } + } + + fn finalize_inner(mut self) -> V3EncodedData { + // Delta encode all of the index arrays first. + delta_encode(&mut self.names); + delta_encode(&mut self.tags); + delta_encode(&mut self.resources); + delta_encode(&mut self.source_type_names); + delta_encode(&mut self.origin_infos); + delta_encode(&mut self.unit_refs); + delta_encode(&mut self.timestamps); + + V3EncodedData { + dict_name_bytes: self.dict_name_bytes, + dict_tags_bytes: self.dict_tags_bytes, + dict_tagsets: self.dict_tagsets, + dict_resource_str_bytes: self.dict_resource_str_bytes, + dict_resource_len: self.dict_resource_len, + dict_resource_type: self.dict_resource_type, + dict_resource_name: self.dict_resource_name, + dict_source_type_bytes: self.dict_source_type_bytes, + dict_origin_info: self.dict_origin_info, + dict_unit_bytes: self.dict_unit_bytes, + types: self.types, + names: self.names, + tags: self.tags, + resources: self.resources, + intervals: self.intervals, + num_points: self.num_points, + source_type_names: self.source_type_names, + origin_infos: self.origin_infos, + unit_refs: self.unit_refs, + timestamps: self.timestamps, + vals_sint64: self.vals_sint64, + vals_float32: self.vals_float32, + vals_float64: self.vals_float64, + sketch_num_bins: self.sketch_num_bins, + sketch_bin_keys: self.sketch_bin_keys, + 
sketch_bin_cnts: self.sketch_bin_cnts, + } + } + + /// Finalizes the writer and serializes the data to the given output buffer. + /// + /// This performs delta encoding on all index arrays. + pub fn finalize(self, output: &mut Vec) -> Result<(), GenericError> { + let data = self.finalize_inner(); + + // Create our writer and start, well.. writing! + let mut os = CodedOutputStream::vec(output); + + // Dictionary fields (bytes - varint-length-prefixed strings concatenated) + if !data.dict_name_bytes.is_empty() { + os.write_bytes(DICT_NAME_STR_FIELD_NUMBER, &data.dict_name_bytes)?; + } + if !data.dict_tags_bytes.is_empty() { + os.write_bytes(DICT_TAGS_STR_FIELD_NUMBER, &data.dict_tags_bytes)?; + } + + // Packed repeated fields for dictionaries + os.write_repeated_packed_sint64(DICT_TAGSETS_FIELD_NUMBER, &data.dict_tagsets)?; + + if !data.dict_resource_str_bytes.is_empty() { + os.write_bytes(DICT_RESOURCE_STR_FIELD_NUMBER, &data.dict_resource_str_bytes)?; + } + + os.write_repeated_packed_int64(DICT_RESOURCE_LEN_FIELD_NUMBER, &data.dict_resource_len)?; + os.write_repeated_packed_sint64(DICT_RESOURCE_TYPE_FIELD_NUMBER, &data.dict_resource_type)?; + os.write_repeated_packed_sint64(DICT_RESOURCE_NAME_FIELD_NUMBER, &data.dict_resource_name)?; + + if !data.dict_source_type_bytes.is_empty() { + os.write_bytes(DICT_SOURCE_TYPE_NAME_FIELD_NUMBER, &data.dict_source_type_bytes)?; + } + + os.write_repeated_packed_int32(DICT_ORIGIN_INFO_FIELD_NUMBER, &data.dict_origin_info)?; + if !data.dict_unit_bytes.is_empty() { + os.write_bytes(DICT_UNIT_STR_FIELD_NUMBER, &data.dict_unit_bytes)?; + } + + // Per-metric columns + os.write_repeated_packed_uint64(TYPES_FIELD_NUMBER, &data.types)?; + os.write_repeated_packed_sint64(NAMES_FIELD_NUMBER, &data.names)?; + os.write_repeated_packed_sint64(TAGS_FIELD_NUMBER, &data.tags)?; + os.write_repeated_packed_sint64(RESOURCES_FIELD_NUMBER, &data.resources)?; + os.write_repeated_packed_uint64(INTERVALS_FIELD_NUMBER, &data.intervals)?; + 
os.write_repeated_packed_uint64(NUM_POINTS_FIELD_NUMBER, &data.num_points)?; + os.write_repeated_packed_sint64(SOURCE_TYPE_NAME_FIELD_NUMBER, &data.source_type_names)?; + os.write_repeated_packed_sint64(ORIGIN_INFO_FIELD_NUMBER, &data.origin_infos)?; + os.write_repeated_packed_sint64(UNIT_REFS_FIELD_NUMBER, &data.unit_refs)?; + + // Point data + os.write_repeated_packed_sint64(TIMESTAMPS_FIELD_NUMBER, &data.timestamps)?; + os.write_repeated_packed_sint64(VALS_SINT64_FIELD_NUMBER, &data.vals_sint64)?; + os.write_repeated_packed_float(VALS_FLOAT32_FIELD_NUMBER, &data.vals_float32)?; + os.write_repeated_packed_double(VALS_FLOAT64_FIELD_NUMBER, &data.vals_float64)?; + + // Sketch data + os.write_repeated_packed_uint64(SKETCH_NUM_BINS_FIELD_NUMBER, &data.sketch_num_bins)?; + os.write_repeated_packed_sint32(SKETCH_BIN_KEYS_FIELD_NUMBER, &data.sketch_bin_keys)?; + os.write_repeated_packed_uint32(SKETCH_BIN_CNTS_FIELD_NUMBER, &data.sketch_bin_cnts)?; + + os.flush()?; + Ok(()) + } + + // Internal helper methods + + fn intern_name(&mut self, name: &str) -> i64 { + if name.is_empty() { + return 0; + } + let (id, is_new) = self.name_interner.get_or_insert(name); + if is_new { + append_len_str(&mut self.dict_name_bytes, name); + } + id + } + + fn intern_tag(&mut self, tag: &str) { + if tag.is_empty() { + self.tag_ids.push(0); + return; + } + + let (id, is_new) = self.tag_interner.get_or_insert(tag); + if is_new { + append_len_str(&mut self.dict_tags_bytes, tag); + } + self.tag_ids.push(id); + } + + fn intern_tagset(&mut self, tags: I) -> i64 + where + I: Iterator, + S: AsRef, + { + self.tag_ids.clear(); + for tag in tags { + self.intern_tag(tag.as_ref()); + } + + if self.tag_ids.is_empty() { + return 0; + } + + let (id, is_new) = self.tagset_interner.get_or_insert(&self.tag_ids); + if is_new { + self.encode_tagset(); + } + id + } + + fn encode_tagset(&mut self) { + // Push the length + self.dict_tagsets.push(self.tag_ids.len() as i64); + + let start = self.dict_tagsets.len(); + 
+ // Add all tag IDs + self.dict_tagsets.extend_from_slice(&self.tag_ids); + + // Sort and delta-encode the tagset portion + self.dict_tagsets[start..].sort_unstable(); + delta_encode(&mut self.dict_tagsets[start..]); + } + + fn intern_resource_str(&mut self, s: &str) -> i64 { + if s.is_empty() { + return 0; + } + let (id, is_new) = self.resource_str_interner.get_or_insert(s); + if is_new { + append_len_str(&mut self.dict_resource_str_bytes, s); + } + id + } + + fn intern_resources(&mut self, resources: &[(&str, &str)]) -> i64 { + self.resource_ids.clear(); + for (resource_type, resource_name) in resources { + let type_id = self.intern_resource_str(resource_type); + let name_id = self.intern_resource_str(resource_name); + self.resource_ids.push((type_id, name_id)); + } + + if self.resource_ids.is_empty() { + return 0; + } + + let (id, is_new) = self.resource_interner.get_or_insert(&self.resource_ids); + if is_new { + self.encode_resources(); + } + id + } + + fn encode_resources(&mut self) { + self.dict_resource_len.push(self.resource_ids.len() as i64); + + let type_start = self.dict_resource_type.len(); + let name_start = self.dict_resource_name.len(); + + for (type_id, name_id) in &self.resource_ids { + self.dict_resource_type.push(*type_id); + self.dict_resource_name.push(*name_id); + } + + delta_encode(&mut self.dict_resource_type[type_start..]); + delta_encode(&mut self.dict_resource_name[name_start..]); + } + + fn intern_source_type(&mut self, s: &str) -> i64 { + if s.is_empty() { + return 0; + } + let (id, is_new) = self.source_type_interner.get_or_insert(s); + if is_new { + append_len_str(&mut self.dict_source_type_bytes, s); + } + id + } + + fn intern_origin(&mut self, product: i32, category: i32, service: i32) -> i64 { + if product == 0 && category == 0 && service == 0 { + return 0; + } + + let (id, is_new) = self.origin_interner.get_or_insert(&(product, category, service)); + if is_new { + self.dict_origin_info.push(product); + 
self.dict_origin_info.push(category); + self.dict_origin_info.push(service); + } + id + } + + fn intern_unit(&mut self, unit: &str) -> i64 { + if unit.is_empty() { + return 0; + } + let (id, is_new) = self.unit_interner.get_or_insert(unit); + if is_new { + append_len_str(&mut self.dict_unit_bytes, unit); + } + id + } +} + +/// Builder for a single metric within a V3 payload. +/// +/// Use the setter methods to configure the metric, add points with [`add_point`](Self::add_point), +/// then call [`close`](Self::close) to finalize. +pub struct V3MetricBuilder<'a> { + writer: &'a mut V3Writer, + point_start_idx: usize, + sint64_start_idx: usize, + metric_idx: usize, + unit_ref_idx: Option, +} + +impl<'a> V3MetricBuilder<'a> { + /// Sets the tags for this metric. + /// + /// Tags should be in "key:value" format. + pub fn set_tags(&mut self, tags: I) + where + I: Iterator, + S: AsRef, + { + let tagset_id = self.writer.intern_tagset(tags); + self.writer.tags[self.metric_idx] = tagset_id; + } + + /// Sets the resources for this metric. + /// + /// Resources are (type, name) pairs, for example, (`host`, `server1`). + pub fn set_resources(&mut self, resources: &[(&str, &str)]) { + let res_id = self.writer.intern_resources(resources); + self.writer.resources[self.metric_idx] = res_id; + } + + /// Sets the interval for this metric (used for rate metrics). + pub fn set_interval(&mut self, interval: u64) { + self.writer.intervals[self.metric_idx] = interval; + } + + /// Sets the source type name for this metric. + pub fn set_source_type(&mut self, source_type: &str) { + if source_type.is_empty() { + self.writer.source_type_names[self.metric_idx] = 0; + return; + } + let id = self.writer.intern_source_type(source_type); + self.writer.source_type_names[self.metric_idx] = id; + } + + /// Sets the origin metadata for this metric. 
+ pub fn set_origin(&mut self, product: u32, category: u32, service: u32, no_index: bool) { + let id = self + .writer + .intern_origin(product as i32, category as i32, service as i32); + self.writer.origin_infos[self.metric_idx] = id; + if no_index { + self.writer.types[self.metric_idx] |= FLAG_NO_INDEX; + } + } + + /// Sets the unit for this metric. + pub fn set_unit(&mut self, unit: &str) { + if unit.is_empty() { + self.writer.types[self.metric_idx] &= !FLAG_HAS_UNIT; + if let Some(unit_ref_idx) = self.unit_ref_idx.take() { + self.writer.unit_refs.remove(unit_ref_idx); + } + return; + } + + let id = self.writer.intern_unit(unit); + if let Some(unit_ref_idx) = self.unit_ref_idx { + self.writer.unit_refs[unit_ref_idx] = id; + } else { + self.unit_ref_idx = Some(self.writer.unit_refs.len()); + self.writer.unit_refs.push(id); + } + self.writer.types[self.metric_idx] |= FLAG_HAS_UNIT; + } + + /// Adds a data point to this metric. + pub fn add_point(&mut self, timestamp: i64, value: f64) { + self.writer.timestamps.push(timestamp); + self.writer.vals_float64.push(value); + self.writer.num_points[self.metric_idx] += 1; + } + + /// Adds sketch data for a distribution metric. + /// + /// For sketches, the summary values (count, sum, min, max) are stored as points, + /// and the bin keys/counts are stored separately. 
+ pub fn add_sketch( + &mut self, timestamp: i64, count: i64, sum: f64, min: f64, max: f64, bin_keys: &[i32], bin_counts: &[u32], + ) { + self.writer.timestamps.push(timestamp); + + // Count goes in sint64, sum/min/max go in float64 + self.writer.vals_sint64.push(count); + self.writer.vals_float64.push(sum); + self.writer.vals_float64.push(min); + self.writer.vals_float64.push(max); + + // Store bin data + self.writer.sketch_num_bins.push(bin_keys.len() as u64); + + let key_start = self.writer.sketch_bin_keys.len(); + self.writer.sketch_bin_keys.extend_from_slice(bin_keys); + self.writer.sketch_bin_cnts.extend_from_slice(bin_counts); + + // Delta-encode this sketch's bin keys + delta_encode_i32(&mut self.writer.sketch_bin_keys[key_start..]); + + self.writer.num_points[self.metric_idx] += 1; + } + + /// Finalizes this metric. + /// + /// This compacts the point values to use the smallest representation + /// that can hold all values without loss. + pub fn close(mut self) { + self.compact_values(); + } + + fn compact_values(&mut self) { + let count = self.writer.num_points[self.metric_idx] as usize; + if count == 0 { + return; + } + + let start = self.point_start_idx; + let end = self.writer.vals_float64.len(); + + // Determine the best value type for all points in this metric. + let val_ty = value_type_for_values(self.writer.vals_float64[start..end].iter().copied()); + + // Update the type field + self.writer.types[self.metric_idx] |= val_ty.as_u64(); + + // Convert values to the appropriate storage + match val_ty { + V3ValueType::Zero => { + // Values are all zero, don't store anything + self.writer.vals_float64.truncate(start); + } + V3ValueType::Sint64 => { + let is_sketch = (self.writer.types[self.metric_idx] & 0x0F) == V3MetricType::Sketch as u64; + if is_sketch { + // For sketches, vals_sint64 already has one count per point (pushed by add_sketch), + // and vals_float64 has 3 values per point (sum, min, max). 
When compacting to Sint64, + // we need to interleave them as: sum, min, max, cnt per point. + let counts: Vec = self.writer.vals_sint64[self.sint64_start_idx..].to_vec(); + self.writer.vals_sint64.truncate(self.sint64_start_idx); + for (i, cnt) in counts.into_iter().enumerate() { + let f_off = start + i * 3; + self.writer.vals_sint64.push(self.writer.vals_float64[f_off] as i64); + self.writer.vals_sint64.push(self.writer.vals_float64[f_off + 1] as i64); + self.writer.vals_sint64.push(self.writer.vals_float64[f_off + 2] as i64); + self.writer.vals_sint64.push(cnt); + } + } else { + for i in start..end { + self.writer.vals_sint64.push(self.writer.vals_float64[i] as i64); + } + } + self.writer.vals_float64.truncate(start); + } + V3ValueType::Float32 => { + for i in start..end { + self.writer.vals_float32.push(self.writer.vals_float64[i] as f32); + } + self.writer.vals_float64.truncate(start); + } + V3ValueType::Float64 => { + // Already stored in vals_float64, keep them + } + } + } +} + +fn append_len_str(dst: &mut Vec, s: &str) { + let mut len = s.len() as u64; + loop { + let mut byte = (len & 0x7F) as u8; + len >>= 7; + if len != 0 { + byte |= 0x80; + } + dst.push(byte); + if len == 0 { + break; + } + } + dst.extend_from_slice(s.as_bytes()); +} + +fn delta_encode(s: &mut [i64]) { + if s.len() < 2 { + return; + } + for i in (1..s.len()).rev() { + s[i] -= s[i - 1]; + } +} + +fn delta_encode_i32(s: &mut [i32]) { + if s.len() < 2 { + return; + } + for i in (1..s.len()).rev() { + s[i] -= s[i - 1]; + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_delta_encode() { + let mut data = vec![100, 110, 130, 145]; + delta_encode(&mut data); + assert_eq!(data, vec![100, 10, 20, 15]); + } + + #[test] + fn test_delta_encode_empty() { + let mut data: Vec = vec![]; + delta_encode(&mut data); + assert!(data.is_empty()); + } + + #[test] + fn test_delta_encode_single() { + let mut data = vec![42]; + delta_encode(&mut data); + assert_eq!(data, vec![42]); + } + + 
#[test] + fn test_append_len_str() { + let mut buf = Vec::new(); + append_len_str(&mut buf, "hello"); + // Length 5 = 0x05, then "hello" + assert_eq!(buf, vec![5, b'h', b'e', b'l', b'l', b'o']); + } + + #[test] + fn test_writer_basic() { + let mut writer = V3Writer::new(); + + { + let mut metric = writer.write(V3MetricType::Gauge, "test.metric"); + metric.set_tags(["env:prod", "service:web"].iter().copied()); + metric.add_point(1000, 42.0); + metric.add_point(1010, 43.5); + metric.close(); + } + + let data = writer.finalize_inner(); + + assert_eq!(data.types.len(), 1); + assert_eq!(data.names.len(), 1); + assert_eq!(data.timestamps.len(), 2); + } + + #[test] + fn test_writer_unit() { + let mut writer = V3Writer::new(); + + { + let mut metric = writer.write(V3MetricType::Gauge, "has.unit"); + metric.set_unit("millisecond"); + metric.add_point(1000, 42.0); + metric.close(); + } + { + let mut metric = writer.write(V3MetricType::Gauge, "no.unit"); + metric.add_point(1000, 43.0); + metric.close(); + } + { + let mut metric = writer.write(V3MetricType::Gauge, "same.unit"); + metric.set_unit("millisecond"); + metric.add_point(1000, 44.0); + metric.close(); + } + + let data = writer.finalize_inner(); + + assert_eq!(data.unit_refs, vec![1, 0]); + assert_eq!(data.dict_unit_bytes, b"\x0bmillisecond"); + assert_eq!(data.types[0] & FLAG_HAS_UNIT, FLAG_HAS_UNIT); + assert_eq!(data.types[1] & FLAG_HAS_UNIT, 0); + assert_eq!(data.types[2] & FLAG_HAS_UNIT, FLAG_HAS_UNIT); + } + + #[test] + fn test_writer_multiple_metrics() { + let mut writer = V3Writer::new(); + + { + let mut m1 = writer.write(V3MetricType::Count, "metric1"); + m1.add_point(1000, 10.0); + m1.close(); + } + + { + let mut m2 = writer.write(V3MetricType::Rate, "metric2"); + m2.set_interval(60); + m2.add_point(2000, 20.0); + m2.close(); + } + + let data = writer.finalize_inner(); + + assert_eq!(data.types.len(), 2); + assert_eq!(data.names.len(), 2); + assert_eq!(data.intervals[0], 0); + // Second metric's interval 
won't be 60 directly since names is delta-encoded, + // but we can verify the structure is correct + } + + #[test] + fn test_value_compaction_zero() { + let mut writer = V3Writer::new(); + + { + let mut metric = writer.write(V3MetricType::Gauge, "zero.metric"); + metric.add_point(1000, 0.0); + metric.add_point(2000, 0.0); + metric.close(); + } + + let data = writer.finalize_inner(); + + // Values should be compacted - zero values don't need storage + assert!(data.vals_float64.is_empty()); + assert!(data.vals_sint64.is_empty()); + assert!(data.vals_float32.is_empty()); + } + + #[test] + fn test_value_compaction_int() { + let mut writer = V3Writer::new(); + + { + let mut metric = writer.write(V3MetricType::Count, "int.metric"); + metric.add_point(1000, 100.0); + metric.add_point(2000, 200.0); + metric.close(); + } + + let data = writer.finalize_inner(); + + // Integer values should be stored in sint64 + assert!(data.vals_float64.is_empty()); + assert_eq!(data.vals_sint64, vec![100, 200]); + assert!(data.vals_float32.is_empty()); + } + + #[test] + fn test_serialize_empty() { + let writer = V3Writer::new(); + let mut output = Vec::new(); + writer.finalize(&mut output).unwrap(); + assert!(output.is_empty()); + } + + #[test] + fn test_value_compaction_large_int_plus_float32() { + // Regression test: a large integer (> 2^24) mixed with a fractional + // float32 value must use Float64, not Float32, to avoid precision loss. 
+ let mut writer = V3Writer::new(); + + { + let mut metric = writer.write(V3MetricType::Gauge, "mixed.metric"); + metric.add_point(1000, (1i64 << 30) as f64); // large int, above the 2^24 range where f32 holds every integer exactly + metric.add_point(2000, 1.5); // fractional, fits in f32 + metric.close(); + } + + let data = writer.finalize_inner(); + + // Must be stored in float64, not float32 + assert!( + data.vals_float32.is_empty(), + "large int should not be stored as float32" + ); + assert_eq!(data.vals_float64, vec![(1i64 << 30) as f64, 1.5]); + assert!(data.vals_sint64.is_empty()); + } + + #[test] + fn test_value_compaction_small_int_plus_float32() { + // Small integers (|v| <= 2^24) mixed with float32 values should + // compact to Float32, since small ints fit losslessly in f32. + let mut writer = V3Writer::new(); + + { + let mut metric = writer.write(V3MetricType::Gauge, "small.mixed"); + metric.add_point(1000, 100.0); + metric.add_point(2000, 1.5); + metric.close(); + } + + let data = writer.finalize_inner(); + + assert!(data.vals_float64.is_empty()); + assert_eq!(data.vals_float32, vec![100.0, 1.5]); + assert!(data.vals_sint64.is_empty()); + } + + #[test] + fn test_serialize_basic_metric() { + let mut writer = V3Writer::new(); + + { + let mut metric = writer.write(V3MetricType::Gauge, "test.metric"); + metric.add_point(1000, 42.0); + metric.close(); + } + + let mut output = Vec::new(); + writer.finalize(&mut output).unwrap(); + + // Should produce non-empty output + assert!(!output.is_empty()); + } +} diff --git a/lib/saluki-components/vendor/core_schema.yaml b/lib/saluki-components/vendor/core_schema.yaml index 6081eabd0ae..7dbb33f4006 100644 --- a/lib/saluki-components/vendor/core_schema.yaml +++ b/lib/saluki-components/vendor/core_schema.yaml @@ -14037,7 +14037,7 @@ properties: properties: compression_level: node_type: setting - type: number + type: integer default: 0 series: node_type: section @@ -14053,6 +14053,19 @@ properties: default: [] items: type: string + shadow_sample_rate: + 
node_type: setting + type: number + default: 0.001 + tags: + - golang_type:float64 + shadow_sites: + node_type: setting + type: array + default: + - datadoghq.com + items: + type: string use_beta: node_type: setting type: boolean diff --git a/lib/saluki-core/Cargo.toml b/lib/saluki-core/Cargo.toml index 3d636b08742..92102cd1f84 100644 --- a/lib/saluki-core/Cargo.toml +++ b/lib/saluki-core/Cargo.toml @@ -9,6 +9,7 @@ repository = { workspace = true } workspace = true [dependencies] +anymap3 = { workspace = true } async-trait = { workspace = true } bitmask-enum = { workspace = true } ddsketch = { workspace = true } diff --git a/lib/saluki-core/src/data_model/payload/metadata.rs b/lib/saluki-core/src/data_model/payload/metadata.rs index 9e8c3458999..fcfced12276 100644 --- a/lib/saluki-core/src/data_model/payload/metadata.rs +++ b/lib/saluki-core/src/data_model/payload/metadata.rs @@ -1,8 +1,16 @@ +use std::any::Any; + +use anymap3::{CloneAny, Map}; + /// Payload metadata. +/// +/// Contains the event count, the data point count, and an extensible map of typed metadata values. +/// Components can store arbitrary typed data using the `set` or `with` methods and retrieve it with `get`. #[derive(Clone)] pub struct PayloadMetadata { event_count: usize, data_point_count: usize, + extensions: Map, } impl PayloadMetadata { @@ -11,6 +19,7 @@ impl PayloadMetadata { PayloadMetadata { event_count, data_point_count: 0, + extensions: Map::new(), } } @@ -19,6 +28,7 @@ impl PayloadMetadata { PayloadMetadata { event_count, data_point_count, + extensions: Map::new(), } } @@ -31,4 +41,20 @@ impl PayloadMetadata { pub fn data_point_count(&self) -> usize { self.data_point_count } + + /// Gets a reference to a typed extension value, if present. + pub fn get(&self) -> Option<&T> { + self.extensions.get::() + } + + /// Sets a typed extension value, returning `self` for chaining. + pub fn with(mut self, value: T) -> Self { + self.extensions.insert(value); + self + } + + /// Sets a typed extension value in place. 
+ pub fn set(&mut self, value: T) { + self.extensions.insert(value); + } } diff --git a/test/correctness/dsd-plain-v3-validation/config.yaml b/test/correctness/dsd-plain-v3-validation/config.yaml new file mode 100644 index 00000000000..c91eb1af1c7 --- /dev/null +++ b/test/correctness/dsd-plain-v3-validation/config.yaml @@ -0,0 +1,22 @@ +analysis_mode: metrics +millstone: + image: saluki-images/millstone:latest + config_path: millstone.yaml +datadog_intake: + image: saluki-images/datadog-intake:latest + config_path: ../datadog-intake.yaml +baseline: + image: saluki-images/datadog-agent:testing-release + files: + - datadog.yaml:/etc/datadog-agent/datadog.yaml + additional_env_vars: + - DD_API_KEY=correctness-test +comparison: + image: saluki-images/datadog-agent:testing-release + files: + - datadog.yaml:/etc/datadog-agent/datadog.yaml + additional_env_vars: + - DD_API_KEY=correctness-test + - DD_DATA_PLANE_ENABLED=true + - DD_DATA_PLANE_DOGSTATSD_ENABLED=true + - DD_AGGREGATE_CONTEXT_LIMIT=500000 diff --git a/test/correctness/dsd-plain-v3-validation/datadog.yaml b/test/correctness/dsd-plain-v3-validation/datadog.yaml new file mode 100644 index 00000000000..98525edcdd0 --- /dev/null +++ b/test/correctness/dsd-plain-v3-validation/datadog.yaml @@ -0,0 +1,36 @@ +# Using a fixed hostname is both required to avoid errors, and also will ensure consistent tags between DSD/ADP. +hostname: "correctness-testing" + +# Dummy API key. +api_key: dummy-api-key-correctness-testing + +# We have to specifically configure the health port to use. +health_port: 5555 + +# Point ourselves at the datadog-intake service. +dd_url: "http://datadog-intake:2049" + +# Turn off UDP and listen on a UDS socket instead. +dogstatsd_port: 0 +dogstatsd_socket: /airlock/metrics.sock + +# Ensure origin detection is disabled since we can't support it with ADP in standalone mode. 
+dogstatsd_origin_detection: false + +# Gauges can be processed out-of-order when multiple workers are used, while ADP does not use multiple workers, so ADP +# always ends up with the correct (last seen) value, while DSD might return the last seen value... or the value seen +# four updates ago, etc etc. +dogstatsd_workers_count: 1 + +# Enable V3 metrics encoding in validation mode: both V2 and V3 payloads are sent simultaneously, +# paired by X-Metrics-Request-ID. V3 payloads are counted in the metrics dump; V2 payloads are +# used only for comparison against V3 to validate encoding correctness. +serializer_experimental_use_v3_api: + series: + endpoints: + - "http://datadog-intake:2049" + validate: true + sketches: + endpoints: + - "http://datadog-intake:2049" + validate: true diff --git a/test/correctness/dsd-plain-v3-validation/millstone.yaml b/test/correctness/dsd-plain-v3-validation/millstone.yaml new file mode 100644 index 00000000000..3e0b309eeb5 --- /dev/null +++ b/test/correctness/dsd-plain-v3-validation/millstone.yaml @@ -0,0 +1,91 @@ +seed: + [ + 2, + 3, + 5, + 7, + 11, + 13, + 17, + 19, + 23, + 29, + 31, + 37, + 41, + 43, + 47, + 53, + 59, + 61, + 67, + 71, + 73, + 79, + 83, + 89, + 97, + 101, + 103, + 107, + 109, + 113, + 127, + 131, + ] +target: "unixgram:///airlock/metrics.sock" +aggregation_bucket_width_secs: 10 +volume: 10000 +corpus: + # TODO: This is a little confusing, because we're specifying the number of metrics to generate (which we _will_ + # honor faithfully) but since we're specifying the contexts count in the payload definition, we might not + # actually generate 10,000 unique contexts, but instead somewhere below 3,000, where each of them is repeated a + # few times to reach the total count. + # + # We need to figure that out, since the intent is that specifying a fixed count should lead to that many metrics + # (and no more) being generated, such that you could depend on that for testing purposes. 
+ size: 10000 + payload: + dogstatsd: + contexts: + constant: 3000 + name_length: + inclusive: + min: 4 + max: 8 + tag_length: + inclusive: + min: 4 + max: 8 + tags_per_msg: + inclusive: + min: 2 + max: 4 + value: + float_probability: 0.5 + range: + inclusive: + min: -9999999 + max: 9999999 + multivalue_count: + inclusive: + min: 2 + max: 32 + multivalue_pack_probability: 0.08 + kind_weights: + metric: 100 + event: 0 + service_check: 0 + # Weights based on analyzing internal Datadog usage data of metric type for metrics sent to the Agent over DogStatsD. + metric_weights: + count: 208 + gauge: 66 + timer: 0 + distribution: 72 + # We specifically _don't_ want to generate sets, because we can't assert their correctness once they've been + # aggregated: a gauge is generated for each aggregator flush that represents the unique number of values in a + # given set, but in general, gauges are meant to be last-write-wins, so unless the metric names/tags can + # indicate that they're for a set, we can't know that it's safe for us to _aggregate_ the gauge values, and with + # our default behavior of taking the latest gauge value... we end up with non-deterministic results. 
+ set: 0 + histogram: 1 diff --git a/test/correctness/dsd-plain-v3/config.yaml b/test/correctness/dsd-plain-v3/config.yaml new file mode 100644 index 00000000000..c91eb1af1c7 --- /dev/null +++ b/test/correctness/dsd-plain-v3/config.yaml @@ -0,0 +1,22 @@ +analysis_mode: metrics +millstone: + image: saluki-images/millstone:latest + config_path: millstone.yaml +datadog_intake: + image: saluki-images/datadog-intake:latest + config_path: ../datadog-intake.yaml +baseline: + image: saluki-images/datadog-agent:testing-release + files: + - datadog.yaml:/etc/datadog-agent/datadog.yaml + additional_env_vars: + - DD_API_KEY=correctness-test +comparison: + image: saluki-images/datadog-agent:testing-release + files: + - datadog.yaml:/etc/datadog-agent/datadog.yaml + additional_env_vars: + - DD_API_KEY=correctness-test + - DD_DATA_PLANE_ENABLED=true + - DD_DATA_PLANE_DOGSTATSD_ENABLED=true + - DD_AGGREGATE_CONTEXT_LIMIT=500000 diff --git a/test/correctness/dsd-plain-v3/datadog.yaml b/test/correctness/dsd-plain-v3/datadog.yaml new file mode 100644 index 00000000000..f2bed674a08 --- /dev/null +++ b/test/correctness/dsd-plain-v3/datadog.yaml @@ -0,0 +1,35 @@ +# Using a fixed hostname is both required to avoid errors, and also will ensure consistent tags between DSD/ADP. +hostname: "correctness-testing" + +# Dummy API key. +api_key: dummy-api-key-correctness-testing + +# We have to specifically configure the health port to use. +health_port: 5555 + +# Point ourselves at the datadog-intake service. +dd_url: "http://datadog-intake:2049" + +# Turn off UDP and listen on a UDS socket instead. +dogstatsd_port: 0 +dogstatsd_socket: /airlock/metrics.sock + +# Ensure origin detection is disabled since we can't support it with ADP in standalone mode. 
+dogstatsd_origin_detection: false + +# Gauges can be processed out-of-order when multiple workers are used, while ADP does not use multiple workers, so ADP +# always ends up with the correct (last seen) value, while DSD might return the last seen value... or the value seen +# four updates ago, etc etc. +dogstatsd_workers_count: 1 + +# Enable V3 metrics encoding for all endpoints. +# +# We leave validation mode off since we want to focus on just the V3 metrics, and we don't yet have a way to separate +# the V3 metrics from the V2 metrics in order to emulate validation done on the backend. +serializer_experimental_use_v3_api: + series: + endpoints: + - "http://datadog-intake:2049" + sketches: + endpoints: + - "http://datadog-intake:2049" diff --git a/test/correctness/dsd-plain-v3/millstone.yaml b/test/correctness/dsd-plain-v3/millstone.yaml new file mode 100644 index 00000000000..3e0b309eeb5 --- /dev/null +++ b/test/correctness/dsd-plain-v3/millstone.yaml @@ -0,0 +1,91 @@ +seed: + [ + 2, + 3, + 5, + 7, + 11, + 13, + 17, + 19, + 23, + 29, + 31, + 37, + 41, + 43, + 47, + 53, + 59, + 61, + 67, + 71, + 73, + 79, + 83, + 89, + 97, + 101, + 103, + 107, + 109, + 113, + 127, + 131, + ] +target: "unixgram:///airlock/metrics.sock" +aggregation_bucket_width_secs: 10 +volume: 10000 +corpus: + # TODO: This is a little confusing, because we're specifying the number of metrics to generate (which we _will_ + # honor faithfully) but since we're specifying the contexts count in the payload definition, we might not + # actually generate 10,000 unique contexts, but instead somewhere below 3,000, where each of them is repeated a + # few times to reach the total count. + # + # We need to figure that out, since the intent is that specifying a fixed count should lead to that many metrics + # (and no more) being generated, such that you could depend on that for testing purposes. 
+ size: 10000 + payload: + dogstatsd: + contexts: + constant: 3000 + name_length: + inclusive: + min: 4 + max: 8 + tag_length: + inclusive: + min: 4 + max: 8 + tags_per_msg: + inclusive: + min: 2 + max: 4 + value: + float_probability: 0.5 + range: + inclusive: + min: -9999999 + max: 9999999 + multivalue_count: + inclusive: + min: 2 + max: 32 + multivalue_pack_probability: 0.08 + kind_weights: + metric: 100 + event: 0 + service_check: 0 + # Weights based on analyzing internal Datadog usage data of metric type for metrics sent to the Agent over DogStatsD. + metric_weights: + count: 208 + gauge: 66 + timer: 0 + distribution: 72 + # We specifically _don't_ want to generate sets, because we can't assert their correctness once they've been + # aggregated: a gauge is generated for each aggregator flush that represents the unique number of values in a + # given set, but in general, gauges are meant to be last-write-wins, so unless the metric names/tags can + # indicate that they're for a set, we can't know that it's safe for us to _aggregate_ the gauge values, and with + # our default behavior of taking the latest gauge value... we end up with non-deterministic results. + set: 0 + histogram: 1 From 53976e198784b31d3fe17a0089ebd6bdb5ce5266 Mon Sep 17 00:00:00 2001 From: Raymond Zhao <35050708+rayz@users.noreply.github.com> Date: Tue, 2 Jun 2026 15:39:04 -0400 Subject: [PATCH 2/2] feat(metrics): add v3 shadow sampling (#1792) ## Summary Adds V3 series shadow sampling support to ADP, matching the Core Agent config/defaults for: - `serializer_experimental_use_v3_api.series.shadow_sample_rate` - `serializer_experimental_use_v3_api.series.shadow_sites` - `serializer_experimental_use_v3_api.series.beta_route` When series V3 is not authoritative, ADP can now sample V2 series flushes and send a correlated V3 beta shadow payload with the same metrics validation batch headers. Shadowing is limited to V2 series baselines, matching the Core Agent behavior. 
Also fixes the V3 correctness harness so the new `dsd-plain-v3` cases decode and compare V3 payloads correctly. Fake intake now handles V3 metric routes, and `stele` normalizes V3 columnar payloads, including host resources and Agent-compatible sketch summary ordering. Todo / Follow Up: Shadow sampling is currently encoder-scoped in ADP. A follow-up should make it endpoint/resolver-scoped like the Core Agent for mixed endpoint and multi-site configurations. ## Change Type - [ ] Bug fix - [x] New feature - [ ] Non-functional (chore, refactoring, docs) - [ ] Performance ## How did you test this PR? Unit Tests / CI ## References --- Cargo.lock | 1 + .../src/app/metrics/handlers.rs | 25 +- .../datadog-intake/src/app/metrics/mod.rs | 3 + .../datadog-intake/src/app/metrics/state.rs | 12 +- bin/correctness/stele/Cargo.toml | 1 + bin/correctness/stele/src/metrics.rs | 529 +++++++++++++++++- .../src/common/datadog/config.rs | 2 + .../src/common/datadog/endpoints.rs | 128 ++++- .../src/common/datadog/io.rs | 2 + .../src/common/datadog/protocol.rs | 57 ++ .../src/config_registry/datadog/encoders.rs | 30 + .../config_registry/datadog/unsupported.rs | 28 - .../src/encoders/datadog/metrics/mod.rs | 399 +++++++++++-- .../src/encoders/datadog/metrics/shadow.rs | 39 ++ .../src/forwarders/datadog/mod.rs | 15 +- .../cases/dsd-plain-v3-validation/config.yaml | 19 + .../dsd-plain-v3-validation/datadog.yaml | 36 ++ .../dsd-plain-v3-validation/millstone.yaml | 57 ++ .../cases/dsd-plain-v3/config.yaml | 19 + .../cases/dsd-plain-v3/datadog.yaml | 34 ++ .../cases/dsd-plain-v3/millstone.yaml | 57 ++ 21 files changed, 1400 insertions(+), 93 deletions(-) create mode 100644 lib/saluki-components/src/encoders/datadog/metrics/shadow.rs create mode 100644 test/correctness/cases/dsd-plain-v3-validation/config.yaml create mode 100644 test/correctness/cases/dsd-plain-v3-validation/datadog.yaml create mode 100644 test/correctness/cases/dsd-plain-v3-validation/millstone.yaml create mode 100644 
test/correctness/cases/dsd-plain-v3/config.yaml create mode 100644 test/correctness/cases/dsd-plain-v3/datadog.yaml create mode 100644 test/correctness/cases/dsd-plain-v3/millstone.yaml diff --git a/Cargo.lock b/Cargo.lock index 5c101faf3ec..4c74e2d90fa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4806,6 +4806,7 @@ dependencies = [ "serde", "serde_json", "serde_with", + "simdutf8", "stringtheory", ] diff --git a/bin/correctness/datadog-intake/src/app/metrics/handlers.rs b/bin/correctness/datadog-intake/src/app/metrics/handlers.rs index eff1578ecc5..814d322aab8 100644 --- a/bin/correctness/datadog-intake/src/app/metrics/handlers.rs +++ b/bin/correctness/datadog-intake/src/app/metrics/handlers.rs @@ -1,5 +1,5 @@ use axum::{body::Bytes, extract::State, http::StatusCode, Json}; -use datadog_protos::metrics::{MetricPayload, SketchPayload}; +use datadog_protos::metrics::{v3::Payload as V3Payload, MetricPayload, SketchPayload}; use protobuf::Message as _; use stele::Metric; use tracing::{error, info}; @@ -89,3 +89,26 @@ pub async fn handle_sketch_beta(State(state): State, body: Bytes) } } } + +pub async fn handle_metrics_v3(State(state): State, body: Bytes) -> StatusCode { + info!("Received metrics v3 payload."); + + let payload = match V3Payload::parse_from_bytes(&body[..]) { + Ok(payload) => payload, + Err(e) => { + error!(error = %e, "Failed to parse metrics v3 payload."); + return StatusCode::BAD_REQUEST; + } + }; + + match state.merge_v3_payload(payload) { + Ok(()) => { + info!("Processed metrics v3 payload."); + StatusCode::ACCEPTED + } + Err(e) => { + error!(error = %e, "Failed to merge metrics v3 payload."); + StatusCode::BAD_REQUEST + } + } +} diff --git a/bin/correctness/datadog-intake/src/app/metrics/mod.rs b/bin/correctness/datadog-intake/src/app/metrics/mod.rs index d2414990ee6..49e623fb9de 100644 --- a/bin/correctness/datadog-intake/src/app/metrics/mod.rs +++ b/bin/correctness/datadog-intake/src/app/metrics/mod.rs @@ -14,6 +14,9 @@ pub fn 
build_metrics_router() -> Router { .route("/metrics/dump", get(handle_metrics_dump)) .route("/api/v1/series", post(handle_series_v1)) .route("/api/v2/series", post(handle_series_v2)) + .route("/api/intake/metrics/v3/series", post(handle_metrics_v3)) + .route("/api/intake/metrics/v3beta/series", post(handle_metrics_v3)) .route("/api/beta/sketches", post(handle_sketch_beta)) + .route("/api/intake/metrics/v3/sketches", post(handle_metrics_v3)) .with_state(MetricsState::new()) } diff --git a/bin/correctness/datadog-intake/src/app/metrics/state.rs b/bin/correctness/datadog-intake/src/app/metrics/state.rs index ef40aad3857..e21d0b67493 100644 --- a/bin/correctness/datadog-intake/src/app/metrics/state.rs +++ b/bin/correctness/datadog-intake/src/app/metrics/state.rs @@ -1,6 +1,6 @@ use std::sync::{Arc, Mutex}; -use datadog_protos::metrics::{MetricPayload, SketchPayload}; +use datadog_protos::metrics::{v3::Payload as V3Payload, MetricPayload, SketchPayload}; use saluki_error::GenericError; use stele::Metric; @@ -43,6 +43,16 @@ impl MetricsState { Ok(()) } + /// Merges the given metrics v3 payload into the current metrics state. + pub fn merge_v3_payload(&self, payload: V3Payload) -> Result<(), GenericError> { + let metrics = Metric::try_from_v3(payload)?; + + let mut data = self.metrics.lock().unwrap(); + data.extend(metrics); + + Ok(()) + } + /// Merges the given series v1 payload into the current metrics state. 
pub fn merge_series_v1_payload(&self, bytes: &[u8]) -> Result<(), GenericError> { let metrics = Metric::try_from_series_v1(bytes)?; diff --git a/bin/correctness/stele/Cargo.toml b/bin/correctness/stele/Cargo.toml index 8b3dfff3eca..07d52a89efe 100644 --- a/bin/correctness/stele/Cargo.toml +++ b/bin/correctness/stele/Cargo.toml @@ -20,4 +20,5 @@ saluki-error = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } serde_with = { workspace = true } +simdutf8 = { workspace = true } stringtheory = { workspace = true } diff --git a/bin/correctness/stele/src/metrics.rs b/bin/correctness/stele/src/metrics.rs index b4b6847281e..89fd106d873 100644 --- a/bin/correctness/stele/src/metrics.rs +++ b/bin/correctness/stele/src/metrics.rs @@ -1,6 +1,6 @@ use std::fmt; -use datadog_protos::metrics::{MetricPayload, MetricType, SketchPayload}; +use datadog_protos::metrics::{v3::Payload as V3Payload, Dogsketch, MetricPayload, MetricType, SketchPayload}; use ddsketch::DDSketch; use float_cmp::ApproxEqRatio as _; use saluki_error::{generic_error, GenericError}; @@ -350,6 +350,459 @@ impl Metric { } } +// V3 metric type constants (from intake_v3.proto metricType enum). +const V3_METRIC_TYPE_COUNT: u64 = 1; +const V3_METRIC_TYPE_RATE: u64 = 2; +const V3_METRIC_TYPE_GAUGE: u64 = 3; +const V3_METRIC_TYPE_SKETCH: u64 = 4; + +// V3 value type constants (from intake_v3.proto valueType enum). +const V3_VALUE_TYPE_ZERO: u64 = 0x00; +const V3_VALUE_TYPE_SINT64: u64 = 0x10; +const V3_VALUE_TYPE_FLOAT32: u64 = 0x20; +const V3_VALUE_TYPE_FLOAT64: u64 = 0x30; + +/// Tracks cursors into the various value arrays of a v3 payload during decoding. 
+struct V3ValueCursors { + timestamp: usize, + sint64: usize, + float32: usize, + float64: usize, + sketch_point: usize, + sketch_bin_key: usize, + sketch_bin_cnt: usize, +} + +impl V3ValueCursors { + fn new() -> Self { + Self { + timestamp: 0, + sint64: 0, + float32: 0, + float64: 0, + sketch_point: 0, + sketch_bin_key: 0, + sketch_bin_cnt: 0, + } + } +} + +impl Metric { + /// Attempts to parse metrics from a v3 payload. + /// + /// The v3 format uses columnar encoding with dictionary deduplication and delta encoding. + /// + /// # Errors + /// + /// If the payload contains invalid data, an error will be returned. + pub fn try_from_v3(mut payload: V3Payload) -> Result, GenericError> { + let data = payload + .metricData + .take() + .ok_or_else(|| generic_error!("V3 payload missing metricData"))?; + + let num_metrics = data.types.len(); + if num_metrics == 0 { + return Ok(Vec::new()); + } + + // Parse dictionaries. + let names_dict = parse_dict_strings(&data.dictNameStr)?; + let tags_dict = parse_dict_strings(&data.dictTagStr)?; + let tagsets_dict = parse_tagsets(&data.dictTagsets, &tags_dict)?; + let resources_dict = parse_resources( + &data.dictResourceLen, + &data.dictResourceType, + &data.dictResourceName, + &parse_dict_strings(&data.dictResourceStr)?, + )?; + + // Delta-decode index arrays. + let mut name_refs = data.nameRefs; + let mut tagset_refs = data.tagsetRefs; + let mut resources_refs = data.resourcesRefs; + let mut timestamps = data.timestamps; + delta_decode(&mut name_refs); + delta_decode(&mut tagset_refs); + delta_decode(&mut resources_refs); + delta_decode(&mut timestamps); + + // Sketch bin keys are delta-encoded per sketch, so they are not decoded here; each sketch's + // slice is decoded in place while iterating over the sketch points. 
+ let mut sketch_bin_keys = data.sketchBinKeys; + + let mut cursors = V3ValueCursors::new(); + let mut metrics = Vec::with_capacity(num_metrics); + + for i in 0..num_metrics { + let type_field = data + .types + .get(i) + .copied() + .ok_or_else(|| generic_error!("Ran out of metric types"))?; + let metric_type = type_field & 0x0F; + let value_type = type_field & 0xF0; + let num_points = data + .numPoints + .get(i) + .copied() + .ok_or_else(|| generic_error!("Ran out of numPoints")) + .and_then(|num_points| u64_to_usize(num_points, "numPoints"))?; + + // Resolve name (1-based index). + let name_ref = name_refs + .get(i) + .copied() + .ok_or_else(|| generic_error!("Ran out of nameRefs")) + .and_then(|name_ref| i64_to_usize(name_ref, "name ref"))?; + let name = if name_ref == 0 { + String::new() + } else { + names_dict + .get(name_ref - 1) + .ok_or_else(|| generic_error!("Invalid name ref {} (dict size {})", name_ref, names_dict.len()))? + .clone() + }; + + // Resolve tags (1-based index). + let tagset_ref = tagset_refs + .get(i) + .copied() + .ok_or_else(|| generic_error!("Ran out of tagsetRefs")) + .and_then(|tagset_ref| i64_to_usize(tagset_ref, "tagset ref"))?; + let mut tags = if tagset_ref == 0 { + Vec::new() + } else { + tagsets_dict + .get(tagset_ref - 1) + .ok_or_else(|| { + generic_error!("Invalid tagset ref {} (dict size {})", tagset_ref, tagsets_dict.len()) + })? + .clone() + }; + + let resource_ref = resources_refs + .get(i) + .copied() + .map(|resource_ref| i64_to_usize(resource_ref, "resource ref")) + .transpose()? 
+ .unwrap_or(0); + if resource_ref != 0 { + let resources = resources_dict.get(resource_ref - 1).ok_or_else(|| { + generic_error!( + "Invalid resource ref {} (dict size {})", + resource_ref, + resources_dict.len() + ) + })?; + if let Some((_, host_name)) = resources + .iter() + .find(|(resource_type, resource_name)| resource_type == "host" && !resource_name.is_empty()) + { + tags.push(format!("host:{}", host_name)); + } + } + + let mut values = Vec::with_capacity(num_points); + + if metric_type == V3_METRIC_TYPE_SKETCH { + for _ in 0..num_points { + // Read timestamp. + let ts = *timestamps + .get(cursors.timestamp) + .ok_or_else(|| generic_error!("Ran out of timestamps"))?; + let timestamp = u64::try_from(ts).map_err(|_| generic_error!("Invalid timestamp: {}", ts))?; + cursors.timestamp += 1; + + // The Agent writes sketch summaries as sum, min, max, then count. Count is always in valsSint64, + // but integer summaries can share that column, so count must be read after the summary values. + let sum = read_value( + value_type, + &mut cursors, + &data.valsSint64, + &data.valsFloat32, + &data.valsFloat64, + )?; + let min = read_value( + value_type, + &mut cursors, + &data.valsSint64, + &data.valsFloat32, + &data.valsFloat64, + )?; + let max = read_value( + value_type, + &mut cursors, + &data.valsSint64, + &data.valsFloat32, + &data.valsFloat64, + )?; + let cnt = *data + .valsSint64 + .get(cursors.sint64) + .ok_or_else(|| generic_error!("Ran out of sint64 values for sketch count"))?; + cursors.sint64 += 1; + let avg = if cnt != 0 { sum / cnt as f64 } else { 0.0 }; + + // Read bin data. + let num_bins = *data + .sketchNumBins + .get(cursors.sketch_point) + .ok_or_else(|| generic_error!("Ran out of sketchNumBins"))? 
+ as usize; + cursors.sketch_point += 1; + + let bin_key_start = cursors.sketch_bin_key; + let bin_key_end = bin_key_start + num_bins; + if bin_key_end > sketch_bin_keys.len() { + return Err(generic_error!("Ran out of sketch bin keys")); + } + + // Delta-decode this sketch's bin keys. + delta_decode_i32(&mut sketch_bin_keys[bin_key_start..bin_key_end]); + + let k: Vec = sketch_bin_keys[bin_key_start..bin_key_end].to_vec(); + cursors.sketch_bin_key = bin_key_end; + + let bin_cnt_start = cursors.sketch_bin_cnt; + let bin_cnt_end = bin_cnt_start + num_bins; + if bin_cnt_end > data.sketchBinCnts.len() { + return Err(generic_error!("Ran out of sketch bin counts")); + } + let n: Vec = data.sketchBinCnts[bin_cnt_start..bin_cnt_end].to_vec(); + cursors.sketch_bin_cnt = bin_cnt_end; + + // Build a Dogsketch proto and use the existing TryFrom conversion. + let mut dogsketch = Dogsketch::new(); + dogsketch.ts = ts; + dogsketch.cnt = cnt; + dogsketch.min = min; + dogsketch.max = max; + dogsketch.avg = avg; + dogsketch.sum = sum; + dogsketch.set_k(k); + dogsketch.set_n(n); + + let sketch = DDSketch::try_from(dogsketch) + .map_err(|e| generic_error!("Failed to convert v3 sketch to DDSketch: {}", e))?; + values.push((timestamp, MetricValue::Sketch { sketch })); + } + } else { + for _ in 0..num_points { + // Read timestamp. + let ts = *timestamps + .get(cursors.timestamp) + .ok_or_else(|| generic_error!("Ran out of timestamps"))?; + let timestamp = u64::try_from(ts).map_err(|_| generic_error!("Invalid timestamp: {}", ts))?; + cursors.timestamp += 1; + + // Read point value. 
+ let value = read_value( + value_type, + &mut cursors, + &data.valsSint64, + &data.valsFloat32, + &data.valsFloat64, + )?; + + let metric_value = match metric_type { + V3_METRIC_TYPE_COUNT => MetricValue::Count { value }, + V3_METRIC_TYPE_RATE => MetricValue::Rate { + interval: data + .intervals + .get(i) + .copied() + .ok_or_else(|| generic_error!("Ran out of intervals"))?, + value, + }, + V3_METRIC_TYPE_GAUGE => MetricValue::Gauge { value }, + other => return Err(generic_error!("Unknown v3 metric type: {}", other)), + }; + + values.push((timestamp, metric_value)); + } + } + + metrics.push(Metric { + context: MetricContext { name, tags }, + values, + }); + } + + Ok(metrics) + } +} + +/// Delta-decode in place: convert deltas to absolute values (prefix sum). +fn delta_decode(s: &mut [i64]) { + for i in 1..s.len() { + s[i] += s[i - 1]; + } +} + +/// Delta-decode i32 values in place. +fn delta_decode_i32(s: &mut [i32]) { + for i in 1..s.len() { + s[i] += s[i - 1]; + } +} + +/// Read a varint from a byte slice, returning `(value, bytes_consumed)`. +fn read_varint(data: &[u8]) -> Result<(u64, usize), GenericError> { + let mut value: u64 = 0; + let mut shift = 0; + for (i, &byte) in data.iter().enumerate() { + value |= ((byte & 0x7F) as u64) << shift; + if byte & 0x80 == 0 { + return Ok((value, i + 1)); + } + shift += 7; + if shift >= 64 { + return Err(generic_error!("Varint too large")); + } + } + Err(generic_error!("Unexpected end of data reading varint")) +} + +/// Parse varint-length-prefixed strings from a byte buffer. +/// +/// NOTE(review): the length prefix comes straight from the payload; the unchecked `offset + len` bounds test can +/// overflow for a prefix near `u64::MAX` and panic instead of returning an error. Consider `offset.checked_add(len)`. 
+fn parse_dict_strings(data: &[u8]) -> Result, GenericError> { + let mut strings = Vec::new(); + let mut offset = 0; + while offset < data.len() { + let (len, varint_size) = read_varint(&data[offset..])?; + offset += varint_size; + let len = len as usize; + if offset + len > data.len() { + return Err(generic_error!("Dictionary string extends past end of buffer")); + } + let s = simdutf8::basic::from_utf8(&data[offset..offset + len]) + .map_err(|e| generic_error!("Invalid UTF-8 in dictionary string: {}", e))?; + strings.push(s.to_string()); + offset += len; + } + Ok(strings) +} + +/// Parse tagsets from the `dictTagsets` array using the tag dictionary. +/// +/// Each tagset in `dict_tagsets` is encoded as: length, then that many delta-encoded tag indices. +fn parse_tagsets(dict_tagsets: &[i64], tags_dict: &[String]) -> Result>, GenericError> { + let mut tagsets = Vec::new(); + let mut offset = 0; + while offset < dict_tagsets.len() { + let count = i64_to_usize(dict_tagsets[offset], "tagset length")?; + offset += 1; + if offset + count > dict_tagsets.len() { + return Err(generic_error!("Tagset extends past end of dictTagsets array")); + } + + // Delta-decode the tag indices within this tagset. + let mut tag_indices: Vec = dict_tagsets[offset..offset + count].to_vec(); + delta_decode(&mut tag_indices); + + // Resolve tag indices (1-based) to tag strings. 
+ let mut tags = Vec::with_capacity(count); + for &idx in &tag_indices { + let idx = i64_to_usize(idx, "tag index")?; + if idx == 0 { + continue; + } + let tag = tags_dict + .get(idx - 1) + .ok_or_else(|| generic_error!("Invalid tag index {} (dict size {})", idx, tags_dict.len()))?; + tags.push(tag.clone()); + } + tagsets.push(tags); + offset += count; + } + Ok(tagsets) +} + +fn u64_to_usize(value: u64, field: &str) -> Result { + usize::try_from(value).map_err(|_| generic_error!("Invalid {}: {}", field, value)) +} + +fn i64_to_usize(value: i64, field: &str) -> Result { + usize::try_from(value).map_err(|_| generic_error!("Invalid negative {}: {}", field, value)) +} + +/// Parse resource sets from V3 resource dictionaries. +/// +/// Each resource set is encoded as one length entry plus that many locally delta-encoded type/name dictionary indexes. +fn parse_resources( + dict_resource_len: &[i64], dict_resource_type: &[i64], dict_resource_name: &[i64], resource_strings: &[String], +) -> Result>, GenericError> { + let mut resources = Vec::with_capacity(dict_resource_len.len()); + let mut offset = 0; + + for &count in dict_resource_len { + let count = usize::try_from(count).map_err(|_| generic_error!("Invalid negative resource count: {}", count))?; + if offset + count > dict_resource_type.len() || offset + count > dict_resource_name.len() { + return Err(generic_error!("Resource set extends past resource dictionary arrays")); + } + + let mut type_indices = dict_resource_type[offset..offset + count].to_vec(); + let mut name_indices = dict_resource_name[offset..offset + count].to_vec(); + delta_decode(&mut type_indices); + delta_decode(&mut name_indices); + + let mut resource_set = Vec::with_capacity(count); + for (&type_idx, &name_idx) in type_indices.iter().zip(name_indices.iter()) { + let resource_type = resource_strings + .get(resource_index(type_idx)?) + .ok_or_else(|| generic_error!("Invalid resource type index {}", type_idx))? 
+ .clone(); + let resource_name = resource_strings + .get(resource_index(name_idx)?) + .ok_or_else(|| generic_error!("Invalid resource name index {}", name_idx))? + .clone(); + resource_set.push((resource_type, resource_name)); + } + + resources.push(resource_set); + offset += count; + } + + Ok(resources) +} + +fn resource_index(idx: i64) -> Result { + let idx = usize::try_from(idx).map_err(|_| generic_error!("Invalid negative resource index: {}", idx))?; + idx.checked_sub(1) + .ok_or_else(|| generic_error!("Invalid zero resource index")) +} + +/// Read the next f64 value from the appropriate value array based on `value_type`. +fn read_value( + value_type: u64, cursors: &mut V3ValueCursors, vals_sint64: &[i64], vals_float32: &[f32], vals_float64: &[f64], +) -> Result { + match value_type { + V3_VALUE_TYPE_ZERO => Ok(0.0), + V3_VALUE_TYPE_SINT64 => { + let v = *vals_sint64 + .get(cursors.sint64) + .ok_or_else(|| generic_error!("Ran out of sint64 values"))?; + cursors.sint64 += 1; + Ok(v as f64) + } + V3_VALUE_TYPE_FLOAT32 => { + let v = *vals_float32 + .get(cursors.float32) + .ok_or_else(|| generic_error!("Ran out of float32 values"))?; + cursors.float32 += 1; + Ok(v as f64) + } + V3_VALUE_TYPE_FLOAT64 => { + let v = *vals_float64 + .get(cursors.float64) + .ok_or_else(|| generic_error!("Ran out of float64 values"))?; + cursors.float64 += 1; + Ok(v) + } + _ => Err(generic_error!("Unknown v3 value type: {:#x}", value_type)), + } +} + fn approx_eq_ratio_optional(a: Option, b: Option, ratio: f64) -> bool { match (a, b) { (Some(a), Some(b)) => a.approx_eq_ratio(&b, ratio), @@ -484,4 +937,78 @@ mod tests { assert!(metrics[0].context.tags.contains(&"host:server-1".to_string())); assert!(metrics[0].context.tags.contains(&"env:prod".to_string())); } + + #[test] + fn try_from_v3_folds_host_resource_into_tags() { + use datadog_protos::metrics::v3::{MetricData, Payload}; + + let mut data = MetricData::new(); + data.dictNameStr = length_prefixed_strings(["my.metric"]); + 
data.dictTagStr = length_prefixed_strings(["env:prod"]); + data.dictTagsets = vec![1, 1]; + data.dictResourceStr = length_prefixed_strings(["host", "server-1", "device", "eth0"]); + data.dictResourceLen = vec![2]; + data.dictResourceType = vec![1, 2]; + data.dictResourceName = vec![2, 2]; + data.types = vec![V3_METRIC_TYPE_COUNT | V3_VALUE_TYPE_ZERO]; + data.nameRefs = vec![1]; + data.tagsetRefs = vec![1]; + data.resourcesRefs = vec![1]; + data.intervals = vec![0]; + data.numPoints = vec![1]; + data.timestamps = vec![1]; + + let mut payload = Payload::new(); + payload.metricData = Some(data).into(); + + let metrics = Metric::try_from_v3(payload).expect("parse should succeed"); + assert_eq!(metrics.len(), 1); + assert!(metrics[0].context.tags.contains(&"env:prod".to_string())); + assert!(metrics[0].context.tags.contains(&"host:server-1".to_string())); + assert!(!metrics[0].context.tags.iter().any(|tag| tag.starts_with("device:"))); + } + + #[test] + fn try_from_v3_decodes_integer_sketch_summary_order() { + use datadog_protos::metrics::v3::{MetricData, Payload}; + + let mut data = MetricData::new(); + data.dictNameStr = length_prefixed_strings(["my.sketch"]); + data.types = vec![V3_METRIC_TYPE_SKETCH | V3_VALUE_TYPE_SINT64]; + data.nameRefs = vec![1]; + data.tagsetRefs = vec![0]; + data.resourcesRefs = vec![0]; + data.intervals = vec![0]; + data.numPoints = vec![1]; + data.timestamps = vec![123]; + // Agent V3 sketch ordering is sum, min, max, count when integer summaries share valsSint64. 
+ data.valsSint64 = vec![10, 1, 4, 4]; + data.sketchNumBins = vec![1]; + data.sketchBinKeys = vec![0]; + data.sketchBinCnts = vec![4]; + + let mut payload = Payload::new(); + payload.metricData = Some(data).into(); + + let metrics = Metric::try_from_v3(payload).expect("parse should succeed"); + assert_eq!(metrics.len(), 1); + + let MetricValue::Sketch { sketch } = &metrics[0].values[0].1 else { + panic!("expected sketch value"); + }; + assert_eq!(sketch.count(), 4); + assert_eq!(sketch.sum(), Some(10.0)); + assert_eq!(sketch.min(), Some(1.0)); + assert_eq!(sketch.max(), Some(4.0)); + assert_eq!(sketch.avg(), Some(2.5)); + } + + fn length_prefixed_strings(strings: impl IntoIterator) -> Vec { + let mut bytes = Vec::new(); + for s in strings { + bytes.push(s.len() as u8); + bytes.extend_from_slice(s.as_bytes()); + } + bytes + } } diff --git a/lib/saluki-components/src/common/datadog/config.rs b/lib/saluki-components/src/common/datadog/config.rs index 5e0c6cadf5c..cb94644f6af 100644 --- a/lib/saluki-components/src/common/datadog/config.rs +++ b/lib/saluki-components/src/common/datadog/config.rs @@ -890,6 +890,8 @@ mod config_smoke { structs::FORWARDER_CONFIGURATION, &[ "serializer_experimental_use_v3_api.sketches.beta_route", + "serializer_experimental_use_v3_api.sketches.shadow_sample_rate", + "serializer_experimental_use_v3_api.sketches.shadow_sites", "serializer_experimental_use_v3_api.sketches.use_beta", ], json!({ "api_key": "smoke-test-api-key" }), diff --git a/lib/saluki-components/src/common/datadog/endpoints.rs b/lib/saluki-components/src/common/datadog/endpoints.rs index a94d97e42d7..8df17bd6ad5 100644 --- a/lib/saluki-components/src/common/datadog/endpoints.rs +++ b/lib/saluki-components/src/common/datadog/endpoints.rs @@ -20,6 +20,9 @@ use super::protocol::{MetricsPayloadInfo, MetricsProtocolVersion}; static DD_URL_REGEX: LazyLock = LazyLock::new(|| Regex::new(r"^app(\.mrf)?(\.[a-z]{2}\d)?\.(datad(oghq|0g)\.(com|eu)|ddog-gov\.com)$").unwrap()); +static 
DD_SITE_FROM_HOSTNAME_REGEX: LazyLock = LazyLock::new(|| { + Regex::new(r"(?:^|\.)([a-z]{2,}\d{1,2}\.)?(datad(?:oghq|0g)\.(?:com|eu)|ddog-gov\.com)\.?$").unwrap() +}); pub const DEFAULT_SITE: &str = "datadoghq.com"; @@ -45,6 +48,9 @@ pub struct EndpointV3Settings { /// Whether validation mode is enabled for sketches (send both V2 and V3). pub sketches_validation_mode: bool, + + /// Whether this endpoint accepts sampled V3 beta series shadow payloads. + pub series_shadow_mode: bool, } impl EndpointV3Settings { @@ -53,17 +59,22 @@ impl EndpointV3Settings { /// The `v3_series_endpoints` and `v3_sketches_endpoints` are lists of configured endpoint names. /// If the endpoint name matches any entry, V3 is enabled for that metric type. pub fn from_endpoint_url( - configured_endpoint: &str, v3_series_endpoints: &[String], v3_sketches_endpoints: &[String], - series_validate: bool, sketches_validate: bool, + configured_endpoint: &str, resolved_endpoint: &Url, v3_series_endpoints: &[String], + v3_sketches_endpoints: &[String], series_validate: bool, sketches_validate: bool, + series_shadow_sites: &[String], ) -> Self { let use_v3_series = v3_series_endpoints.iter().any(|e| configured_endpoint == e); let use_v3_sketches = v3_sketches_endpoints.iter().any(|e| configured_endpoint == e); + let series_shadow_mode = !use_v3_series + && extract_site_from_url(resolved_endpoint.as_str()) + .is_some_and(|site| series_shadow_sites.iter().any(|shadow_site| shadow_site == &site)); Self { use_v3_series, use_v3_sketches, series_validation_mode: use_v3_series && series_validate, sketches_validation_mode: use_v3_sketches && sketches_validate, + series_shadow_mode, } } @@ -100,8 +111,11 @@ impl EndpointV3Settings { if is_sketch { // V3 sketches: accept if V3 sketches is enabled self.use_v3_sketches + } else if info.is_shadow() { + // V3 shadow series: accept only when this V2-authoritative endpoint is shadow-enabled. 
+ self.series_shadow_mode } else { - // V3 series: accept if V3 series is enabled + // V3 series: accept if V3 series is enabled. self.use_v3_series } } @@ -117,7 +131,9 @@ impl EndpointV3Settings { return false; }; - if info.is_sketch() { + if info.is_shadow() { + self.series_shadow_mode + } else if info.is_sketch() { self.sketches_validation_mode } else { self.series_validation_mode @@ -125,6 +141,15 @@ impl EndpointV3Settings { } } +pub(crate) fn extract_site_from_url(raw_url: &str) -> Option { + let url = Url::parse(raw_url).ok()?; + let hostname = url.host_str()?.trim_end_matches('.').to_ascii_lowercase(); + let captures = DD_SITE_FROM_HOSTNAME_REGEX.captures(&hostname)?; + let datacenter = captures.get(1).map_or("", |m| m.as_str()); + let domain = captures.get(2)?.as_str(); + Some(format!("{datacenter}{domain}")) +} + /// Error type for invalid endpoints. #[derive(Debug, Snafu)] #[snafu(context(suffix(false)))] @@ -943,6 +968,7 @@ mod tests { use_v3_sketches: false, series_validation_mode: true, sketches_validation_mode: false, + series_shadow_mode: false, }; assert!(settings.should_receive_validation_headers(Some(MetricsPayloadInfo::v2_series()))); @@ -952,6 +978,79 @@ mod tests { assert!(!settings.should_receive_validation_headers(None)); } + #[test] + fn extract_site_from_url_matches_datadog_domains() { + assert_eq!( + Some("datadoghq.com".to_string()), + extract_site_from_url("https://1-2-3-agent.datadoghq.com/api/v2/series") + ); + assert_eq!( + Some("us3.datadoghq.com".to_string()), + extract_site_from_url("https://intake.profile.us3.datadoghq.com/v1/input") + ); + assert_eq!(None, extract_site_from_url("https://vector.example.test/api/v2/series")); + } + + #[test] + fn shadow_payloads_are_endpoint_scoped() { + let resolved = ResolvedEndpoint::from_raw_endpoint("https://app.datadoghq.com", "fake-api-key") + .expect("endpoint should resolve"); + let settings = EndpointV3Settings::from_endpoint_url( + resolved.configured_endpoint(), + resolved.endpoint(), 
+ &[], + &[], + false, + false, + &["datadoghq.com".to_string()], + ); + + assert!(settings.should_receive_payload(Some(MetricsPayloadInfo::v2_shadow_series()))); + assert!(settings.should_receive_payload(Some(MetricsPayloadInfo::v3_shadow_series()))); + assert!(!settings.should_receive_payload(Some(MetricsPayloadInfo::v3_series()))); + assert!(settings.should_receive_validation_headers(Some(MetricsPayloadInfo::v2_shadow_series()))); + assert!(settings.should_receive_validation_headers(Some(MetricsPayloadInfo::v3_shadow_series()))); + } + + #[test] + fn shadow_payloads_require_allowed_site_and_v2_authoritative_endpoint() { + let us3 = ResolvedEndpoint::from_raw_endpoint("https://app.us3.datadoghq.com", "fake-api-key") + .expect("endpoint should resolve"); + let settings = EndpointV3Settings::from_endpoint_url( + us3.configured_endpoint(), + us3.endpoint(), + &[], + &[], + false, + false, + &["datadoghq.com".to_string()], + ); + assert!(!settings.should_receive_payload(Some(MetricsPayloadInfo::v3_shadow_series()))); + + let settings = EndpointV3Settings::from_endpoint_url( + us3.configured_endpoint(), + us3.endpoint(), + &[], + &[], + false, + false, + &["us3.datadoghq.com".to_string()], + ); + assert!(settings.should_receive_payload(Some(MetricsPayloadInfo::v3_shadow_series()))); + + let v3_series_endpoints = vec![us3.configured_endpoint().to_string()]; + let settings = EndpointV3Settings::from_endpoint_url( + us3.configured_endpoint(), + us3.endpoint(), + &v3_series_endpoints, + &[], + false, + false, + &["us3.datadoghq.com".to_string()], + ); + assert!(!settings.should_receive_payload(Some(MetricsPayloadInfo::v3_shadow_series()))); + } + #[test] fn v3_endpoint_matching_uses_configured_endpoint_before_version_prefix() { let resolved = ResolvedEndpoint::from_raw_endpoint("https://app.datadoghq.com", "fake-api-key") @@ -963,10 +1062,12 @@ mod tests { let v3_series_endpoints = vec!["https://app.datadoghq.com".to_string()]; let settings = 
EndpointV3Settings::from_endpoint_url( resolved.configured_endpoint(), + resolved.endpoint(), &v3_series_endpoints, &[], false, false, + &["datadoghq.com".to_string()], ); assert!(settings.use_v3_series); @@ -975,12 +1076,16 @@ mod tests { #[test] fn v3_endpoint_matching_is_endpoint_based() { let v3_series_endpoints = vec!["https://app.us".to_string()]; + let resolved = ResolvedEndpoint::from_raw_endpoint("https://app.us5.datadoghq.com", "fake-api-key") + .expect("endpoint should resolve"); let settings = EndpointV3Settings::from_endpoint_url( - "https://app.us5.datadoghq.com", + resolved.configured_endpoint(), + resolved.endpoint(), &v3_series_endpoints, &[], false, false, + &["datadoghq.com".to_string()], ); assert!(!settings.use_v3_series); @@ -989,8 +1094,17 @@ mod tests { #[test] fn v3_endpoint_matching_requires_exact_configured_endpoint() { let v3_series_endpoints = vec!["app.datadoghq.com/".to_string()]; - let settings = - EndpointV3Settings::from_endpoint_url("https://app.datadoghq.com", &v3_series_endpoints, &[], false, false); + let resolved = ResolvedEndpoint::from_raw_endpoint("https://app.datadoghq.com", "fake-api-key") + .expect("endpoint should resolve"); + let settings = EndpointV3Settings::from_endpoint_url( + resolved.configured_endpoint(), + resolved.endpoint(), + &v3_series_endpoints, + &[], + false, + false, + &["datadoghq.com".to_string()], + ); assert!(!settings.use_v3_series); } diff --git a/lib/saluki-components/src/common/datadog/io.rs b/lib/saluki-components/src/common/datadog/io.rs index bff99da9212..7d18ed9aa5c 100644 --- a/lib/saluki-components/src/common/datadog/io.rs +++ b/lib/saluki-components/src/common/datadog/io.rs @@ -367,10 +367,12 @@ async fn run_endpoint_io_loop( let v3_api = config.v3_api(); let endpoint_v3_settings = EndpointV3Settings::from_endpoint_url( &configured_endpoint, + endpoint.endpoint(), &v3_api.series.endpoints, &v3_api.sketches.endpoints, v3_api.series.validate, v3_api.sketches.validate, + 
&v3_api.series.shadow_sites, ); debug!( endpoint_url, diff --git a/lib/saluki-components/src/common/datadog/protocol.rs b/lib/saluki-components/src/common/datadog/protocol.rs index 394f78ceaa7..72f4f80554d 100644 --- a/lib/saluki-components/src/common/datadog/protocol.rs +++ b/lib/saluki-components/src/common/datadog/protocol.rs @@ -9,6 +9,14 @@ fn default_v3_beta_series_route() -> String { METRICS_SERIES_V3_BETA_PATH.to_owned() } +const fn default_v3_series_shadow_sample_rate() -> f64 { + 0.001 +} + +fn default_v3_series_shadow_sites() -> Vec { + vec!["datadoghq.com".to_string()] +} + /// The type of metrics payload. #[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] pub enum MetricsPayloadType { @@ -40,6 +48,10 @@ pub struct MetricsPayloadInfo { /// The type of metrics (series or sketches). pub payload_type: MetricsPayloadType, + + /// Whether this payload is part of sampled V3 shadow validation. + #[serde(default)] + pub shadow: bool, } impl MetricsPayloadInfo { @@ -48,6 +60,7 @@ impl MetricsPayloadInfo { Self { version: MetricsProtocolVersion::V2, payload_type: MetricsPayloadType::Series, + shadow: false, } } @@ -56,6 +69,7 @@ impl MetricsPayloadInfo { Self { version: MetricsProtocolVersion::V2, payload_type: MetricsPayloadType::Sketches, + shadow: false, } } @@ -64,6 +78,7 @@ impl MetricsPayloadInfo { Self { version: MetricsProtocolVersion::V3, payload_type: MetricsPayloadType::Series, + shadow: false, } } @@ -72,6 +87,25 @@ impl MetricsPayloadInfo { Self { version: MetricsProtocolVersion::V3, payload_type: MetricsPayloadType::Sketches, + shadow: false, + } + } + + /// Creates a new V2 series payload info for sampled shadow validation. + pub const fn v2_shadow_series() -> Self { + Self { + version: MetricsProtocolVersion::V2, + payload_type: MetricsPayloadType::Series, + shadow: true, + } + } + + /// Creates a new V3 series payload info for sampled shadow validation. 
+ pub const fn v3_shadow_series() -> Self { + Self { + version: MetricsProtocolVersion::V3, + payload_type: MetricsPayloadType::Series, + shadow: true, } } @@ -79,6 +113,11 @@ impl MetricsPayloadInfo { pub const fn is_sketch(&self) -> bool { matches!(self.payload_type, MetricsPayloadType::Sketches) } + + /// Returns true if this payload is part of sampled shadow validation. + pub const fn is_shadow(&self) -> bool { + self.shadow + } } /// V3 API settings for a specific metric type (series or sketches). @@ -109,6 +148,22 @@ pub struct V3ApiSettings { /// Defaults to `/api/intake/metrics/v3beta/series`. #[serde(default = "default_v3_beta_series_route")] pub beta_route: String, + + /// Per-flush probability of sending a sampled V3 beta shadow payload. + /// + /// This only applies to series metrics when V3 is not authoritative. + /// + /// Defaults to `0.001`. + #[serde(default = "default_v3_series_shadow_sample_rate")] + pub shadow_sample_rate: f64, + + /// Datadog sites eligible for sampled V3 beta shadow payloads. + /// + /// This only applies to series metrics when V3 is not authoritative. + /// + /// Defaults to `["datadoghq.com"]`. + #[serde(default = "default_v3_series_shadow_sites")] + pub shadow_sites: Vec, } impl Default for V3ApiSettings { @@ -118,6 +173,8 @@ impl Default for V3ApiSettings { validate: false, use_beta: false, beta_route: default_v3_beta_series_route(), + shadow_sample_rate: default_v3_series_shadow_sample_rate(), + shadow_sites: default_v3_series_shadow_sites(), } } } diff --git a/lib/saluki-components/src/config_registry/datadog/encoders.rs b/lib/saluki-components/src/config_registry/datadog/encoders.rs index 5075813ce2f..d6e020fd5cd 100644 --- a/lib/saluki-components/src/config_registry/datadog/encoders.rs +++ b/lib/saluki-components/src/config_registry/datadog/encoders.rs @@ -230,6 +230,36 @@ crate::declare_annotations! 
{ pipeline_affinity: PipelineAffinity::Pipelines(&[Pipeline::DogStatsD, Pipeline::Checks, Pipeline::Traces]), }; + /// `serializer_experimental_use_v3_api.series.shadow_sample_rate`—sample rate for V3 beta series shadows. + SERIALIZER_EXPERIMENTAL_USE_V3_API_SERIES_SHADOW_SAMPLE_RATE = SalukiAnnotation { + schema: &schema::SERIALIZER_EXPERIMENTAL_USE_V3_API_SERIES_SHADOW_SAMPLE_RATE, + support_level: SupportLevel::Full, + additional_yaml_paths: &[], + env_var_override: None, + used_by: &[ + structs::DATADOG_METRICS_CONFIGURATION, + structs::FORWARDER_CONFIGURATION, + ], + value_type_override: None, + test_json: None, + pipeline_affinity: PipelineAffinity::Pipelines(&[Pipeline::DogStatsD, Pipeline::Checks, Pipeline::Traces]), + }; + + /// `serializer_experimental_use_v3_api.series.shadow_sites`—Datadog sites eligible for V3 beta shadows. + SERIALIZER_EXPERIMENTAL_USE_V3_API_SERIES_SHADOW_SITES = SalukiAnnotation { + schema: &schema::SERIALIZER_EXPERIMENTAL_USE_V3_API_SERIES_SHADOW_SITES, + support_level: SupportLevel::Full, + additional_yaml_paths: &[], + env_var_override: None, + used_by: &[ + structs::DATADOG_METRICS_CONFIGURATION, + structs::FORWARDER_CONFIGURATION, + ], + value_type_override: None, + test_json: Some(r#"["us3.datadoghq.com"]"#), + pipeline_affinity: PipelineAffinity::Pipelines(&[Pipeline::DogStatsD, Pipeline::Checks, Pipeline::Traces]), + }; + /// `serializer_experimental_use_v3_api.series.use_beta`—whether to send V3 series payloads to the beta route. 
SERIALIZER_EXPERIMENTAL_USE_V3_API_SERIES_USE_BETA = SalukiAnnotation { schema: &schema::SERIALIZER_EXPERIMENTAL_USE_V3_API_SERIES_USE_BETA, diff --git a/lib/saluki-components/src/config_registry/datadog/unsupported.rs b/lib/saluki-components/src/config_registry/datadog/unsupported.rs index 77f132a5d44..21290a5cba2 100644 --- a/lib/saluki-components/src/config_registry/datadog/unsupported.rs +++ b/lib/saluki-components/src/config_registry/datadog/unsupported.rs @@ -177,34 +177,6 @@ crate::declare_annotations! { pipeline_affinity: PipelineAffinity::CrossCutting, }; - /// `serializer_experimental_use_v3_api.series.shadow_sample_rate` - V3 API series shadow traffic sample rate. - SERIALIZER_EXPERIMENTAL_USE_V3_API_SERIES_SHADOW_SAMPLE_RATE = SalukiAnnotation { - schema: &schema::SERIALIZER_EXPERIMENTAL_USE_V3_API_SERIES_SHADOW_SAMPLE_RATE, - // V3 shadow traffic not implemented. - support_level: SupportLevel::Incompatible(Severity::Low), - additional_yaml_paths: &[], - env_var_override: None, - used_by: &[], - value_type_override: None, - test_json: None, - // Metrics encoder (dd_metrics_encode) is used by DogStatsD, Checks, and OTLP native (Traces active); APM traces use a separate encoder. - pipeline_affinity: PipelineAffinity::Pipelines(&[Pipeline::DogStatsD, Pipeline::Checks, Pipeline::Traces]), - }; - - /// `serializer_experimental_use_v3_api.series.shadow_sites` - V3 API series shadow traffic sites. - SERIALIZER_EXPERIMENTAL_USE_V3_API_SERIES_SHADOW_SITES = SalukiAnnotation { - schema: &schema::SERIALIZER_EXPERIMENTAL_USE_V3_API_SERIES_SHADOW_SITES, - // V3 shadow traffic not implemented. - support_level: SupportLevel::Incompatible(Severity::Low), - additional_yaml_paths: &[], - env_var_override: None, - used_by: &[], - value_type_override: None, - test_json: None, - // Metrics encoder (dd_metrics_encode) is used by DogStatsD, Checks, and OTLP native (Traces active); APM traces use a separate encoder. 
- pipeline_affinity: PipelineAffinity::Pipelines(&[Pipeline::DogStatsD, Pipeline::Checks, Pipeline::Traces]), - }; - /// `serializer_max_series_points_per_payload` - max series points per payload. SERIALIZER_MAX_SERIES_POINTS_PER_PAYLOAD = SalukiAnnotation { schema: &schema::SERIALIZER_MAX_SERIES_POINTS_PER_PAYLOAD, diff --git a/lib/saluki-components/src/encoders/datadog/metrics/mod.rs b/lib/saluki-components/src/encoders/datadog/metrics/mod.rs index 4b64edff5ad..378da6fe6d3 100644 --- a/lib/saluki-components/src/encoders/datadog/metrics/mod.rs +++ b/lib/saluki-components/src/encoders/datadog/metrics/mod.rs @@ -32,7 +32,12 @@ use tokio::{io::AsyncWriteExt as _, select, sync::mpsc, time::sleep}; use tracing::{debug, error, warn}; use uuid::Uuid; -use self::v3::{V3EncodedRequest, V3PayloadLimits, V3PayloadRequest}; +#[cfg(test)] +use self::shadow::shadow_sample_matches; +use self::{ + shadow::{SeriesShadowConfig, SeriesShadowState}, + v3::{V3EncodedRequest, V3PayloadLimits, V3PayloadRequest}, +}; use crate::{ common::datadog::{ clamp_payload_limits, @@ -49,6 +54,7 @@ use crate::{ mod endpoint; use self::endpoint::{EndpointConfiguration, MetricsEndpoint}; +mod shadow; mod v1; mod v2; mod v3; @@ -100,6 +106,14 @@ const fn default_log_payloads() -> bool { false } +fn series_shadow_config_for_endpoint(series_endpoint: MetricsEndpoint, sample_rate: f64) -> SeriesShadowConfig { + SeriesShadowConfig::new(if series_endpoint == MetricsEndpoint::SeriesV2 { + sample_rate + } else { + 0.0 + }) +} + /// Encoding mode for a metrics endpoint. 
#[derive(Clone, Copy, Debug, PartialEq, Eq)] enum MetricsEncoderMode { @@ -301,12 +315,13 @@ impl EncoderBuilder for DatadogMetricsConfiguration { } else { v2_compression_scheme }; - let v3_series_endpoint_uri = if self.v3_api.series.use_beta { + let series_endpoint_uri = if self.v3_api.series.use_beta { self.v3_api.series.beta_route.clone() } else { V3_SERIES_ENDPOINT_URI.to_string() }; - let v3_payload_limits = V3PayloadLimits::new( + let shadow_series_endpoint_uri = self.v3_api.series.beta_route.clone(); + let payload_limits = V3PayloadLimits::new( self.max_series_payload_size, self.max_series_uncompressed_payload_size, self.max_metrics_per_payload, @@ -318,7 +333,7 @@ impl EncoderBuilder for DatadogMetricsConfiguration { self.max_metrics_per_payload, self.additional_tags.clone(), ); - let v3_endpoint_config = EndpointConfiguration::new( + let endpoint_config = EndpointConfiguration::new( v3_compression_scheme, self.max_metrics_per_payload, self.additional_tags.clone(), @@ -329,12 +344,20 @@ impl EncoderBuilder for DatadogMetricsConfiguration { MetricsEncoderMode::from_config(self.v3_api.use_v3_series(), self.v3_api.use_v3_series_validate()); let sketches_mode = MetricsEncoderMode::from_config(self.v3_api.use_v3_sketches(), self.v3_api.use_v3_sketches_validate()); - let series_endpoint = if self.use_v2_api_series { MetricsEndpoint::SeriesV2 } else { MetricsEndpoint::SeriesV1 }; + let series_shadow_config = + series_shadow_config_for_endpoint(series_endpoint, self.v3_api.series.shadow_sample_rate); + let v3_runtime_config = V3RuntimeConfig { + endpoint_config, + payload_limits, + series_endpoint_uri, + shadow_series_endpoint_uri, + series_shadow_config, + }; let generic_payload_limits = clamp_payload_limits( self.max_uncompressed_payload_size, self.max_payload_size, @@ -386,9 +409,7 @@ impl EncoderBuilder for DatadogMetricsConfiguration { v2_sketch_builder, series_mode, sketches_mode, - v3_endpoint_config, - v3_payload_limits, - v3_series_endpoint_uri, + 
v3_runtime_config, telemetry, flush_timeout, log_payloads: self.log_payloads, @@ -425,14 +446,20 @@ pub struct DatadogMetrics { v2_sketch_builder: Option>, series_mode: MetricsEncoderMode, sketches_mode: MetricsEncoderMode, - v3_endpoint_config: EndpointConfiguration, - v3_payload_limits: V3PayloadLimits, - v3_series_endpoint_uri: String, + v3_runtime_config: V3RuntimeConfig, telemetry: ComponentTelemetry, flush_timeout: Duration, log_payloads: bool, } +struct V3RuntimeConfig { + endpoint_config: EndpointConfiguration, + payload_limits: V3PayloadLimits, + series_endpoint_uri: String, + shadow_series_endpoint_uri: String, + series_shadow_config: SeriesShadowConfig, +} + #[async_trait] impl Encoder for DatadogMetrics { async fn run(mut self: Box, mut context: EncoderContext) -> Result<(), GenericError> { @@ -441,9 +468,7 @@ impl Encoder for DatadogMetrics { v2_sketch_builder, series_mode, sketches_mode, - v3_endpoint_config, - v3_payload_limits, - v3_series_endpoint_uri, + v3_runtime_config, telemetry, flush_timeout, log_payloads, @@ -459,9 +484,7 @@ impl Encoder for DatadogMetrics { v2_sketch_builder, series_mode, sketches_mode, - v3_endpoint_config, - v3_payload_limits, - v3_series_endpoint_uri, + v3_runtime_config, telemetry, events_rx, payloads_tx, @@ -538,26 +561,35 @@ fn log_metric_payload(metric: &Metric) { async fn run_request_builder( mut v2_series_builder: Option>, mut v2_sketch_builder: Option>, series_mode: MetricsEncoderMode, - sketches_mode: MetricsEncoderMode, v3_endpoint_config: EndpointConfiguration, v3_payload_limits: V3PayloadLimits, - v3_series_endpoint_uri: String, telemetry: ComponentTelemetry, mut events_rx: mpsc::Receiver, - mut payloads_tx: mpsc::Sender, flush_timeout: Duration, log_payloads: bool, + sketches_mode: MetricsEncoderMode, v3_runtime_config: V3RuntimeConfig, telemetry: ComponentTelemetry, + mut events_rx: mpsc::Receiver, mut payloads_tx: mpsc::Sender, + flush_timeout: Duration, log_payloads: bool, ) -> Result<(), GenericError> { 
let mut pending_flush = false; let pending_flush_timeout = sleep(flush_timeout); tokio::pin!(pending_flush_timeout); - let mut v3_series_metrics = series_mode.needs_v3().then(Vec::::new); + let mut v3_series_metrics = + (series_mode.needs_v3() || v3_runtime_config.series_shadow_config.is_enabled()).then(Vec::::new); let mut v3_sketch_metrics = sketches_mode.needs_v3().then(Vec::::new); let mut series_batch_id = None; let mut sketches_batch_id = None; + let mut series_shadow_state = SeriesShadowState::default(); + let series_shadow_config = v3_runtime_config.series_shadow_config; let tag_series = series_mode.needs_tagging(); let tag_sketches = sketches_mode.needs_tagging(); let v3_flush_context = V3FlushContext { - endpoint_config: &v3_endpoint_config, - payload_limits: v3_payload_limits, - series_endpoint_uri: &v3_series_endpoint_uri, + endpoint_config: &v3_runtime_config.endpoint_config, + payload_limits: v3_runtime_config.payload_limits, + series_endpoint_uri: &v3_runtime_config.series_endpoint_uri, + telemetry: &telemetry, + }; + let v3_shadow_flush_context = V3FlushContext { + endpoint_config: &v3_runtime_config.endpoint_config, + payload_limits: v3_runtime_config.payload_limits, + series_endpoint_uri: &v3_runtime_config.shadow_series_endpoint_uri, telemetry: &telemetry, }; @@ -590,17 +622,25 @@ async fn run_request_builder( &mut sketches_batch_id, ), }; - if endpoint_mode.needs_batch_id() && batch_id.is_none() { + let is_series = matches!(endpoint, MetricsEndpoint::SeriesV1 | MetricsEndpoint::SeriesV2); + let series_shadow_active = is_series + && matches!(endpoint_mode, MetricsEncoderMode::V2Only) + && series_shadow_state.ensure_decision(series_shadow_config); + let needs_batch_id = endpoint_mode.needs_batch_id() || series_shadow_active; + if needs_batch_id && batch_id.is_none() { *batch_id = Some(Uuid::now_v7()); } - let active_batch_id = endpoint_mode.needs_batch_id().then_some(batch_id.as_ref()).flatten(); + let active_batch_id = 
needs_batch_id.then_some(batch_id.as_ref()).flatten(); + let should_buffer_v3 = endpoint_mode.needs_v3() || series_shadow_active; // Store a copy of the metric in `maybe_v3_metrics` if it's present. // // We have to do this before encoding because `RequestBuilder::encode` consumes the metric. This also means we'll // need to _remove_ the metric if encoding fails. - if let Some(metrics) = maybe_v3_metrics { - metrics.push(metric.clone()); + if should_buffer_v3 { + if let Some(metrics) = maybe_v3_metrics { + metrics.push(metric.clone()); + } } // Attempt encoding the metric for V2 if configured. @@ -610,12 +650,27 @@ async fn run_request_builder( // the metric wasn't encoded for V2 and we want our V2/V3 payload batches to be consistent in // validation mode. let v2_payload_info = match endpoint { - MetricsEndpoint::SeriesV1 | MetricsEndpoint::SeriesV2 => tag_series.then(MetricsPayloadInfo::v2_series), + MetricsEndpoint::SeriesV1 | MetricsEndpoint::SeriesV2 => { + if series_shadow_active { + Some(MetricsPayloadInfo::v2_shadow_series()) + } else { + tag_series.then(MetricsPayloadInfo::v2_series) + } + } MetricsEndpoint::Sketches => tag_sketches.then(MetricsPayloadInfo::v2_sketches), }; + // If V2 flushes while this batch is not shadowed, the current metric may start the next V2 batch. + // Keep a clone so we can add it to the next V3 shadow batch if that next batch samples in. 
+ let metric_for_next_shadow_batch = (is_series + && matches!(endpoint_mode, MetricsEncoderMode::V2Only) + && !series_shadow_active) + .then(|| metric.clone()); + let mut v2_encoded = false; let v2_flushed = if let Some(builder) = maybe_v2_builder { - let result = encode_v2_metrics(builder, metric, &telemetry, &mut payloads_tx, active_batch_id, v2_payload_info).await?; - if !result.encoded() { + let result = + encode_v2_metrics(builder, metric, &telemetry, &mut payloads_tx, active_batch_id, v2_payload_info).await?; + v2_encoded = result.encoded(); + if should_buffer_v3 && !result.encoded() { if let Some(metrics) = maybe_v3_metrics { let _ = metrics.pop(); } @@ -629,13 +684,19 @@ async fn run_request_builder( // If we flushed via V2, or we've hit our max metrics per payload limit in pure V3 mode, we need to flush our V3 metrics // as well. let v3_payload_info = match endpoint { - MetricsEndpoint::SeriesV1 | MetricsEndpoint::SeriesV2 => tag_series.then(MetricsPayloadInfo::v3_series), + MetricsEndpoint::SeriesV1 | MetricsEndpoint::SeriesV2 => { + if series_shadow_active { + Some(MetricsPayloadInfo::v3_shadow_series()) + } else { + tag_series.then(MetricsPayloadInfo::v3_series) + } + } MetricsEndpoint::Sketches => tag_sketches.then(MetricsPayloadInfo::v3_sketches), }; - let mut carried_metric_into_next_batch = false; + let mut split_metric = None; let v3_flushed = if let Some(v3_metrics) = maybe_v3_metrics { let should_flush_v3 = match endpoint_mode { - MetricsEncoderMode::V2Only => false, + MetricsEncoderMode::V2Only => series_shadow_active && v2_flushed, MetricsEncoderMode::V3Enabled => { v2_flushed || v3_flush_context.payload_limits.should_flush_metric_count_limit(v3_metrics) } @@ -645,20 +706,21 @@ async fn run_request_builder( // V2 flushes the previous batch without the current metric (the metric // that triggered the flush is re-encoded into the next V2 batch). Pop it // from V3 before flushing so both batches cover the same set of metrics. 
- let split_metric = if v2_flushed { v3_metrics.pop() } else { None }; + split_metric = if v2_flushed { v3_metrics.pop() } else { None }; + let flush_context = if series_shadow_active { + v3_shadow_flush_context + } else { + v3_flush_context + }; encode_and_flush_v3_metrics( endpoint, - v3_flush_context, + flush_context, v3_metrics, &mut payloads_tx, active_batch_id, v3_payload_info, ) .await?; - if let Some(m) = split_metric { - carried_metric_into_next_batch = true; - v3_metrics.push(m); - } true } else { false @@ -667,10 +729,52 @@ async fn run_request_builder( false }; - // If a V2-triggered split leaves the current metric pending in the next batch, assign that pending - // V2/V3 pair a fresh validation ID. Otherwise, the next timeout flush would omit validation headers. + // A validation flush completes the current V2/V3 pair. If V2 carried the current metric into the + // next batch, keep the V3 copy and assign that next pair a fresh validation ID. if endpoint_mode.needs_batch_id() && (v2_flushed || v3_flushed) { - *batch_id = carried_metric_into_next_batch.then(Uuid::now_v7); + *batch_id = if let Some(m) = split_metric.take() { + if let Some(metrics) = maybe_v3_metrics { + metrics.push(m); + } + Some(Uuid::now_v7()) + } else { + None + }; + } else if is_series && series_shadow_active && (v2_flushed || v3_flushed) { + // A shadow flush completes the current sampled pair. The next V2 batch gets a new shadow sample + // decision, so only carry the split metric into V3 if that next batch samples in. 
+ series_shadow_state.reset(); + *batch_id = if let Some(m) = split_metric.take() { + if series_shadow_state.ensure_decision(series_shadow_config) { + if let Some(metrics) = maybe_v3_metrics { + metrics.push(m); + } + Some(Uuid::now_v7()) + } else { + None + } + } else { + None + }; + } else if is_series + && matches!(endpoint_mode, MetricsEncoderMode::V2Only) + && series_shadow_config.is_enabled() + && v2_flushed + { + // This V2 batch was not shadowed, but the flushed metric may have started the next V2 batch. + // Re-sample for that next batch and seed the V3 buffer only if the new decision samples in. + series_shadow_state.reset(); + *batch_id = None; + if v2_encoded { + if let Some(m) = metric_for_next_shadow_batch { + if series_shadow_state.ensure_decision(series_shadow_config) { + if let Some(metrics) = maybe_v3_metrics { + metrics.push(m); + } + *batch_id = Some(Uuid::now_v7()); + } + } + } } } @@ -688,8 +792,18 @@ async fn run_request_builder( pending_flush = false; // Flush any pending series metrics. - let v2_series_payload_info = tag_series.then(MetricsPayloadInfo::v2_series); - let series_active_batch_id = series_mode.needs_batch_id().then_some(series_batch_id.as_ref()).flatten(); + // Timeout flushes complete the current batch, so reuse the existing shadow decision instead of sampling + // a new one for metrics that are already buffered. 
+ let series_shadow_active = matches!(series_mode, MetricsEncoderMode::V2Only) + && series_shadow_state.active.unwrap_or(false); + let v2_series_payload_info = if series_shadow_active { + Some(MetricsPayloadInfo::v2_shadow_series()) + } else { + tag_series.then(MetricsPayloadInfo::v2_series) + }; + let series_active_batch_id = (series_mode.needs_batch_id() || series_shadow_active) + .then_some(series_batch_id.as_ref()) + .flatten(); let mut v2_series_flush_succeeded = true; if let Some(builder) = &mut v2_series_builder { if let Err(e) = flush_v2_metrics(builder, &mut payloads_tx, series_active_batch_id, v2_series_payload_info).await { @@ -698,11 +812,21 @@ async fn run_request_builder( } } - let v3_series_payload_info = tag_series.then(MetricsPayloadInfo::v3_series); + let v3_series_payload_info = if series_shadow_active { + Some(MetricsPayloadInfo::v3_shadow_series()) + } else { + tag_series.then(MetricsPayloadInfo::v3_series) + }; if let Some(metrics) = &mut v3_series_metrics { if v2_series_flush_succeeded { + // Shadow series use the V3 beta route; normal V3 series use the configured authoritative route. + let flush_context = if series_shadow_active { + v3_shadow_flush_context + } else { + v3_flush_context + }; if let Err(e) = encode_and_flush_v3_series_metrics( - v3_flush_context, + flush_context, metrics, &mut payloads_tx, series_active_batch_id, @@ -713,6 +837,7 @@ async fn run_request_builder( error!(error = %e, "Failed to flush V3 series metrics: {}", e); } } else { + // Validation/shadow V3 must not outlive a failed V2 baseline flush. warn!("Failed to flush V2 series metrics, skipping V3 series flush."); metrics.clear(); } @@ -720,6 +845,10 @@ async fn run_request_builder( if series_mode.needs_batch_id() { series_batch_id = None; } + if matches!(series_mode, MetricsEncoderMode::V2Only) && series_shadow_config.is_enabled() { + series_shadow_state.reset(); + series_batch_id = None; + } // Flush any pending sketch metrics. 
let v2_sketches_payload_info = tag_sketches.then(MetricsPayloadInfo::v2_sketches); @@ -1377,6 +1506,9 @@ serializer_experimental_use_v3_api: validate: true use_beta: true beta_route: /api/intake/metrics/custom/series + shadow_sample_rate: 0.25 + shadow_sites: + - datadoghq.eu sketches: endpoints: - https://app.datadoghq.eu @@ -1393,12 +1525,36 @@ serializer_experimental_use_v3_api: assert!(config.v3_api.series.validate); assert!(config.v3_api.series.use_beta); assert_eq!("/api/intake/metrics/custom/series", config.v3_api.series.beta_route); + assert_eq!(0.25, config.v3_api.series.shadow_sample_rate); + assert_eq!(vec!["datadoghq.eu"], config.v3_api.series.shadow_sites); assert_eq!( Some("https://app.datadoghq.eu"), config.v3_api.sketches.endpoints.first().map(String::as_str) ); } + #[test] + fn agent_v3_api_shadow_defaults_match_agent() { + let config = serde_yaml::from_str::("").expect("configuration should deserialize"); + + assert_eq!(0.001, config.v3_api.series.shadow_sample_rate); + assert_eq!(vec!["datadoghq.com"], config.v3_api.series.shadow_sites); + } + + #[test] + fn shadow_sample_matches_agent_threshold_behavior() { + assert!(!shadow_sample_matches(0.0, 0.0)); + assert!(shadow_sample_matches(0.5, 0.4)); + assert!(!shadow_sample_matches(0.5, 0.5)); + assert!(!shadow_sample_matches(0.5, 0.6)); + } + + #[test] + fn shadow_sampling_is_disabled_for_v1_series_baseline() { + assert!(series_shadow_config_for_endpoint(MetricsEndpoint::SeriesV2, 1.0).is_enabled()); + assert!(!series_shadow_config_for_endpoint(MetricsEndpoint::SeriesV1, 1.0).is_enabled()); + } + #[tokio::test] async fn create_v3_request_uses_configured_endpoint_uri() { let request = create_v3_request( @@ -1806,9 +1962,13 @@ serializer_experimental_use_v3_api: None, MetricsEncoderMode::Validation, MetricsEncoderMode::V2Only, - v3_endpoint_config, - V3PayloadLimits::new(usize::MAX, usize::MAX, 10_000, 10_000), - V3_SERIES_ENDPOINT_URI.to_string(), + V3RuntimeConfig { + endpoint_config: 
v3_endpoint_config, + payload_limits: V3PayloadLimits::new(usize::MAX, usize::MAX, 10_000, 10_000), + series_endpoint_uri: V3_SERIES_ENDPOINT_URI.to_string(), + shadow_series_endpoint_uri: "/api/intake/metrics/v3beta/series".to_string(), + series_shadow_config: SeriesShadowConfig::new(0.0), + }, telemetry, events_rx, payloads_tx, @@ -1864,6 +2024,143 @@ serializer_experimental_use_v3_api: .expect("request builder should stop cleanly"); } + #[tokio::test] + async fn shadow_sampled_series_flush_sends_v2_and_v3_beta_with_same_batch_id() { + let v2_endpoint_config = EndpointConfiguration::new(CompressionScheme::noop(), 10_000, None); + let v2_series_builder = Some( + v2::create_v2_request_builder(MetricsEndpoint::SeriesV2, &v2_endpoint_config) + .await + .expect("V2 request builder should be created"), + ); + let v3_endpoint_config = EndpointConfiguration::new(CompressionScheme::noop(), 10_000, None); + let telemetry = ComponentTelemetry::from_builder(&MetricsBuilder::default()); + let (events_tx, events_rx) = tokio::sync::mpsc::channel(1); + let (payloads_tx, mut payloads_rx) = tokio::sync::mpsc::channel(8); + + let request_builder_handle = tokio::spawn(run_request_builder( + v2_series_builder, + None, + MetricsEncoderMode::V2Only, + MetricsEncoderMode::V2Only, + V3RuntimeConfig { + endpoint_config: v3_endpoint_config, + payload_limits: V3PayloadLimits::new(usize::MAX, usize::MAX, 10_000, 10_000), + series_endpoint_uri: V3_SERIES_ENDPOINT_URI.to_string(), + shadow_series_endpoint_uri: "/api/intake/metrics/v3beta/custom".to_string(), + series_shadow_config: SeriesShadowConfig::new(1.0), + }, + telemetry, + events_rx, + payloads_tx, + Duration::from_millis(10), + false, + )); + + let mut events = EventsBuffer::default(); + assert!(events + .try_push(Event::Metric(Metric::counter("shadow.sampled", 1.0))) + .is_none()); + events_tx + .send(events) + .await + .expect("events should be sent to request builder"); + + let mut flushed_requests = Vec::new(); + for _ in 0..2 { + 
let payload = timeout(Duration::from_secs(1), payloads_rx.recv()) + .await + .expect("payload should arrive before timeout") + .expect("payload channel should remain open"); + let Payload::Http(http_payload) = payload else { + panic!("expected HTTP payload"); + }; + let (metadata, request) = http_payload.into_parts(); + let payload_info = *metadata + .get::<MetricsPayloadInfo>() + .expect("metrics payload info should be present"); + let batch_id = request + .headers() + .get("X-Metrics-Request-ID") + .expect("shadow batch ID header should be present") + .to_str() + .expect("shadow batch ID should be valid header text") + .to_string(); + flushed_requests.push((request.uri().to_string(), payload_info, batch_id)); + } + + assert_eq!("/api/v2/series", flushed_requests[0].0); + assert_eq!(MetricsPayloadInfo::v2_shadow_series(), flushed_requests[0].1); + assert_eq!("/api/intake/metrics/v3beta/custom", flushed_requests[1].0); + assert_eq!(MetricsPayloadInfo::v3_shadow_series(), flushed_requests[1].1); + assert_eq!(flushed_requests[0].2, flushed_requests[1].2); + + drop(events_tx); + request_builder_handle + .await + .expect("request builder task should complete") + .expect("request builder should stop cleanly"); + } + + #[tokio::test] + async fn shadow_sample_rate_zero_sends_only_v2_without_validation_headers() { + let v2_endpoint_config = EndpointConfiguration::new(CompressionScheme::noop(), 10_000, None); + let v2_series_builder = Some( + v2::create_v2_request_builder(MetricsEndpoint::SeriesV2, &v2_endpoint_config) + .await + .expect("V2 request builder should be created"), + ); + let v3_endpoint_config = EndpointConfiguration::new(CompressionScheme::noop(), 10_000, None); + let telemetry = ComponentTelemetry::from_builder(&MetricsBuilder::default()); + let (events_tx, events_rx) = tokio::sync::mpsc::channel(1); + let (payloads_tx, mut payloads_rx) = tokio::sync::mpsc::channel(8); + + let request_builder_handle = tokio::spawn(run_request_builder( + v2_series_builder, + None, +
MetricsEncoderMode::V2Only, + MetricsEncoderMode::V2Only, + V3RuntimeConfig { + endpoint_config: v3_endpoint_config, + payload_limits: V3PayloadLimits::new(usize::MAX, usize::MAX, 10_000, 10_000), + series_endpoint_uri: V3_SERIES_ENDPOINT_URI.to_string(), + shadow_series_endpoint_uri: "/api/intake/metrics/v3beta/series".to_string(), + series_shadow_config: SeriesShadowConfig::new(0.0), + }, + telemetry, + events_rx, + payloads_tx, + Duration::from_millis(10), + false, + )); + + let mut events = EventsBuffer::default(); + assert!(events + .try_push(Event::Metric(Metric::counter("shadow.disabled", 1.0))) + .is_none()); + events_tx + .send(events) + .await + .expect("events should be sent to request builder"); + + let payload = timeout(Duration::from_secs(1), payloads_rx.recv()) + .await + .expect("payload should arrive before timeout") + .expect("payload channel should remain open"); + let Payload::Http(http_payload) = payload else { + panic!("expected HTTP payload"); + }; + let (_, request) = http_payload.into_parts(); + assert_eq!("/api/v2/series", request.uri()); + assert!(!request.headers().contains_key("X-Metrics-Request-ID")); + assert!(timeout(Duration::from_millis(50), payloads_rx.recv()).await.is_err()); + + drop(events_tx); + request_builder_handle + .await + .expect("request builder task should complete") + .expect("request builder should stop cleanly"); + } + fn contains_bytes(haystack: &[u8], needle: &[u8]) -> bool { haystack.windows(needle.len()).any(|window| window == needle) } @@ -1896,6 +2193,8 @@ mod config_smoke { structs::DATADOG_METRICS_CONFIGURATION, &[ "serializer_experimental_use_v3_api.sketches.beta_route", + "serializer_experimental_use_v3_api.sketches.shadow_sample_rate", + "serializer_experimental_use_v3_api.sketches.shadow_sites", "serializer_experimental_use_v3_api.sketches.use_beta", ], json!({}), diff --git a/lib/saluki-components/src/encoders/datadog/metrics/shadow.rs b/lib/saluki-components/src/encoders/datadog/metrics/shadow.rs new 
file mode 100644 index 00000000000..371aa66fcf6 --- /dev/null +++ b/lib/saluki-components/src/encoders/datadog/metrics/shadow.rs @@ -0,0 +1,39 @@ +use rand::RngExt as _; + +#[derive(Clone, Copy, Debug)] +pub(super) struct SeriesShadowConfig { + sample_rate: f64, +} + +impl SeriesShadowConfig { + pub(super) const fn new(sample_rate: f64) -> Self { + Self { sample_rate } + } + + pub(super) fn is_enabled(self) -> bool { + self.sample_rate > 0.0 + } + + fn sample(self) -> bool { + self.is_enabled() && shadow_sample_matches(self.sample_rate, rand::rng().random::<f64>()) + } +} + +#[derive(Debug, Default)] +pub(super) struct SeriesShadowState { + pub(super) active: Option<bool>, +} + +impl SeriesShadowState { + pub(super) fn ensure_decision(&mut self, config: SeriesShadowConfig) -> bool { + *self.active.get_or_insert_with(|| config.sample()) + } + + pub(super) fn reset(&mut self) { + self.active = None; + } +} + +pub(super) fn shadow_sample_matches(sample_rate: f64, sample: f64) -> bool { + sample_rate > 0.0 && sample < sample_rate +} diff --git a/lib/saluki-components/src/forwarders/datadog/mod.rs b/lib/saluki-components/src/forwarders/datadog/mod.rs index b60943bcf23..7f96cbfcf16 100644 --- a/lib/saluki-components/src/forwarders/datadog/mod.rs +++ b/lib/saluki-components/src/forwarders/datadog/mod.rs @@ -5,7 +5,7 @@ use saluki_common::buf::FrozenChunkedBytesBuffer; use saluki_config::GenericConfiguration; use saluki_core::{ components::{forwarders::*, ComponentContext}, - data_model::payload::PayloadType, + data_model::payload::{PayloadMetadata, PayloadType}, observability::ComponentMetricsExt as _, }; use saluki_error::GenericError; @@ -17,6 +17,7 @@ use tracing::debug; use crate::common::datadog::{ config::ForwarderConfiguration, io::TransactionForwarder, + protocol::MetricsPayloadInfo, telemetry::ComponentTelemetry, transaction::{Metadata, Transaction}, DEFAULT_INTAKE_COMPRESSED_SIZE_LIMIT, @@ -151,10 +152,7 @@ impl Forwarder for Datadog { maybe_payload =
context.payloads().next() => match maybe_payload { Some(payload) => if let Some(http_payload) = payload.try_into_http_payload() { let (payload_meta, request) = http_payload.into_parts(); - let transaction_meta = Metadata::from_event_and_data_point_count( - payload_meta.event_count(), - payload_meta.data_point_count(), - ); + let transaction_meta = transaction_metadata_from_payload_metadata(&payload_meta); let transaction = Transaction::from_original(transaction_meta, request); forwarder.send_transaction(transaction).await?; @@ -173,6 +171,13 @@ impl Forwarder for Datadog { } } +fn transaction_metadata_from_payload_metadata(payload_meta: &PayloadMetadata) -> Metadata { + let mut transaction_meta = + Metadata::from_event_and_data_point_count(payload_meta.event_count(), payload_meta.data_point_count()); + transaction_meta.payload_info = payload_meta.get::<MetricsPayloadInfo>().copied(); + transaction_meta +} + fn get_dd_endpoint_name(uri: &Uri) -> Option<MetaString> { match uri.path() { "/api/v2/logs" => Some(MetaString::from_static("logs_v2")), diff --git a/test/correctness/cases/dsd-plain-v3-validation/config.yaml b/test/correctness/cases/dsd-plain-v3-validation/config.yaml new file mode 100644 index 00000000000..aa81d7d304f --- /dev/null +++ b/test/correctness/cases/dsd-plain-v3-validation/config.yaml @@ -0,0 +1,19 @@ +type: correctness +runtime: docker +analysis_mode: metrics +baseline: + image: saluki-images/datadog-agent:testing-release + files: + - datadog.yaml:/etc/datadog-agent/datadog.yaml + additional_env_vars: + - DD_API_KEY=correctness-test +comparison: + image: saluki-images/datadog-agent:testing-release + files: + - datadog.yaml:/etc/datadog-agent/datadog.yaml + additional_env_vars: + - DD_API_KEY=correctness-test + - DD_DATA_PLANE_ENABLED=true + - DD_DATA_PLANE_STANDALONE_MODE=true + - DD_DATA_PLANE_DOGSTATSD_ENABLED=true + - DD_AGGREGATE_CONTEXT_LIMIT=500000 diff --git a/test/correctness/cases/dsd-plain-v3-validation/datadog.yaml
b/test/correctness/cases/dsd-plain-v3-validation/datadog.yaml new file mode 100644 index 00000000000..98525edcdd0 --- /dev/null +++ b/test/correctness/cases/dsd-plain-v3-validation/datadog.yaml @@ -0,0 +1,36 @@ +# Using a fixed hostname is both required to avoid errors, and also will ensure consistent tags between DSD/ADP. +hostname: "correctness-testing" + +# Dummy API key. +api_key: dummy-api-key-correctness-testing + +# We have to specifically configure the health port to use. +health_port: 5555 + +# Point ourselves at the datadog-intake service. +dd_url: "http://datadog-intake:2049" + +# Turn off UDP and listen on a UDS socket instead. +dogstatsd_port: 0 +dogstatsd_socket: /airlock/metrics.sock + +# Ensure origin detection is disabled since we can't support it with ADP in standalone mode. +dogstatsd_origin_detection: false + +# Gauges can be processed out-of-order when multiple workers are used, while ADP does not use multiple workers, so ADP +# always ends up with the correct (last seen) value, while DSD might return the last seen value... or the value seen +# four updates ago, etc etc. +dogstatsd_workers_count: 1 + +# Enable V3 metrics encoding in validation mode: both V2 and V3 payloads are sent simultaneously, +# paired by X-Metrics-Request-ID. V3 payloads are counted in the metrics dump; V2 payloads are +# used only for comparison against V3 to validate encoding correctness. 
+serializer_experimental_use_v3_api: + series: + endpoints: + - "http://datadog-intake:2049" + validate: true + sketches: + endpoints: + - "http://datadog-intake:2049" + validate: true diff --git a/test/correctness/cases/dsd-plain-v3-validation/millstone.yaml b/test/correctness/cases/dsd-plain-v3-validation/millstone.yaml new file mode 100644 index 00000000000..8dae7d34cfc --- /dev/null +++ b/test/correctness/cases/dsd-plain-v3-validation/millstone.yaml @@ -0,0 +1,57 @@ +seed: [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131] +target: "unixgram:///$GROUP-airlock/metrics.sock" +aggregation_bucket_width_secs: 10 +volume: 10000 +corpus: + # TODO: This is a little confusing, because we're specifying the number of metrics to generate (which we _will_ + # honor faithfully) but since we're specifying the contexts count in the payload definition, we might not + # actually generate 10,000 unique contexts, but instead somewhere below 3,000, where each of them is repeated a + # few times to reach the total count. + # + # We need to figure that out, since the intent is that specifying a fixed count should lead to that many metrics + # (and no more) being generated, such that you could depend on that for testing purposes. + size: 10000 + payload: + dogstatsd: + contexts: + constant: 3000 + name_length: + inclusive: + min: 1 + max: 32 + tag_length: + inclusive: + min: 3 + max: 16 + tags_per_msg: + inclusive: + min: 2 + max: 8 + value: + float_probability: 0.5 + range: + inclusive: + min: -9999999 + max: 9999999 + multivalue_count: + inclusive: + min: 2 + max: 32 + multivalue_pack_probability: 0.08 + kind_weights: + metric: 100 + event: 0 + service_check: 0 + # Weights based on analyzing internal Datadog usage data of metric type for metrics sent to the Agent over DogStatsD. 
+ metric_weights: + count: 208 + gauge: 66 + timer: 0 + distribution: 72 + # We specifically _don't_ want to generate sets, because we can't assert their correctness once they've been + # aggregated: a gauge is generated for each aggregator flush that represents the unique number of values in a + # given set, but in general, gauges are meant to be last-write-wins, so unless the metric names/tags can + # indicate that they're for a set, we can't know that it's safe for us to _aggregate_ the gauge values, and with + # our default behavior of taking the latest gauge value... we end up with non-deterministic results. + set: 0 + histogram: 1 diff --git a/test/correctness/cases/dsd-plain-v3/config.yaml b/test/correctness/cases/dsd-plain-v3/config.yaml new file mode 100644 index 00000000000..aa81d7d304f --- /dev/null +++ b/test/correctness/cases/dsd-plain-v3/config.yaml @@ -0,0 +1,19 @@ +type: correctness +runtime: docker +analysis_mode: metrics +baseline: + image: saluki-images/datadog-agent:testing-release + files: + - datadog.yaml:/etc/datadog-agent/datadog.yaml + additional_env_vars: + - DD_API_KEY=correctness-test +comparison: + image: saluki-images/datadog-agent:testing-release + files: + - datadog.yaml:/etc/datadog-agent/datadog.yaml + additional_env_vars: + - DD_API_KEY=correctness-test + - DD_DATA_PLANE_ENABLED=true + - DD_DATA_PLANE_STANDALONE_MODE=true + - DD_DATA_PLANE_DOGSTATSD_ENABLED=true + - DD_AGGREGATE_CONTEXT_LIMIT=500000 diff --git a/test/correctness/cases/dsd-plain-v3/datadog.yaml b/test/correctness/cases/dsd-plain-v3/datadog.yaml new file mode 100644 index 00000000000..467143d6cdb --- /dev/null +++ b/test/correctness/cases/dsd-plain-v3/datadog.yaml @@ -0,0 +1,34 @@ +# Using a fixed hostname is both required to avoid errors, and also will ensure consistent tags between DSD/ADP. +hostname: "correctness-testing" + +# Dummy API key. +api_key: dummy-api-key-correctness-testing + +# We have to specifically configure the health port to use. 
+health_port: 5555 + +# Point ourselves at the datadog-intake service. +dd_url: "http://datadog-intake:2049" + +# Turn off UDP and listen on a UDS socket instead. +dogstatsd_port: 0 +dogstatsd_socket: /airlock/metrics.sock + +# Ensure origin detection is disabled since we can't support it with ADP in standalone mode. +dogstatsd_origin_detection: false + +# Gauges can be processed out-of-order when multiple workers are used, while ADP does not use multiple workers, so ADP +# always ends up with the correct (last seen) value, while DSD might return the last seen value... or the value seen +# four updates ago, etc etc. +dogstatsd_workers_count: 1 + +# Enable V3 metrics encoding for all endpoints. +# +# Validation mode is off here so this case compares Agent V3 output against ADP V3 output directly. +serializer_experimental_use_v3_api: + series: + endpoints: + - "http://datadog-intake:2049" + sketches: + endpoints: + - "http://datadog-intake:2049" diff --git a/test/correctness/cases/dsd-plain-v3/millstone.yaml b/test/correctness/cases/dsd-plain-v3/millstone.yaml new file mode 100644 index 00000000000..8dae7d34cfc --- /dev/null +++ b/test/correctness/cases/dsd-plain-v3/millstone.yaml @@ -0,0 +1,57 @@ +seed: [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131] +target: "unixgram:///$GROUP-airlock/metrics.sock" +aggregation_bucket_width_secs: 10 +volume: 10000 +corpus: + # TODO: This is a little confusing, because we're specifying the number of metrics to generate (which we _will_ + # honor faithfully) but since we're specifying the contexts count in the payload definition, we might not + # actually generate 10,000 unique contexts, but instead somewhere below 3,000, where each of them is repeated a + # few times to reach the total count. 
+ # + # We need to figure that out, since the intent is that specifying a fixed count should lead to that many metrics + # (and no more) being generated, such that you could depend on that for testing purposes. + size: 10000 + payload: + dogstatsd: + contexts: + constant: 3000 + name_length: + inclusive: + min: 1 + max: 32 + tag_length: + inclusive: + min: 3 + max: 16 + tags_per_msg: + inclusive: + min: 2 + max: 8 + value: + float_probability: 0.5 + range: + inclusive: + min: -9999999 + max: 9999999 + multivalue_count: + inclusive: + min: 2 + max: 32 + multivalue_pack_probability: 0.08 + kind_weights: + metric: 100 + event: 0 + service_check: 0 + # Weights based on analyzing internal Datadog usage data of metric type for metrics sent to the Agent over DogStatsD. + metric_weights: + count: 208 + gauge: 66 + timer: 0 + distribution: 72 + # We specifically _don't_ want to generate sets, because we can't assert their correctness once they've been + # aggregated: a gauge is generated for each aggregator flush that represents the unique number of values in a + # given set, but in general, gauges are meant to be last-write-wins, so unless the metric names/tags can + # indicate that they're for a set, we can't know that it's safe for us to _aggregate_ the gauge values, and with + # our default behavior of taking the latest gauge value... we end up with non-deterministic results. + set: 0 + histogram: 1