DataDog · rayz · Mar 20, 2026 · Mar 20, 2026 · Mar 20, 2026 · Mar 23, 2026
diff --git a/Makefile b/Makefile
@@ -521,7 +521,7 @@ test-all: test test-property test-docs test-miri test-loom
 
 .PHONY: test-correctness
 test-correctness: ## Runs the complete correctness suite
-test-correctness: test-correctness-dsd-plain test-correctness-dsd-origin-detection test-correctness-otlp-metrics test-correctness-otlp-traces test-correctness-otlp-traces-ottl-filtering test-correctness-otlp-traces-ottl-transform
+test-correctness: test-correctness-dsd-plain test-correctness-dsd-origin-detection test-correctness-dsd-tag-filterlist test-correctness-otlp-metrics test-correctness-otlp-traces test-correctness-otlp-traces-ottl-filtering test-correctness-otlp-traces-ottl-transform
 
 .PHONY: test-correctness-dsd-plain
 test-correctness-dsd-plain: build-ground-truth
@@ -535,6 +535,12 @@ test-correctness-dsd-origin-detection: ## Runs the 'dsd-origin-detection' correc
 	@echo "[*] Running 'dsd-origin-detection' correctness test case..."
 	@target/release/ground-truth $(shell pwd)/test/correctness/dsd-origin-detection/config.yaml
 
+.PHONY: test-correctness-dsd-tag-filterlist
+test-correctness-dsd-tag-filterlist: build-ground-truth
+test-correctness-dsd-tag-filterlist: ## Runs the 'dsd-tag-filterlist' correctness test case
+	@echo "[*] Running 'dsd-tag-filterlist' correctness test case..."
+	@target/release/ground-truth $(shell pwd)/test/correctness/dsd-tag-filterlist/config.yaml
+
 .PHONY: test-correctness-otlp-metrics
 test-correctness-otlp-metrics: build-ground-truth
 test-correctness-otlp-metrics: ## Runs the 'otlp-metrics' correctness test case

diff --git a/bin/correctness/ground-truth/src/analysis/metrics/types.rs b/bin/correctness/ground-truth/src/analysis/metrics/types.rs
@@ -107,16 +107,16 @@ impl NormalizedMetric {
     /// # Errors
     ///
     /// If the raw values are empty, or if the values cannot be normalized, an error is returned.
-    pub fn try_from_values(
-        context: MetricContext, mut raw_values: Vec<(u64, MetricValue)>,
+    fn try_from_normalized_values(
+        context: NormalizedMetricContext, mut raw_values: Vec<(u64, MetricValue)>,
     ) -> Result<Self, GenericError> {
         // We need to first sort the raw values by timestamp, to ensure we have proper ordering semantics.
         raw_values.sort_by(|a, b| a.0.cmp(&b.0));
         let value = try_normalize_values(&raw_values)
             .with_error_context(|| format!("Failed to normalize values for metric '{}'", context.name()))?;
 
         Ok(Self {
-            context: NormalizedMetricContext::from_stele_context(context),
+            context,
             raw_values,
             value,
         })
@@ -176,23 +176,24 @@ impl NormalizedMetrics {
             return Err(generic_error!("Cannot normalize an empty set of metrics."));
         }
 
-        // Aggregate metric values by (context, type), using a `BTreeMap` so that we can get sorted metrics for ~free.
+        // Aggregate metric values by normalized (context, type), using a `BTreeMap` so that metric ordering is stable
+        // even when the original intake payload emits the same tag set in a different order.
         //
         // We group by type because metrics with the same context but different types (e.g., Count vs Rate) cannot be
         // merged together during normalization.
         let mut aggregated_context_values = BTreeMap::new();
 
         for metric in metrics {
-            let context = metric.context();
+            let context = NormalizedMetricContext::from_stele_context(metric.context().clone());
             let metric_type = metric_value_type(metric.values());
-            let key = (context.clone(), metric_type);
+            let key = (context, metric_type);
             let context_values = aggregated_context_values.entry(key).or_insert_with(Vec::new);
             context_values.extend_from_slice(metric.values());
         }
 
         let metrics = aggregated_context_values
             .into_iter()
-            .map(|((context, _type), values)| NormalizedMetric::try_from_values(context, values))
+            .map(|((context, _type), values)| NormalizedMetric::try_from_normalized_values(context, values))
             .try_fold(Vec::new(), |mut metrics, maybe_metric| {
                 metrics.push(maybe_metric?);
                 Ok::<_, GenericError>(metrics)
@@ -316,3 +317,42 @@ fn try_normalize_values(raw_values: &[(u64, MetricValue)]) -> Result<MetricValue
 
     Ok(current_value)
 }
+
+#[cfg(test)]
+mod tests {
+    use serde_json::json;
+    use stele::Metric;
+
+    use super::NormalizedMetrics;
+
+    fn metric(name: &str, tags: &[&str], points: &[(u64, f64)]) -> Metric {
+        serde_json::from_value(json!({
+            "context": {
+                "name": name,
+                "tags": tags,
+            },
+            "values": points
+                .iter()
+                .map(|(ts, value)| json!([ts, {"mtype": "Count", "value": value}]))
+                .collect::<Vec<_>>(),
+        }))
+        .expect("metric should deserialize")
+    }
+
+    #[test]
+    fn normalizes_context_order_before_grouping_metrics() {
+        let metrics = vec![
+            metric("tagfilter.miss", &["pod:pod-a", "env:prod"], &[(10, 2.0)]),
+            metric("tagfilter.miss", &["env:prod", "pod:pod-a"], &[(20, 3.0)]),
+        ];
+
+        let normalized = NormalizedMetrics::try_from_stele_metrics(&metrics).expect("metrics should normalize");
+
+        assert_eq!(normalized.len(), 1);
+        assert_eq!(
+            normalized.metrics()[0].context().to_string(),
+            "tagfilter.miss[env:prod, pod:pod-a]"
+        );
+        assert_eq!(normalized.metrics()[0].raw_values().len(), 2);
+    }
+}
diff --git a/lib/saluki-components/src/transforms/tag_filterlist/mod.rs b/lib/saluki-components/src/transforms/tag_filterlist/mod.rs
@@ -17,7 +17,7 @@ use saluki_core::{
         transforms::{Transform, TransformBuilder, TransformContext},
         ComponentContext,
     },
-    data_model::event::EventType,
+    data_model::event::{metric::Metric, EventType},
     topology::OutputDefinition,
 };
 use saluki_error::GenericError;
@@ -215,7 +215,7 @@ fn apply_tag_filter(tags: &SharedTagSet, is_exclude: bool, names: &HashSet<Strin
 /// If the metric name is not present in `filters`, the metric is left unchanged.
 /// If filtering would not change any tags, the metric context is left untouched (zero allocations).
 #[inline]
-pub fn filter_metric_tags(metric: &mut saluki_core::data_model::event::metric::Metric, filters: &CompiledFilters) {
+pub fn filter_metric_tags(metric: &mut Metric, filters: &CompiledFilters) {
     let Some((is_exclude, tag_names)) = filters.get(metric.context().name().as_ref()) else {
         return;
     };

diff --git a/test/correctness/dsd-tag-filterlist/config.yaml b/test/correctness/dsd-tag-filterlist/config.yaml
@@ -0,0 +1,22 @@
+analysis_mode: metrics
+millstone:
+  image: saluki-images/millstone:latest
+  config_path: millstone.yaml
+datadog_intake:
+  image: saluki-images/datadog-intake:latest
+  config_path: ../datadog-intake.yaml
+baseline:
+  image: registry.datadoghq.com/agent:7.76.2-jmx
+  files:
+    - datadog.yaml:/etc/datadog-agent/datadog.yaml
+  additional_env_vars:
+    - DD_API_KEY=correctness-test
+comparison:
+  image: saluki-images/datadog-agent:testing-release
+  files:
+    - datadog.yaml:/etc/datadog-agent/datadog.yaml
+  additional_env_vars:
+    - DD_API_KEY=correctness-test
+    - DD_DATA_PLANE_ENABLED=true
+    - DD_DATA_PLANE_DOGSTATSD_ENABLED=true
+    - DD_AGGREGATE_CONTEXT_LIMIT=500000
diff --git a/test/correctness/dsd-tag-filterlist/datadog.yaml b/test/correctness/dsd-tag-filterlist/datadog.yaml
@@ -0,0 +1,38 @@
+# Using a fixed hostname is both required to avoid errors, and also ensures
+# consistent tags between the baseline Agent and the ADP comparison target.
+hostname: "correctness-testing"
+
+# Dummy API key.
+api_key: dummy-api-key-correctness-testing
+
+# We have to specifically configure the health port to use.
+health_port: 5555
+
+# Point ourselves at the datadog-intake service.
+dd_url: "http://datadog-intake:2049"
+
+# Turn off UDP and listen on a UDS socket instead.
+dogstatsd_port: 0
+dogstatsd_socket: /airlock/metrics.sock
+
+# Keep origin detection enabled to stay closer to real DogStatsD behavior.
+#
+# Note: the current corpus still primarily exercises instrumented-tag filtering;
+# enabling origin detection did not change the final comparable metric set in the
+# latest parity run.
+dogstatsd_origin_detection: true
+
+# Gauges can be processed out-of-order when multiple workers are used, while ADP
+# does not use multiple workers. Keep worker count at 1 to remove that
+# difference from this parity test.
+dogstatsd_workers_count: 1
+
+metric_tag_filterlist:
+  - metric_name: tagfilter.exclude
+    action: exclude
+    tags:
+      - pod
+  - metric_name: tagfilter.include
+    action: include
+    tags:
+      - env
diff --git a/test/correctness/dsd-tag-filterlist/millstone.yaml b/test/correctness/dsd-tag-filterlist/millstone.yaml
@@ -0,0 +1,56 @@
+seed: [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131]
+target: "unixgram:///airlock/metrics.sock"
+aggregation_bucket_width_secs: 10
+volume: 10000
+corpus:
+  size: 10000
+  payload:
+    dogstatsd:
+      contexts:
+        constant: 256
+      metric_names:
+        - "tagfilter.exclude"
+        - "tagfilter.include"
+        - "tagfilter.miss"
+      tag_names:
+        - "env"
+        - "pod"
+      tag_values:
+        - "prod"
+        - "staging"
+        - "pod-a"
+        - "pod-b"
+        - "pod-c"
+      name_length:
+        inclusive:
+          min: 1
+          max: 32
+      tag_length:
+        inclusive:
+          min: 3
+          max: 16
+      tags_per_msg:
+        constant: 2
+      unique_tag_ratio: 1.0
+      value:
+        float_probability: 0.5
+        range:
+          inclusive:
+            min: -9999999
+            max: 9999999
+      multivalue_count:
+        inclusive:
+          min: 2
+          max: 32
+      multivalue_pack_probability: 0.08
+      kind_weights:
+        metric: 100
+        event: 0
+        service_check: 0
+      metric_weights:
+        count: 0
+        gauge: 0
+        timer: 0
+        distribution: 1
+        set: 0
+        histogram: 0