-
Notifications
You must be signed in to change notification settings - Fork 1.4k
DADP-71 Add ADP point telemetry to Agent telemetry #50750
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
88c2f26
9097583
b95d0fe
2861fbe
14c6e6e
a846511
4537ed7
ad45fbb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,6 +9,7 @@ package telemetry | |
|
|
||
| import ( | ||
| "fmt" | ||
| "slices" | ||
| "strings" | ||
|
|
||
| dto "github.com/prometheus/client_model/go" | ||
|
|
@@ -25,8 +26,19 @@ const ( | |
| // CheckName is the name of the check | ||
| CheckName = "telemetry" | ||
| prefix = "datadog.agent." | ||
|
|
||
| domainLabel = "domain" | ||
| remoteAgentLabel = "remote_agent" | ||
|
|
||
| pointSentMetric = "point__sent" | ||
| pointDroppedMetric = "point__dropped" | ||
| ) | ||
|
|
||
| // regularRegistryMergeMetrics is the explicit allow-list of regular-registry metrics merged into default telemetry; regular registry telemetry is internal by default. | ||
| // Some remote agents, such as ADP, emit telemetry that overlaps with Core Agent default telemetry; only metrics listed | ||
| // here are folded into customer-facing datadog.agent.* telemetry. isMergedMetric tests membership and sendMergedMetrics emits in this slice's order. | ||
| var regularRegistryMergeMetrics = []string{pointSentMetric, pointDroppedMetric} | ||
|
|
||
| type checkImpl struct { | ||
| corechecks.CheckBase | ||
| telemetry telemetry.Component | ||
|
|
@@ -38,21 +50,177 @@ func (c *checkImpl) Run() error { | |
| return err | ||
| } | ||
|
|
||
| // Remote Agent Registry telemetry lives in the regular registry. Gather it on a best-effort basis so failures there | ||
| // do not prevent the customer-facing telemetry check from reporting Core Agent default telemetry values. | ||
| var regularMfs []*dto.MetricFamily | ||
| if gathered, err := c.telemetry.Gather(false); err != nil { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note that this will now call out to remote agents via RAR every 15 seconds (default interval on
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. But why is it important to change if it will be sent out only every 15m?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. COAT is emitted every 15m, but this PR also includes the telemetry in the |
||
| log.Warnf("failed to gather regular telemetry metrics for default telemetry merge: %v", err) | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If this fails it could get pretty noisy, as it would emit every 15 seconds. I'm wondering whether I should remove it or make it debug level.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would vote for debug level.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think if it is only every 15s it seems reasonable to me. The impact is that this telemetry is missing which is significant. |
||
| } else { | ||
| regularMfs = gathered | ||
| } | ||
|
|
||
| mergeLabelsByMetric := discoverMergeLabels(mfs, regularMfs) | ||
| mergedMetrics := collectMergeMetrics(mfs, false, mergeLabelsByMetric) | ||
| mergedMetrics.merge(collectMergeMetrics(regularMfs, true, mergeLabelsByMetric)) | ||
|
|
||
| sender, err := c.GetSender() | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
| sender.SetNoIndex(true) | ||
|
|
||
| c.sendMergedMetrics(mergedMetrics, sender) | ||
| c.handleMetricFamilies(mfs, sender) | ||
|
|
||
| return nil | ||
| } | ||
|
|
||
// mergeMetricSample is one aggregated data point: the tag set it was recorded
// under and the running sum of every value added for that tag set.
type mergeMetricSample struct {
	tags  []string
	value float64
}

// mergeMetricValues accumulates samples by metric name and then by tag-set key
// (see mergeKey). Values that share a metric name and an identical, identically
// ordered tag list are summed into a single sample.
type mergeMetricValues map[string]map[string]mergeMetricSample

// newMergeMetricValues returns an empty, ready-to-use accumulator.
func newMergeMetricValues() mergeMetricValues {
	return make(mergeMetricValues)
}

// mergeKey builds the map key for a tag list by joining the tags with a 0xff
// byte. 0xff never occurs in valid UTF-8, so the separator cannot be confused
// with tag content as long as the tags are valid UTF-8. The key is order
// sensitive: callers must pass tags in a stable order.
func mergeKey(tags []string) string {
	return strings.Join(tags, "\xff")
}

// add sums value into the sample stored for metricName and tags, creating the
// per-metric map and the sample on first use.
//
// The tag slice is copied the first time a key is seen, so later mutation of
// the caller's slice cannot alter the stored tags and desynchronize them from
// the key they are stored under.
func (m mergeMetricValues) add(metricName string, tags []string, value float64) {
	byKey := m[metricName]
	if byKey == nil {
		byKey = make(map[string]mergeMetricSample)
		m[metricName] = byKey
	}

	key := mergeKey(tags)
	sample, seen := byKey[key]
	if !seen {
		// Detach from the caller's backing array; the stored tags must stay
		// consistent with key for the lifetime of the map entry.
		sample.tags = slices.Clone(tags)
	}
	sample.value += value
	byKey[key] = sample
}

// merge folds every sample of other into m, summing values whose metric name
// and tag set already exist in m. other is not modified.
func (m mergeMetricValues) merge(other mergeMetricValues) {
	for metricName, otherByKey := range other {
		for _, sample := range otherByKey {
			m.add(metricName, sample.tags, sample.value)
		}
	}
}
|
|
||
| func labelValue(labels []*dto.LabelPair, name string) (string, bool) { | ||
| for _, label := range labels { | ||
| if label.GetName() == name { | ||
| return label.GetValue(), true | ||
| } | ||
| } | ||
| return "", false | ||
| } | ||
|
|
||
| func isMergedMetric(name string) bool { | ||
| return slices.Contains(regularRegistryMergeMetrics, name) | ||
| } | ||
|
|
||
| func mergeLabelNames(mfs []*dto.MetricFamily, metricName string) []string { | ||
| labelNames := make(map[string]struct{}) | ||
| for _, mf := range mfs { | ||
| if mf == nil || mf.GetName() != metricName { | ||
| continue | ||
| } | ||
| for _, metric := range mf.Metric { | ||
| if metric == nil { | ||
| continue | ||
| } | ||
| for _, label := range metric.Label { | ||
| name := label.GetName() | ||
| if name == "" || name == remoteAgentLabel { | ||
| continue | ||
| } | ||
| labelNames[name] = struct{}{} | ||
| } | ||
| } | ||
| } | ||
|
|
||
| names := make([]string, 0, len(labelNames)) | ||
| for name := range labelNames { | ||
| names = append(names, name) | ||
| } | ||
| slices.Sort(names) | ||
| return names | ||
| } | ||
|
|
||
| func discoverMergeLabels(defaultMfs, regularMfs []*dto.MetricFamily) map[string][]string { | ||
| labelsByMetric := make(map[string][]string, len(regularRegistryMergeMetrics)) | ||
| for _, metricName := range regularRegistryMergeMetrics { | ||
| labels := mergeLabelNames(defaultMfs, metricName) | ||
| if len(labels) == 0 { | ||
| // Prefer the default registry's label shape for compatibility. If it has no samples yet, fall back to the | ||
| // regular registry while still dropping remote_agent so customer-facing tags do not include attribution. | ||
| labels = mergeLabelNames(regularMfs, metricName) | ||
| } | ||
| labelsByMetric[metricName] = labels | ||
| } | ||
| return labelsByMetric | ||
| } | ||
|
|
||
| func mergeTags(labels []*dto.LabelPair, labelNames []string) []string { | ||
| tags := make([]string, 0, len(labelNames)) | ||
| for _, labelName := range labelNames { | ||
| value, _ := labelValue(labels, labelName) | ||
| tags = append(tags, fmt.Sprintf("%s:%s", labelName, value)) | ||
| } | ||
| return tags | ||
| } | ||
|
|
||
| func collectMergeMetrics(mfs []*dto.MetricFamily, requireRemoteAgent bool, labelsByMetric map[string][]string) mergeMetricValues { | ||
| values := newMergeMetricValues() | ||
|
|
||
| for _, mf := range mfs { | ||
| if mf == nil || mf.Name == nil || mf.Type == nil || !isMergedMetric(mf.GetName()) { | ||
| continue | ||
| } | ||
|
|
||
| if mf.GetType() != dto.MetricType_GAUGE { | ||
| log.Warnf("dropping telemetry merge metric %q with unsupported type %s", mf.GetName(), mf.GetType()) | ||
| continue | ||
| } | ||
|
|
||
| for _, metric := range mf.Metric { | ||
| if metric == nil || metric.Gauge == nil { | ||
| continue | ||
| } | ||
| if requireRemoteAgent { | ||
| if _, ok := labelValue(metric.Label, remoteAgentLabel); !ok { | ||
| continue | ||
| } | ||
| } | ||
| values.add(mf.GetName(), mergeTags(metric.Label, labelsByMetric[mf.GetName()]), metric.Gauge.GetValue()) | ||
| } | ||
| } | ||
|
|
||
| return values | ||
| } | ||
|
|
||
| func (c *checkImpl) sendMergedMetrics(values mergeMetricValues, sender sender.Sender) { | ||
| for _, metricName := range regularRegistryMergeMetrics { | ||
| for _, sample := range values[metricName] { | ||
| sender.Gauge(c.buildName(metricName), sample.value, "", sample.tags) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| func (c *checkImpl) handleMetricFamilies(mfs []*dto.MetricFamily, sender sender.Sender) { | ||
| for _, mf := range mfs { | ||
| if mf.Name == nil || mf.Type == nil || len(mf.Metric) == 0 { | ||
| // Merged metrics are emitted explicitly by sendMergedMetrics so overlapping regular-registry values can be included | ||
| // without changing customer-facing metric names or tags. | ||
| if mf == nil || mf.Name == nil || mf.Type == nil || len(mf.Metric) == 0 || isMergedMetric(mf.GetName()) { | ||
| continue | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
My understanding is these will change only the COAT version of the metrics to start including these two tags. Since COAT is internal, strict compatibility on the shape of these metrics is not required
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should not be a problem