From 67e474da68fb335ac0068d8fe073fe8fe82f29b1 Mon Sep 17 00:00:00 2001 From: Pierre Gimalac <23154723+pgimalac@users.noreply.github.com> Date: Fri, 10 Apr 2026 08:46:44 +0000 Subject: [PATCH 1/3] AIX: add live process monitoring Cherry-picked and adapted from https://github.com/DataDog/datadog-agent/pull/47041. Process collection logic previously in the DataDog/gopsutil fork has been vendored directly into pkg/process/procutil/process_aix.go, reading /proc/<pid>/psinfo without any dependency on DataDog/gopsutil. --- pkg/process/checks/process.go | 9 +- pkg/process/checks/process_rt.go | 9 +- pkg/process/checks/system_info.go | 2 +- pkg/process/checks/system_info_aix.go | 51 ++++++ pkg/process/procutil/process_aix.go | 214 +++++++++++++++++++++++ pkg/process/procutil/process_fallback.go | 2 +- 6 files changed, 281 insertions(+), 6 deletions(-) create mode 100644 pkg/process/checks/system_info_aix.go create mode 100644 pkg/process/procutil/process_aix.go diff --git a/pkg/process/checks/process.go b/pkg/process/checks/process.go index b8659e56c263..955b301a4347 100644 --- a/pkg/process/checks/process.go +++ b/pkg/process/checks/process.go @@ -495,6 +495,11 @@ func fmtProcesses( // Hide disallow-listed args if the Scrubber is enabled fp.Cmdline = scrubber.ScrubProcessCommand(fp) + var voluntaryCtxSwitches, involuntaryCtxSwitches uint64 + if fp.Stats.CtxSwitches != nil { + voluntaryCtxSwitches = uint64(fp.Stats.CtxSwitches.Voluntary) + involuntaryCtxSwitches = uint64(fp.Stats.CtxSwitches.Involuntary) + } proc := &model.Process{ Pid: fp.Pid, NsPid: fp.NsPid, @@ -506,8 +511,8 @@ func fmtProcesses( OpenFdCount: fp.Stats.OpenFdCount, State: model.ProcessState(model.ProcessState_value[fp.Stats.Status]), IoStat: formatIO(fp.Stats, lastProcs[fp.Pid].Stats.IOStat, now, lastRun), - VoluntaryCtxSwitches: uint64(fp.Stats.CtxSwitches.Voluntary), - InvoluntaryCtxSwitches: uint64(fp.Stats.CtxSwitches.Involuntary), + VoluntaryCtxSwitches: voluntaryCtxSwitches, + InvoluntaryCtxSwitches: 
involuntaryCtxSwitches, ContainerId: ctrByProc[int(fp.Pid)], ProcessContext: serviceExtractor.GetServiceContext(fp.Pid), // SERVICE DISCOVERY FIELDS diff --git a/pkg/process/checks/process_rt.go b/pkg/process/checks/process_rt.go index 113300fa6597..d8969e2d0224 100644 --- a/pkg/process/checks/process_rt.go +++ b/pkg/process/checks/process_rt.go @@ -124,6 +124,11 @@ func fmtProcessStats( ioStat = formatIO(fp, lastProcs[pid].IOStat, now, lastRun) } + var voluntaryCtxSwitches, involuntaryCtxSwitches uint64 + if fp.CtxSwitches != nil { + voluntaryCtxSwitches = uint64(fp.CtxSwitches.Voluntary) + involuntaryCtxSwitches = uint64(fp.CtxSwitches.Involuntary) + } stat := &model.ProcessStat{ Pid: pid, CreateTime: fp.CreateTime, @@ -134,8 +139,8 @@ func fmtProcessStats( OpenFdCount: fp.OpenFdCount, ProcessState: model.ProcessState(model.ProcessState_value[fp.Status]), IoStat: ioStat, - VoluntaryCtxSwitches: uint64(fp.CtxSwitches.Voluntary), - InvoluntaryCtxSwitches: uint64(fp.CtxSwitches.Involuntary), + VoluntaryCtxSwitches: voluntaryCtxSwitches, + InvoluntaryCtxSwitches: involuntaryCtxSwitches, ContainerId: pidToCid[int(pid)], } diff --git a/pkg/process/checks/system_info.go b/pkg/process/checks/system_info.go index f4a3c6d74b22..dbd6f64046b1 100644 --- a/pkg/process/checks/system_info.go +++ b/pkg/process/checks/system_info.go @@ -3,7 +3,7 @@ // This product includes software developed at Datadog (https://www.datadoghq.com/). // Copyright 2016-present Datadog, Inc. -//go:build !windows && !darwin +//go:build !windows && !darwin && !aix package checks diff --git a/pkg/process/checks/system_info_aix.go b/pkg/process/checks/system_info_aix.go new file mode 100644 index 000000000000..93ee22744d90 --- /dev/null +++ b/pkg/process/checks/system_info_aix.go @@ -0,0 +1,51 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). 
+// Copyright 2016-present Datadog, Inc. + +//go:build aix + +package checks + +import ( + model "github.com/DataDog/agent-payload/v5/process" + "github.com/shirou/gopsutil/v4/cpu" + "github.com/shirou/gopsutil/v4/host" + "github.com/shirou/gopsutil/v4/mem" +) + +// CollectSystemInfo collects a set of system-level information that will not +// change until a restart. On AIX, fields that cannot be retrieved are left empty. +func CollectSystemInfo() (*model.SystemInfo, error) { + hi, err := host.Info() + if err != nil || hi == nil { + hi = &host.InfoStat{} + } + + cpuInfo, _ := cpu.Info() + + mi, err := mem.VirtualMemory() + if err != nil || mi == nil { + mi = &mem.VirtualMemoryStat{} + } + + cpus := make([]*model.CPUInfo, 0, len(cpuInfo)) + for _, c := range cpuInfo { + cpus = append(cpus, &model.CPUInfo{ + Cores: c.Cores, + }) + } + + return &model.SystemInfo{ + Uuid: hi.HostID, + Os: &model.OSInfo{ + Name: hi.OS, + Platform: hi.Platform, + Family: hi.PlatformFamily, + Version: hi.PlatformVersion, + KernelVersion: hi.KernelVersion, + }, + Cpus: cpus, + TotalMemory: int64(mi.Total), + }, nil +} diff --git a/pkg/process/procutil/process_aix.go b/pkg/process/procutil/process_aix.go new file mode 100644 index 000000000000..741de63e8dd8 --- /dev/null +++ b/pkg/process/procutil/process_aix.go @@ -0,0 +1,214 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. + +//go:build aix + +package procutil + +// Process collection for AIX via /proc/<pid>/psinfo. +// +// AIX exposes a binary psinfo_t structure at /proc/<pid>/psinfo (world-readable, +// 448 bytes, big-endian). This file reads that structure directly without +// shelling out or requiring elevated privileges. +// +// The struct layout is taken from /usr/include/sys/procfs.h on AIX 7.x ppc64. 
+ +import ( + "encoding/binary" + "errors" + "fmt" + "os" + "strconv" + "strings" + "time" +) + +// prTimestruc64 mirrors AIX timestruc64_t: { tv_sec int64; tv_nsec int32; _pad uint32 } +type prTimestruc64 struct { + Sec int64 + Nsec int32 + _ uint32 +} + +// lwpSinfo mirrors the representative LWP entry inside psinfo_t (120 bytes). +type lwpSinfo struct { + LwpID uint64 + Addr uint64 + Wchan uint64 + Flag uint32 + Wtype uint8 + State int8 + Sname byte // process state character: 'O','R','S','Z', etc. + Nice uint8 + Pri int32 + Policy uint32 + Clname [8]byte + Onpro int32 + Bindpro int32 + Ptid uint32 + _ uint32 + _ [7]uint64 +} + +// psinfo mirrors AIX psinfo_t from /usr/include/sys/procfs.h (448 bytes, big-endian ppc64). +type psinfo struct { + Flag uint32 + Flag2 uint32 + Nlwp uint32 // number of threads + _ uint32 + Uid uint64 + Euid uint64 + Gid uint64 + Egid uint64 + Pid uint64 + Ppid uint64 + Pgid uint64 + Sid uint64 + Ttydev uint64 + Addr uint64 + Size uint64 // virtual memory size in kilobytes + Rssize uint64 // resident set size in kilobytes + Start prTimestruc64 // process start time + Time prTimestruc64 // combined user+system CPU time + Cid uint16 + _ uint16 + Argc uint32 + Argv uint64 + Envp uint64 + Fname [16]byte // executable name, null-terminated + Psargs [80]byte // process args, space-separated, null-terminated + _ [8]uint64 + Lwp lwpSinfo // representative LWP +} + +func readPsinfo(pid int32) (*psinfo, error) { + f, err := os.Open(fmt.Sprintf("/proc/%d/psinfo", pid)) + if err != nil { + return nil, err + } + defer f.Close() + + var psi psinfo + if err := binary.Read(f, binary.BigEndian, &psi); err != nil { + return nil, err + } + return &psi, nil +} + +func nullTermBytes(b []byte) string { + for i, c := range b { + if c == 0 { + return string(b[:i]) + } + } + return string(b) +} + +func listPIDs() ([]int32, error) { + dir, err := os.Open("/proc") + if err != nil { + return nil, err + } + defer dir.Close() + + names, err := dir.Readdirnames(-1) + if err 
!= nil { + return nil, err + } + + pids := make([]int32, 0, len(names)) + for _, name := range names { + pid, err := strconv.ParseInt(name, 10, 32) + if err != nil { + continue // skip non-numeric entries (e.g. "net", "sys") + } + pids = append(pids, int32(pid)) + } + return pids, nil +} + +func psinfoToProcess(psi *psinfo, pid int32, pageSize int) *Process { + name := nullTermBytes(psi.Fname[:]) + args := nullTermBytes(psi.Psargs[:]) + + var cmdline []string + if args != "" { + cmdline = strings.Fields(args) + } + + cpuSecs := float64(psi.Time.Sec) + float64(psi.Time.Nsec)/1e9 + + return &Process{ + Pid: pid, + Ppid: int32(psi.Ppid), + Name: name, + Cmdline: cmdline, + Uids: []int32{int32(psi.Uid), int32(psi.Euid), int32(psi.Uid), int32(psi.Euid)}, + Gids: []int32{int32(psi.Gid), int32(psi.Egid), int32(psi.Gid), int32(psi.Egid)}, + Stats: &Stats{ + CreateTime: psi.Start.Sec * 1000, // milliseconds since epoch + Status: string([]byte{psi.Lwp.Sname}), + Nice: int32(int8(psi.Lwp.Nice)), + NumThreads: int32(psi.Nlwp), + CPUTime: &CPUTimesStat{ + // psinfo only provides combined user+sys time; no per-category split. + User: cpuSecs, + }, + MemInfo: &MemoryInfoStat{ + RSS: uint64(psi.Rssize) * uint64(pageSize), + VMS: uint64(psi.Size) * uint64(pageSize), + }, + // IOStat, MemInfoEx, CtxSwitches are not available without root on AIX. + }, + } +} + +// NewProcessProbe returns a Probe for AIX. 
+func NewProcessProbe(options ...Option) Probe { + p := &probe{} + for _, option := range options { + option(p) + } + return p +} + +type probe struct{} + +func (p *probe) Close() {} + +func (p *probe) ProcessesByPID(_ time.Time, _ bool) (map[int32]*Process, error) { + pids, err := listPIDs() + if err != nil { + return nil, fmt.Errorf("aix ProcessesByPID: could not list pids: %w", err) + } + + pageSize := os.Getpagesize() + result := make(map[int32]*Process, len(pids)) + for _, pid := range pids { + psi, err := readPsinfo(pid) + if err != nil { + // Process may have exited between listPIDs and now; skip it. + continue + } + result[pid] = psinfoToProcess(psi, pid, pageSize) + } + return result, nil +} + +func (p *probe) StatsForPIDs(_ []int32, _ time.Time) (map[int32]*Stats, error) { + procs, err := p.ProcessesByPID(time.Now(), false) + if err != nil { + return nil, err + } + stats := make(map[int32]*Stats, len(procs)) + for pid, proc := range procs { + stats[pid] = proc.Stats + } + return stats, nil +} + +func (p *probe) StatsWithPermByPID(_ []int32) (map[int32]*StatsWithPerm, error) { + return nil, errors.New("StatsWithPermByPID is not implemented on AIX") +} diff --git a/pkg/process/procutil/process_fallback.go b/pkg/process/procutil/process_fallback.go index 56511232cd34..eaf943750d83 100644 --- a/pkg/process/procutil/process_fallback.go +++ b/pkg/process/procutil/process_fallback.go @@ -3,7 +3,7 @@ // This product includes software developed at Datadog (https://www.datadoghq.com/). // Copyright 2016-present Datadog, Inc. -//go:build !linux && !windows && !darwin +//go:build !linux && !windows && !darwin && !aix package procutil From 9a299b9f1164b4b4fe2b1c01acc6399defba17b1 Mon Sep 17 00:00:00 2001 From: Pierre Gimalac <23154723+pgimalac@users.noreply.github.com> Date: Tue, 14 Apr 2026 12:16:39 +0000 Subject: [PATCH 2/3] fix(aix): use correct KB multiplier for RSS/VMS in psinfo pr_size and pr_rssize in AIX psinfo_t are in kilobytes, not pages. 
Using os.Getpagesize() (4096 on ppc64) inflated memory values by 4x. --- pkg/process/procutil/process_aix.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pkg/process/procutil/process_aix.go b/pkg/process/procutil/process_aix.go index 741de63e8dd8..302cb4c8c9ee 100644 --- a/pkg/process/procutil/process_aix.go +++ b/pkg/process/procutil/process_aix.go @@ -129,7 +129,7 @@ func listPIDs() ([]int32, error) { return pids, nil } -func psinfoToProcess(psi *psinfo, pid int32, pageSize int) *Process { +func psinfoToProcess(psi *psinfo, pid int32) *Process { name := nullTermBytes(psi.Fname[:]) args := nullTermBytes(psi.Psargs[:]) @@ -157,8 +157,9 @@ func psinfoToProcess(psi *psinfo, pid int32, pageSize int) *Process { User: cpuSecs, }, MemInfo: &MemoryInfoStat{ - RSS: uint64(psi.Rssize) * uint64(pageSize), - VMS: uint64(psi.Size) * uint64(pageSize), + // pr_size and pr_rssize are in kilobytes on AIX. + RSS: psi.Rssize * 1024, + VMS: psi.Size * 1024, }, // IOStat, MemInfoEx, CtxSwitches are not available without root on AIX. }, @@ -184,7 +185,6 @@ func (p *probe) ProcessesByPID(_ time.Time, _ bool) (map[int32]*Process, error) return nil, fmt.Errorf("aix ProcessesByPID: could not list pids: %w", err) } - pageSize := os.Getpagesize() result := make(map[int32]*Process, len(pids)) for _, pid := range pids { psi, err := readPsinfo(pid) @@ -192,7 +192,7 @@ func (p *probe) ProcessesByPID(_ time.Time, _ bool) (map[int32]*Process, error) // Process may have exited between listPIDs and now; skip it. 
continue } - result[pid] = psinfoToProcess(psi, pid, pageSize) + result[pid] = psinfoToProcess(psi, pid) } return result, nil } From 7b9bb45efe1a7adacf733c659296e731aaa4b89e Mon Sep 17 00:00:00 2001 From: Pierre Gimalac <23154723+pgimalac@users.noreply.github.com> Date: Wed, 15 Apr 2026 08:25:48 +0000 Subject: [PATCH 3/3] chore: add missing component logic --- comp/process/agent/agent_aix.go | 54 ++++++++++++++++++++++++++++ comp/process/agent/agent_fallback.go | 2 +- 2 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 comp/process/agent/agent_aix.go diff --git a/comp/process/agent/agent_aix.go b/comp/process/agent/agent_aix.go new file mode 100644 index 000000000000..236316788ef4 --- /dev/null +++ b/comp/process/agent/agent_aix.go @@ -0,0 +1,54 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2024-present Datadog, Inc. + +//go:build aix + +package agent + +import ( + "sync" + + "github.com/DataDog/datadog-agent/comp/core/config" + log "github.com/DataDog/datadog-agent/comp/core/log/def" + "github.com/DataDog/datadog-agent/comp/process/types" + "github.com/DataDog/datadog-agent/pkg/config/setup" + "github.com/DataDog/datadog-agent/pkg/util/flavor" +) + +var ( + // enabled variable to ensure value returned by Enabled() persists when Enabled() is called multiple times + enabled bool + // Once module variable, exported for testing + Once sync.Once +) + +func enabledHelper(config config.Component, _ []types.CheckComponent, l log.Component) bool { + // never run the process component in the cluster worker + if setup.IsCLCRunner(config) { + return false + } + + switch flavor.GetFlavor() { + case flavor.ProcessAgent: + // Process checks always run in the core agent on AIX. + // There is no NPM or other standalone-only check on AIX. 
+ l.Info("The process checks will run in the core agent via the process-component") + return false + case flavor.DefaultAgent: + return true + default: + return false + } +} + +// Enabled determines whether the process agent component is enabled. +// Enabled will only be run once, to prevent duplicate logging. +// On AIX, process checks always run in the core agent. +func Enabled(config config.Component, checkComponents []types.CheckComponent, l log.Component) bool { + Once.Do(func() { + enabled = enabledHelper(config, checkComponents, l) + }) + return enabled +} diff --git a/comp/process/agent/agent_fallback.go b/comp/process/agent/agent_fallback.go index 43daf32d7ae3..69d127f19812 100644 --- a/comp/process/agent/agent_fallback.go +++ b/comp/process/agent/agent_fallback.go @@ -3,7 +3,7 @@ // This product includes software developed at Datadog (https://www.datadoghq.com/). // Copyright 2024-present Datadog, Inc. -//go:build !linux +//go:build !linux && !aix package agent