From 182a9c5834474dcd4db8fdb31959b4a5e1220dbd Mon Sep 17 00:00:00 2001 From: kongfei605 Date: Thu, 11 Jun 2026 16:07:56 +0800 Subject: [PATCH 1/8] chore(inputs): update dashboard and doc for inputs --- inputs/activemq/README.md | 14 +- inputs/activemq/README_CN.md | 3 + inputs/activemq/dashboard.json | 142 +++ inputs/aliyun/README.md | 10 +- inputs/amd_rocm_smi/README_CN.md | 53 + inputs/amd_rocm_smi/dashboard.json | 160 +++ inputs/apache/README.md | 48 +- inputs/apache/README_CN.md | 41 + inputs/apache/dashboard.json | 106 ++ inputs/appdynamics/README.md | 39 + inputs/appdynamics/README_CN.md | 39 + inputs/appdynamics/dashboard.json | 88 ++ inputs/arp_packet/README.md | 57 +- inputs/arp_packet/README_CN.md | 47 + inputs/arp_packet/dashboard.json | 52 + inputs/bind/README.md | 34 +- inputs/bind/README_CN.md | 39 + inputs/bind/dashboard.json | 124 ++ inputs/bitbucket/README.md | 14 +- inputs/bitbucket/README_CN.md | 3 + inputs/bitbucket/dashboard.json | 75 ++ inputs/cadvisor/README.md | 83 +- inputs/cadvisor/README_CN.md | 102 ++ inputs/cadvisor/dashboard.json | 88 ++ inputs/cassandra/README.md | 15 +- inputs/cassandra/README_CN.md | 3 + inputs/cassandra/dashboard.json | 93 ++ inputs/clickhouse/README.md | 6 +- inputs/clickhouse/README_CN.md | 62 + inputs/clickhouse/dashboard.json | 88 ++ inputs/cloudwatch/README_CN.md | 83 ++ inputs/cloudwatch/dashboard.json | 88 ++ inputs/conntrack/README.md | 24 +- inputs/conntrack/README_CN.md | 20 + inputs/conntrack/dashboard.json | 52 + inputs/consul/README_CN.md | 52 + inputs/consul/dashboard.json | 106 ++ inputs/cpu/README.md | 29 +- inputs/cpu/README_CN.md | 28 + inputs/cpu/dashboard.json | 88 ++ inputs/dcgm/README.md | 51 + inputs/dcgm/README_CN.md | 51 + inputs/dcgm/dashboard.json | 106 ++ inputs/disk/README.md | 33 +- inputs/disk/README_CN.md | 32 + inputs/disk/dashboard.json | 88 ++ inputs/diskio/README.md | 22 +- inputs/diskio/README_CN.md | 21 + inputs/diskio/dashboard.json | 88 ++ inputs/dns_query/README.md | 178 +-- 
inputs/dns_query/README_CN.md | 48 + inputs/dns_query/dashboard.json | 52 + inputs/docker/README.md | 64 +- inputs/docker/README_CN.md | 69 + inputs/docker/dashboard.json | 88 ++ inputs/elasticsearch/README.md | 19 +- inputs/exec/README.md | 149 +-- inputs/exec/README_CN.md | 62 + inputs/exec/dashboard.json | 28 + inputs/filecount/README.md | 2 +- inputs/filecount/README_CN.md | 48 + inputs/filecount/dashboard.json | 52 + inputs/gnmi/README_CN.md | 93 ++ inputs/gnmi/dashboard.json | 28 + inputs/googlecloud/README.md | 83 +- inputs/googlecloud/README_CN.md | 55 + inputs/googlecloud/dashboard.json | 28 + inputs/greenplum/README.md | 32 + inputs/greenplum/README_CN.md | 32 + inputs/greenplum/dashboard.json | 52 + inputs/hadoop_hdfs/README.md | 19 +- inputs/hadoop_hdfs/README_CN.md | 18 + inputs/hadoop_hdfs/dashboard.json | 28 + inputs/http_response/README.md | 17 + inputs/huatuo/dashboard.json | 28 + inputs/ipvs/README.md | 116 +- inputs/ipvs/README_CN.md | 55 + inputs/ipvs/dashboard.json | 88 ++ inputs/jboss/README.md | 20 +- inputs/jboss/README_CN.md | 19 + inputs/jboss/dashboard.json | 28 + inputs/jenkins/README.md | 61 + inputs/jenkins/README_CN.md | 61 + inputs/jenkins/dashboard.json | 124 ++ inputs/jolokia/README.md | 10 + inputs/jolokia/README_CN.md | 10 + inputs/jolokia/dashboard.json | 28 + inputs/jolokia_agent/README.md | 52 + inputs/jolokia_agent/README_CN.md | 52 + inputs/jolokia_agent/dashboard.json | 28 + inputs/jolokia_proxy/README.md | 46 + inputs/jolokia_proxy/README_CN.md | 46 + inputs/jolokia_proxy/dashboard.json | 28 + inputs/kafka/README.md | 4 +- inputs/kafka_connect/README.md | 20 +- inputs/kafka_connect/README_CN.md | 19 + inputs/kafka_connect/dashboard.json | 28 + inputs/kernel/README.md | 28 +- inputs/kernel/README_CN.md | 27 + inputs/kernel/dashboard.json | 88 ++ inputs/kernel_vmstat/README.md | 138 +- inputs/kernel_vmstat/README_CN.md | 126 ++ inputs/kernel_vmstat/dashboard.json | 57 + inputs/kube_proxy/README.md | 23 + 
inputs/kube_proxy/README_CN.md | 23 + inputs/kube_proxy/dashboard.json | 610 +++++++++ inputs/kubelet/README.md | 24 + inputs/kubelet/README_CN.md | 24 + inputs/kubelet/dashboard.json | 444 +++++++ inputs/kubernetes/README.md | 2 +- inputs/ldap/README.md | 145 +-- inputs/ldap/README_CN.md | 53 + inputs/ldap/dashboard.json | 106 ++ inputs/linux_sysctl_fs/README.md | 33 +- inputs/linux_sysctl_fs/README_CN.md | 32 + inputs/linux_sysctl_fs/dashboard.json | 103 ++ inputs/mem/README.md | 33 +- inputs/mem/README_CN.md | 32 + inputs/mem/dashboard.json | 93 ++ inputs/mongodb/README.md | 25 +- inputs/mtail/README_CN.md | 283 +++++ inputs/mtail/Readme.md | 248 ++-- inputs/mtail/dashboard.json | 34 + inputs/mysql/README.md | 730 +++++++++-- inputs/nats/README.md | 37 + inputs/nats/README_CN.md | 37 + inputs/nats/dashboard.json | 116 ++ inputs/net/README.md | 31 +- inputs/net/README_CN.md | 32 + inputs/net/dashboard.json | 108 ++ inputs/netstat/README.md | 37 +- inputs/netstat/README_CN.md | 36 + inputs/netstat/dashboard.json | 88 ++ inputs/netstat_filter/README.md | 81 +- inputs/netstat_filter/README_CN.md | 50 + inputs/netstat_filter/dashboard.json | 88 ++ inputs/nfsclient/README.md | 48 + inputs/nfsclient/README_CN.md | 48 + inputs/nfsclient/dashboard.json | 93 ++ inputs/nginx/README.md | 14 +- inputs/node_exporter/README.md | 36 + inputs/node_exporter/README_CN.md | 36 + inputs/node_exporter/dashboard.json | 98 ++ inputs/nsq/README.md | 98 +- inputs/nsq/README_CN.md | 54 + inputs/nsq/dashboard.json | 93 ++ inputs/nvidia_smi/README.md | 46 +- inputs/nvidia_smi/README_CN.md | 43 + inputs/nvidia_smi/dashboard.json | 88 ++ inputs/oracle/README.md | 2 - inputs/phpfpm/README.md | 63 +- inputs/phpfpm/README_CN.md | 57 + inputs/phpfpm/dashboard.json | 98 ++ inputs/ping/README.md | 25 - inputs/processes/README.md | 38 +- inputs/processes/README_CN.md | 37 + inputs/processes/dashboard.json | 93 ++ inputs/prometheus/README.md | 71 +- inputs/prometheus/README_CN.md | 44 + 
inputs/prometheus/dashboard.json | 34 + inputs/rabbitmq/README.md | 2 +- inputs/redfish/README.md | 58 + inputs/redfish/README_CN.md | 58 + inputs/redfish/dashboard.json | 52 + inputs/redis/README.md | 120 ++ inputs/redis/alerts.json | 2 +- inputs/redis_sentinel/README.md | 42 +- inputs/redis_sentinel/README_CN.md | 41 + inputs/redis_sentinel/dashboard.json | 88 ++ inputs/redis_sentinel/redis_sentinel.go | 2 +- inputs/rocketmq_offset/README.md | 37 + inputs/rocketmq_offset/README_CN.md | 37 + inputs/rocketmq_offset/dashboard.json | 70 ++ inputs/self_metrics/README.md | 36 + inputs/self_metrics/README_CN.md | 36 + inputs/self_metrics/dashboard.json | 88 ++ inputs/smart/README.md | 315 +---- inputs/smart/README_CN.md | 60 + inputs/smart/dashboard.json | 88 ++ inputs/snmp/README.md | 55 +- inputs/snmp/README_CN.md | 76 ++ inputs/snmp/dashboard.json | 88 ++ inputs/snmp_trap/README.md | 10 +- inputs/snmp_trap/README_CN.md | 72 ++ inputs/snmp_trap/dashboard.json | 70 ++ inputs/snmp_zabbix/README.md | 1523 +---------------------- inputs/snmp_zabbix/README_CN.md | 1514 ++++++++++++++++++++++ inputs/snmp_zabbix/dashboard.json | 70 ++ inputs/sockstat/README.md | 65 +- inputs/sockstat/README_CN.md | 42 + inputs/sockstat/dashboard.json | 108 ++ inputs/sqlserver/README.md | 4 +- inputs/supervisor/README.md | 157 +-- inputs/supervisor/README_CN.md | 127 ++ inputs/supervisor/dashboard.json | 52 + inputs/systemd/README.md | 57 +- inputs/systemd/README_CN.md | 54 + inputs/systemd/dashboard.json | 70 ++ inputs/tengine/README.md | 128 +- inputs/tengine/README_CN.md | 61 + inputs/tengine/dashboard.json | 98 ++ inputs/tpl/README.md | 14 +- inputs/tpl/README_CN.md | 13 + inputs/tpl/dashboard.json | 34 + inputs/vsphere/README.md | 63 + inputs/vsphere/README_CN.md | 63 + inputs/vsphere/dashboard.json | 1116 +++++++++++++++++ inputs/weblogic/README.md | 16 +- inputs/weblogic/README_CN.md | 15 + inputs/weblogic/dashboard.json | 52 + inputs/whois/README.md | 39 +- inputs/whois/README_CN.md | 
25 + inputs/whois/dashboard.json | 34 + inputs/xskyapi/README.md | 43 + inputs/xskyapi/README_CN.md | 43 + inputs/xskyapi/dashboard.json | 70 ++ 216 files changed, 14910 insertions(+), 3230 deletions(-) create mode 100644 inputs/activemq/README_CN.md create mode 100644 inputs/activemq/dashboard.json create mode 100644 inputs/amd_rocm_smi/README_CN.md create mode 100644 inputs/amd_rocm_smi/dashboard.json create mode 100644 inputs/apache/README_CN.md create mode 100644 inputs/apache/dashboard.json create mode 100644 inputs/appdynamics/README.md create mode 100644 inputs/appdynamics/README_CN.md create mode 100644 inputs/appdynamics/dashboard.json create mode 100644 inputs/arp_packet/README_CN.md create mode 100644 inputs/arp_packet/dashboard.json create mode 100644 inputs/bind/README_CN.md create mode 100644 inputs/bind/dashboard.json create mode 100644 inputs/bitbucket/README_CN.md create mode 100644 inputs/bitbucket/dashboard.json create mode 100644 inputs/cadvisor/README_CN.md create mode 100644 inputs/cadvisor/dashboard.json create mode 100644 inputs/cassandra/README_CN.md create mode 100644 inputs/cassandra/dashboard.json create mode 100644 inputs/clickhouse/README_CN.md create mode 100644 inputs/clickhouse/dashboard.json create mode 100644 inputs/cloudwatch/README_CN.md create mode 100644 inputs/cloudwatch/dashboard.json create mode 100644 inputs/conntrack/README_CN.md create mode 100644 inputs/conntrack/dashboard.json create mode 100644 inputs/consul/README_CN.md create mode 100644 inputs/consul/dashboard.json create mode 100644 inputs/cpu/README_CN.md create mode 100644 inputs/cpu/dashboard.json create mode 100644 inputs/dcgm/README.md create mode 100644 inputs/dcgm/README_CN.md create mode 100644 inputs/dcgm/dashboard.json create mode 100644 inputs/disk/README_CN.md create mode 100644 inputs/disk/dashboard.json create mode 100644 inputs/diskio/README_CN.md create mode 100644 inputs/diskio/dashboard.json create mode 100644 inputs/dns_query/README_CN.md create 
mode 100644 inputs/dns_query/dashboard.json create mode 100644 inputs/docker/README_CN.md create mode 100644 inputs/docker/dashboard.json create mode 100644 inputs/exec/README_CN.md create mode 100644 inputs/exec/dashboard.json create mode 100644 inputs/filecount/README_CN.md create mode 100644 inputs/filecount/dashboard.json create mode 100644 inputs/gnmi/README_CN.md create mode 100644 inputs/gnmi/dashboard.json create mode 100644 inputs/googlecloud/README_CN.md create mode 100644 inputs/googlecloud/dashboard.json create mode 100644 inputs/greenplum/README.md create mode 100644 inputs/greenplum/README_CN.md create mode 100644 inputs/greenplum/dashboard.json create mode 100644 inputs/hadoop_hdfs/README_CN.md create mode 100644 inputs/hadoop_hdfs/dashboard.json create mode 100644 inputs/huatuo/dashboard.json create mode 100644 inputs/ipvs/README_CN.md create mode 100644 inputs/ipvs/dashboard.json create mode 100644 inputs/jboss/README_CN.md create mode 100644 inputs/jboss/dashboard.json create mode 100644 inputs/jenkins/README.md create mode 100644 inputs/jenkins/README_CN.md create mode 100644 inputs/jenkins/dashboard.json create mode 100644 inputs/jolokia/README.md create mode 100644 inputs/jolokia/README_CN.md create mode 100644 inputs/jolokia/dashboard.json create mode 100644 inputs/jolokia_agent/README.md create mode 100644 inputs/jolokia_agent/README_CN.md create mode 100644 inputs/jolokia_agent/dashboard.json create mode 100644 inputs/jolokia_proxy/README.md create mode 100644 inputs/jolokia_proxy/README_CN.md create mode 100644 inputs/jolokia_proxy/dashboard.json create mode 100644 inputs/kafka_connect/README_CN.md create mode 100644 inputs/kafka_connect/dashboard.json create mode 100644 inputs/kernel/README_CN.md create mode 100644 inputs/kernel/dashboard.json create mode 100644 inputs/kernel_vmstat/README_CN.md create mode 100644 inputs/kernel_vmstat/dashboard.json create mode 100644 inputs/kube_proxy/README.md create mode 100644 
inputs/kube_proxy/README_CN.md create mode 100644 inputs/kube_proxy/dashboard.json create mode 100644 inputs/kubelet/README.md create mode 100644 inputs/kubelet/README_CN.md create mode 100644 inputs/kubelet/dashboard.json create mode 100644 inputs/ldap/README_CN.md create mode 100644 inputs/ldap/dashboard.json create mode 100644 inputs/linux_sysctl_fs/README_CN.md create mode 100644 inputs/linux_sysctl_fs/dashboard.json create mode 100644 inputs/mem/README_CN.md create mode 100644 inputs/mem/dashboard.json create mode 100644 inputs/mtail/README_CN.md create mode 100644 inputs/mtail/dashboard.json create mode 100644 inputs/nats/README.md create mode 100644 inputs/nats/README_CN.md create mode 100644 inputs/nats/dashboard.json create mode 100644 inputs/net/README_CN.md create mode 100644 inputs/net/dashboard.json create mode 100644 inputs/netstat/README_CN.md create mode 100644 inputs/netstat/dashboard.json create mode 100644 inputs/netstat_filter/README_CN.md create mode 100644 inputs/netstat_filter/dashboard.json create mode 100644 inputs/nfsclient/README.md create mode 100644 inputs/nfsclient/README_CN.md create mode 100644 inputs/nfsclient/dashboard.json create mode 100644 inputs/node_exporter/README.md create mode 100644 inputs/node_exporter/README_CN.md create mode 100644 inputs/node_exporter/dashboard.json create mode 100644 inputs/nsq/README_CN.md create mode 100644 inputs/nsq/dashboard.json create mode 100644 inputs/nvidia_smi/README_CN.md create mode 100644 inputs/nvidia_smi/dashboard.json create mode 100644 inputs/phpfpm/README_CN.md create mode 100644 inputs/phpfpm/dashboard.json create mode 100644 inputs/processes/README_CN.md create mode 100644 inputs/processes/dashboard.json create mode 100644 inputs/prometheus/README_CN.md create mode 100644 inputs/prometheus/dashboard.json create mode 100644 inputs/redfish/README.md create mode 100644 inputs/redfish/README_CN.md create mode 100644 inputs/redfish/dashboard.json create mode 100644 
inputs/redis_sentinel/README_CN.md create mode 100644 inputs/redis_sentinel/dashboard.json create mode 100644 inputs/rocketmq_offset/README.md create mode 100644 inputs/rocketmq_offset/README_CN.md create mode 100644 inputs/rocketmq_offset/dashboard.json create mode 100644 inputs/self_metrics/README.md create mode 100644 inputs/self_metrics/README_CN.md create mode 100644 inputs/self_metrics/dashboard.json create mode 100644 inputs/smart/README_CN.md create mode 100644 inputs/smart/dashboard.json create mode 100644 inputs/snmp/README_CN.md create mode 100644 inputs/snmp/dashboard.json create mode 100644 inputs/snmp_trap/README_CN.md create mode 100644 inputs/snmp_trap/dashboard.json create mode 100644 inputs/snmp_zabbix/README_CN.md create mode 100644 inputs/snmp_zabbix/dashboard.json create mode 100644 inputs/sockstat/README_CN.md create mode 100644 inputs/sockstat/dashboard.json create mode 100644 inputs/supervisor/README_CN.md create mode 100644 inputs/supervisor/dashboard.json create mode 100644 inputs/systemd/README_CN.md create mode 100644 inputs/systemd/dashboard.json create mode 100644 inputs/tengine/README_CN.md create mode 100644 inputs/tengine/dashboard.json create mode 100644 inputs/tpl/README_CN.md create mode 100644 inputs/tpl/dashboard.json create mode 100644 inputs/vsphere/README.md create mode 100644 inputs/vsphere/README_CN.md create mode 100644 inputs/vsphere/dashboard.json create mode 100644 inputs/weblogic/README_CN.md create mode 100644 inputs/weblogic/dashboard.json create mode 100644 inputs/whois/README_CN.md create mode 100644 inputs/whois/dashboard.json create mode 100644 inputs/xskyapi/README.md create mode 100644 inputs/xskyapi/README_CN.md create mode 100644 inputs/xskyapi/dashboard.json diff --git a/inputs/activemq/README.md b/inputs/activemq/README.md index fa65bb3d4..9e0251998 100644 --- a/inputs/activemq/README.md +++ b/inputs/activemq/README.md @@ -1,3 +1,13 @@ -# activemq +# ActiveMQ -ActiveMQ 当前可以使用 jolokia_agent 插件来监控,通过读取 jmx 
数据的方式获取监控指标,配置文件可以参考:[activemq.toml](../../conf/input.jolokia_agent_misc/activemq.toml) +ActiveMQ can be monitored using the `jolokia_agent` plugin, which retrieves metrics by reading JMX data. + +For configuration details, please refer to: [activemq.toml](../../conf/input.jolokia_agent_misc/activemq.toml). + +## Metrics + +Once configured via the Jolokia Agent plugin, Categraf will export the following types of metrics: +- **Broker Metrics**: e.g., `activemq_broker_TotalMessageCount`, `activemq_broker_TotalConsumerCount` +- **Queue Metrics**: e.g., `activemq_queue_QueueSize`, `activemq_queue_ConsumerCount` +- **Topic Metrics**: e.g., `activemq_topic_EnqueueCount`, `activemq_topic_DequeueCount` +- **JVM Metrics**: Generic Java Runtime metrics such as Garbage Collection, Memory Heap, etc. diff --git a/inputs/activemq/README_CN.md b/inputs/activemq/README_CN.md new file mode 100644 index 000000000..fa65bb3d4 --- /dev/null +++ b/inputs/activemq/README_CN.md @@ -0,0 +1,3 @@ +# activemq + +ActiveMQ 当前可以使用 jolokia_agent 插件来监控,通过读取 jmx 数据的方式获取监控指标,配置文件可以参考:[activemq.toml](../../conf/input.jolokia_agent_misc/activemq.toml) diff --git a/inputs/activemq/dashboard.json b/inputs/activemq/dashboard.json new file mode 100644 index 000000000..087ab97df --- /dev/null +++ b/inputs/activemq/dashboard.json @@ -0,0 +1,142 @@ +{ + "title": "ActiveMQ", + "uid": "16eab841", + "tags": [ + "activemq" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Total Message Count", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 8, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "activemq_broker_TotalMessageCount", + "legendFormat": "{{brokerName}}", + "refId": "A" + } + ] + }, + { + "title": "Total Consumer Count", + "type": "timeseries", + "gridPos": { + "x": 8, + "y": 0, + "w": 8, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "activemq_broker_TotalConsumerCount", + "legendFormat": "{{brokerName}}", + "refId": "A" + } + ] + }, + { + 
"title": "Memory Percent Usage", + "type": "timeseries", + "gridPos": { + "x": 16, + "y": 0, + "w": 8, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "activemq_broker_MemoryPercentUsage", + "legendFormat": "{{brokerName}}", + "refId": "A" + } + ] + }, + { + "title": "Queue Size", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "activemq_queue_QueueSize", + "legendFormat": "{{destinationName}}", + "refId": "A" + } + ] + }, + { + "title": "Queue Consumer Count", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 5, + "targets": [ + { + "expr": "activemq_queue_ConsumerCount", + "legendFormat": "{{destinationName}}", + "refId": "A" + } + ] + }, + { + "title": "Topic Enqueue Count", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 6, + "targets": [ + { + "expr": "activemq_topic_EnqueueCount", + "legendFormat": "{{destinationName}}", + "refId": "A" + } + ] + }, + { + "title": "Topic Dequeue Count", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 16, + "w": 12, + "h": 8 + }, + "id": 7, + "targets": [ + { + "expr": "activemq_topic_DequeueCount", + "legendFormat": "{{destinationName}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/aliyun/README.md b/inputs/aliyun/README.md index 82df7488b..e4003c117 100644 --- a/inputs/aliyun/README.md +++ b/inputs/aliyun/README.md @@ -24,13 +24,21 @@ RAM 用户授权。RAM 用户调用云监控 API 前,需要所属的阿里云 4. 
配置 +不同namespace附件label的权限点 + - rds: + - `rds:DescribeDBInstances` `acs:rds:{#regionId}:{#accountId}:dbinstance/{#dbinstanceId}` [链接](https://next.api.aliyun.com/document/Rds/2014-08-15/DescribeDBInstances) + - `rds:ListTagResources` `acs:rds:{#regionId}:{#accountId}:dbinstance/{#dbinstanceId}` [链接](https://next.api.aliyun.com/document/Rds/2014-08-15/ListTagResources) + - polardb: `acs:DescribeDBClusters` `acs:polardb:*:*:dbcluster` [链接](https://help.aliyun.com/document_detail/118034.html?spm=a2c4g.98094.0.0) + - kvstore: `DescribeInstances` `acs:kvstore:$regionid:$accountid:instance/$instanceid` [链接](https://help.aliyun.com/apsara/enterprise/v_3_18_0/kvstore/enterprise-developer-guide/api-authentication-rules.html) + - ecs: `DescribeInstances` `acs:ecs:$regionid:$accountid:instance/*` [链接](https://help.aliyun.com/document_detail/25497.html?spm=a2c4g.25506.0.0) + ```toml # # categraf采集周期,阿里云指标的粒度一般是60秒,建议设置不要少于60秒 # interval = 60 [[instances]] ## 阿里云资源所处的region ## endpoint region 参考 https://help.aliyun.com/document_detail/28616.html#section-72p-xhs-6qt -region="cn-beijing" +regions=["cn-beijing","cn-shanghai"] endpoint="metrics.cn-hangzhou.aliyuncs.com" ## 填入你的acces_key_id access_key_id="" diff --git a/inputs/amd_rocm_smi/README_CN.md b/inputs/amd_rocm_smi/README_CN.md new file mode 100644 index 000000000..6438d46bd --- /dev/null +++ b/inputs/amd_rocm_smi/README_CN.md @@ -0,0 +1,53 @@ +# AMD ROCm System Management Interface (SMI) 采集插件 + +该插件 fork 自 [telegraf/amd_rocm_smi](https://github.com/influxdata/telegraf/blob/master/plugins/inputs/amd_rocm_smi) + +此插件通过执行 [`rocm-smi`][1] 命令来获取 AMD GPU 的状态指标,包括显存使用、GPU 使用率、温度等。 + +[1]: https://github.com/RadeonOpenCompute/rocm_smi_lib/tree/master/python_smi_tools + +## 配置说明 + +```toml +# 使用 rocm-smi 命令查询 AMD 显卡统计信息 +# bin_path = "/opt/rocm/bin/rocm-smi" +# 如果不设置 bin_path,则不会进行采集 + +## 可选: GPU 轮询的超时时间 +# timeout = "5s" +``` + +## 采集指标 + +- 测量名称: `amd_rocm_smi` + - 标签 (Tags) + - `name` (rocm-smi 可执行文件分配的显卡名称) + - `gpu_id` 
(rocm-smi 识别的 GPU ID) + - `gpu_unique_id` (GPU 的唯一 ID) + + - 字段 (Fields) + - `driver_version` (整数) + - `fan_speed` (整数,风扇转速百分比) + - `memory_total` (整数 B,显存总量) + - `memory_used` (整数 B,已用显存) + - `memory_free` (整数 B,空闲显存) + - `temperature_sensor_edge` (浮点数,摄氏度) + - `temperature_sensor_junction` (浮点数,结温摄氏度) + - `temperature_sensor_memory` (浮点数,显存温度摄氏度) + - `utilization_gpu` (整数,GPU 使用率百分比) + - `utilization_memory` (整数,显存使用率百分比) + - `clocks_current_sm` (整数,Mhz) + - `clocks_current_memory` (整数,Mhz) + - `power_draw` (浮点数,瓦特) + +## 故障排除 + +如果遇到问题,可以尝试手动运行完整的 `rocm-smi` 命令来检查输出结果。 + +Linux 环境下: + +```sh +rocm-smi -o -l -m -M -g -c -t -u -i -f -p -P -s -S -v --showreplaycount --showpids --showdriverversion --showmemvendor --showfwinfo --showproductname --showserial --showuniqueid --showbus --showpendingpages --showpagesinfo --showretiredpages --showunreservablepages --showmemuse --showvoltage --showtopo --showtopoweight --showtopohops --showtopotype --showtoponuma --showmeminfo all --json +``` + +如果在 GitHub 提交 issue,请附上此命令的输出结果以及您所使用的 ROCm 版本。 diff --git a/inputs/amd_rocm_smi/dashboard.json b/inputs/amd_rocm_smi/dashboard.json new file mode 100644 index 000000000..4963f1360 --- /dev/null +++ b/inputs/amd_rocm_smi/dashboard.json @@ -0,0 +1,160 @@ +{ + "title": "AMD ROCm SMI", + "uid": "adadd6dc", + "tags": [ + "amd rocm smi" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "GPU Utilization", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "amd_rocm_smi_utilization_gpu", + "legendFormat": "{{name}} ({{gpu_id}})", + "refId": "A" + } + ] + }, + { + "title": "Memory Utilization", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "amd_rocm_smi_utilization_memory", + "legendFormat": "{{name}} ({{gpu_id}})", + "refId": "A" + } + ] + }, + { + "title": "Memory Used (Bytes)", + "type": 
"timeseries", + "gridPos": { + "x": 16, + "y": 0, + "w": 8, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "amd_rocm_smi_memory_used", + "legendFormat": "{{name}} ({{gpu_id}})", + "refId": "A" + } + ] + }, + { + "title": "Memory Free (Bytes)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 8, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "amd_rocm_smi_memory_free", + "legendFormat": "{{name}} ({{gpu_id}})", + "refId": "A" + } + ] + }, + { + "title": "Power Draw (W)", + "type": "timeseries", + "gridPos": { + "x": 8, + "y": 8, + "w": 8, + "h": 8 + }, + "id": 5, + "targets": [ + { + "expr": "amd_rocm_smi_power_draw", + "legendFormat": "{{name}} ({{gpu_id}})", + "refId": "A" + } + ] + }, + { + "title": "Temperature (Edge)", + "type": "timeseries", + "gridPos": { + "x": 16, + "y": 8, + "w": 8, + "h": 8 + }, + "id": 6, + "targets": [ + { + "expr": "amd_rocm_smi_temperature_sensor_edge", + "legendFormat": "{{name}} ({{gpu_id}})", + "refId": "A" + } + ] + }, + { + "title": "Temperature (Junction)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 16, + "w": 8, + "h": 8 + }, + "id": 7, + "targets": [ + { + "expr": "amd_rocm_smi_temperature_sensor_junction", + "legendFormat": "{{name}} ({{gpu_id}})", + "refId": "A" + } + ] + }, + { + "title": "Fan Speed (%)", + "type": "timeseries", + "gridPos": { + "x": 8, + "y": 16, + "w": 8, + "h": 8 + }, + "id": 8, + "targets": [ + { + "expr": "amd_rocm_smi_fan_speed", + "legendFormat": "{{name}} ({{gpu_id}})", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/apache/README.md b/inputs/apache/README.md index f414f6e84..971dbe4cc 100644 --- a/inputs/apache/README.md +++ b/inputs/apache/README.md @@ -1,23 +1,41 @@ -forked from [apache/README.md](https://github.com/Lusitaniae/apache_exporter/tree/master/README.md) +# Apache -``` +This plugin collects metrics from the Apache HTTP Server by parsing the 
output of the `mod_status` module. -[[instances]] -## apache 如何设置server-status页面 https://statuslist.app/apache/apache-status-page-simple-setup-guide/ +## Configuration -## 这里填写apache server-status页面的地址 -# scrape_uri = "http://localhost/server-status/?auto" +To use this plugin, you must enable `mod_status` in your Apache configuration and make it accessible. It is highly recommended to append `?auto` to the scrape URI to get the machine-readable output. -## 是否覆盖host -# host_override = "" +```toml +[[instances]] +# The URL to the server-status page. +# Ensure that '?auto' is appended to the URL. +scrape_uri = "http://localhost/server-status/?auto" -## 是否跳过https证书验证 +# Optional: Override the Host header +# host_override = "example.com" + +# Optional: Skip TLS verification # insecure = false +``` + +### Apache mod_status Setup + +Enable the `mod_status` module in your `httpd.conf` or `apache2.conf`: + +```apache +<Location "/server-status"> + SetHandler server-status + Require local +</Location> +``` + +Restart Apache for the changes to take effect. -## 自定义请求header -# custom_headers = {} +## Metrics -## 日志级别 -# level: debug,info,warn,error -# log_level = "info" -``` \ No newline at end of file +- `apache_accesses_total`: Current total accesses +- `apache_workers`: Apache worker states (busy, idle, etc.) 
+- `apache_scoreboard`: Number of workers in each state +- `apache_up`: Indicates whether the Apache server was reachable +- `apache_uptime_seconds_total`: Current uptime in seconds diff --git a/inputs/apache/README_CN.md b/inputs/apache/README_CN.md new file mode 100644 index 000000000..c903feba0 --- /dev/null +++ b/inputs/apache/README_CN.md @@ -0,0 +1,41 @@ +# Apache 采集插件 + +此插件通过解析 Apache HTTP Server 的 `mod_status` 模块输出,来获取服务器的运行状态和性能指标。 + +## 配置说明 + +要使用此插件,您必须在 Apache 配置中启用 `mod_status` 模块,并确保 Categraf 能够访问该状态页面。建议在 URL 末尾加上 `?auto` 参数,以便获取机器可读的纯文本格式。 + +```toml +[[instances]] +# server-status 页面的 URL +# 请务必带上 '?auto' 参数 +scrape_uri = "http://localhost/server-status/?auto" + +# 可选: 覆盖请求的 Host 头 +# host_override = "example.com" + +# 可选: 跳过 TLS 证书校验 +# insecure = false +``` + +### Apache mod_status 模块配置 + +在您的 `httpd.conf` 或 `apache2.conf` 文件中添加/取消注释以下内容以启用 `mod_status`: + +```apache +<Location "/server-status"> + SetHandler server-status + Require local +</Location> +``` + +修改后,请重启 Apache 服务以使配置生效。 + +## 采集指标 + +- `apache_accesses_total`: 服务器总处理请求数 +- `apache_workers`: Apache 各类 worker 的数量 (例如 busy, idle) +- `apache_scoreboard`: 处于不同状态(如读、写、保持连接等)的 worker 数量 +- `apache_up`: Apache 状态页是否可以正常连通 +- `apache_uptime_seconds_total`: Apache 运行时间(秒) diff --git a/inputs/apache/dashboard.json b/inputs/apache/dashboard.json new file mode 100644 index 000000000..bb5101a96 --- /dev/null +++ b/inputs/apache/dashboard.json @@ -0,0 +1,106 @@ +{ + "title": "Apache", + "uid": "2f5c1438", + "tags": [ + "apache" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Total Accesses", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "apache_accesses_total", + "legendFormat": "Total Accesses", + "refId": "A" + } + ] + }, + { + "title": "Uptime", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "apache_uptime_seconds_total", + "legendFormat": 
"Uptime", + "refId": "A" + } + ] + }, + { + "title": "Busy Workers", + "type": "timeseries", + "gridPos": { + "x": 16, + "y": 0, + "w": 8, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "apache_workers{state=\"busy\"}", + "legendFormat": "Busy Workers", + "refId": "A" + } + ] + }, + { + "title": "Idle Workers", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 8, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "apache_workers{state=\"idle\"}", + "legendFormat": "Idle Workers", + "refId": "A" + } + ] + }, + { + "title": "Scoreboard", + "type": "timeseries", + "gridPos": { + "x": 8, + "y": 8, + "w": 8, + "h": 8 + }, + "id": 5, + "targets": [ + { + "expr": "apache_scoreboard", + "legendFormat": "{{state}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/appdynamics/README.md b/inputs/appdynamics/README.md new file mode 100644 index 000000000..1cbaa839f --- /dev/null +++ b/inputs/appdynamics/README.md @@ -0,0 +1,39 @@ +# AppDynamics + +The AppDynamics plugin fetches metrics from the AppDynamics REST API and converts them into Prometheus metrics format. + +## Configuration + +The plugin uses a base URL and a list of variables to dynamically construct the API queries. It parses the JSON response from AppDynamics and exports the specified values. + +```toml +[[instances]] +# The base URL of the AppDynamics REST API. 
+url_base = "http://your-appdynamics-controller:8090/controller/rest/applications/{{.app_id}}/metric-data?metric-path={{.metric_path}}&time-range-type=BETWEEN_TIMES&start-time=$START_TIME&end-time=$END_TIME&output=JSON" + +# Variables to inject into the url_base template +url_vars = [ + { app_id = "123", metric_path = "Overall Application Performance|Calls per Minute" }, + { app_id = "123", metric_path = "Overall Application Performance|Average Response Time (ms)" } +] + +# Authentication credentials +username = "user@tenant" +password = "your-password" + +# Filters specify which fields from the AppDynamics metric payload to extract. +# Available filters: "current", "max", "min", "count", "sum", "value". +# If empty, defaults to "current". +filters = ["current", "sum"] + +# Timeout and scraping frequencies +timeout = "5s" +delay = "1m" +period = "1m" +``` + +## Metrics + +- `up`: Indicates if the AppDynamics API endpoint was reachable and returned a valid response. +- `appdynamics_{metric_name}_{filter}`: Dynamically generated metric based on the `metric_path`. `metric_name` is derived from the last segment of the `metric_path` converted to snake_case. + - Example: For `Overall Application Performance|Calls per Minute` with filter `current`, it will produce `appdynamics_calls_per_minute_current`. 
diff --git a/inputs/appdynamics/README_CN.md b/inputs/appdynamics/README_CN.md new file mode 100644 index 000000000..ba02789db --- /dev/null +++ b/inputs/appdynamics/README_CN.md @@ -0,0 +1,39 @@ +# AppDynamics 采集插件 + +此插件通过调用 AppDynamics REST API 来抓取监控指标,并将 JSON 响应转换为 Categraf 可用的 Prometheus 指标格式。 + +## 配置说明 + +插件通过配置 `url_base` 模板和 `url_vars` 变量列表来动态拼装 API 请求地址。由于 AppDynamics API 需要指定时间范围,插件会自动替换请求中的 `$START_TIME` 和 `$END_TIME` 占位符。 + +```toml +[[instances]] +# AppDynamics Controller REST API 的基础请求模板 +url_base = "http://your-appdynamics-controller:8090/controller/rest/applications/{{.app_id}}/metric-data?metric-path={{.metric_path}}&time-range-type=BETWEEN_TIMES&start-time=$START_TIME&end-time=$END_TIME&output=JSON" + +# 注入到 url_base 模板中的变量列表,可以配置多个查询任务 +url_vars = [ + { app_id = "123", metric_path = "Overall Application Performance|Calls per Minute" }, + { app_id = "123", metric_path = "Overall Application Performance|Average Response Time (ms)" } +] + +# 接口基础认证 (Basic Auth) +username = "user@tenant" +password = "your-password" + +# 需要提取的指标字段类型。 +# 可选项: "current", "max", "min", "count", "sum", "value"。 +# 若不指定,默认提取 "current"。 +filters = ["current", "sum"] + +# 可选: 网络与采集时间参数 +timeout = "5s" +delay = "1m" +period = "1m" +``` + +## 采集指标 + +- `up`: AppDynamics 接口是否连通并返回了正常数据(1 为正常,0 为失败)。 +- `appdynamics_{metric_name}_{filter}`: 动态生成的指标。`metric_name` 是从 API 路径中最后一个层级提取并经过 `snake_case` 转换得来的。 + - 例如:当请求路径为 `Overall Application Performance|Calls per Minute`,且 filter 包含 `current` 时,将产生指标 `appdynamics_calls_per_minute_current`。 diff --git a/inputs/appdynamics/dashboard.json b/inputs/appdynamics/dashboard.json new file mode 100644 index 000000000..f8876d4f4 --- /dev/null +++ b/inputs/appdynamics/dashboard.json @@ -0,0 +1,88 @@ +{ + "title": "AppDynamics", + "uid": "56586b1c", + "tags": [ + "appdynamics" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "AppDynamics API Health", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + 
"w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "up{job=\"appdynamics\"}", + "legendFormat": "{{metric_path}}", + "refId": "A" + } + ] + }, + { + "title": "AppDynamics Calls per Minute", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "appdynamics_calls_per_minute_current", + "legendFormat": "{{metric_path}}", + "refId": "A" + } + ] + }, + { + "title": "Average Response Time (ms)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "appdynamics_average_response_time__ms__current", + "legendFormat": "{{metric_path}}", + "refId": "A" + } + ] + }, + { + "title": "Errors per Minute", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "appdynamics_errors_per_minute_current", + "legendFormat": "{{metric_path}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/arp_packet/README.md b/inputs/arp_packet/README.md index cf17762e1..88c7ce0e6 100644 --- a/inputs/arp_packet/README.md +++ b/inputs/arp_packet/README.md @@ -1,30 +1,47 @@ -# 调整间隔时间 -如有诉求对此插件本身的采集间隔时间调整的话就启用,单位为秒 +# ARP Packet + +This plugin captures ARP request and response packets on a specified network interface using a BPF filter, keeping track of the total packet counts for the local IP address. + +> **Note**: Running this plugin requires packet capture capabilities (e.g., running as root or having `CAP_NET_RAW` capability) and libpcap dependencies. 
+ +## Configuration + +```toml +# Collection interval in seconds interval = 15 -# 获取被监控端设备的网卡名称 -可用以下命令获取网卡名称列表 +[[instances]] +# The name of the network interface to monitor +eth_device = "eth0" ``` -ip addr | grep '^[0-9]' |awk -F':' '{print $2}' +### Finding the Interface Name + +You can use the following command to get a list of available network interfaces: + +```sh +ip addr | grep '^[0-9]' | awk -F':' '{print $2}' +``` +Example output: +```text lo eth0 - br-153e7f4f0c83 - br-2f302c2a8faa - br-5ae0cdb82efc - br-68cba8773a8c - br-c50ca3122079 docker0 - br-fd769e4347bd - veth944ac75@if52 -``` -# 在数组instances中启用eth_device -将以上获取的网卡列表,根据自己的诉求填入,如eth0 -``` -eth_device="eth0" ``` -# 测试是否能获取到值 -``` -./categraf --test --inputs arp_packet +Select the appropriate interface (e.g., `eth0`) and set it in the `eth_device` parameter. + +## Metrics + +- `arp_packet_request_num`: Total number of ARP requests sent from the monitored interface. +- `arp_packet_response_num`: Total number of ARP responses received on the monitored interface. + +All metrics include the `sourceAddr` tag, which contains the bound local IPv4 address. 
+ +## Testing + +You can use the following command to test if the plugin is successfully capturing ARP packets: + +```sh +./categraf --test --inputs arp_packet ``` diff --git a/inputs/arp_packet/README_CN.md b/inputs/arp_packet/README_CN.md new file mode 100644 index 000000000..b155cfdf6 --- /dev/null +++ b/inputs/arp_packet/README_CN.md @@ -0,0 +1,47 @@ +# ARP Packet 采集插件 + +该插件通过监听指定的网卡,使用 BPF 过滤器捕获 ARP 请求和响应包,从而统计本地 IP 地址发出的 ARP 包数量。 + +> 注意:运行该插件需要 Categraf 拥有捕获网络数据包的权限(例如 root 权限或 CAP_NET_RAW 权限),且系统依赖 libpcap。 + +## 配置说明 + +```toml +# 采集间隔时间 (单位: 秒) +interval = 15 + +[[instances]] +# 被监控端设备的网卡名称 +eth_device = "eth0" +``` + +### 获取网卡名称 + +您可以使用以下命令获取可用的网卡名称列表: + +```sh +ip addr | grep '^[0-9]' | awk -F':' '{print $2}' +``` +示例输出: +```text + lo + eth0 + docker0 +``` + +根据您的实际情况,将目标网卡(如 `eth0`)填入 `eth_device` 参数中。 + +## 采集指标 + +- `arp_packet_request_num`: 监听网卡上累计发出的 ARP 请求数 +- `arp_packet_response_num`: 监听网卡上累计收到的 ARP 响应数 + +所有指标会附带标签 `sourceAddr`,表示绑定的本地 IPv4 地址。 + +## 测试 + +您可以使用以下命令单独测试该插件能否正常获取到值: + +```sh +./categraf --test --inputs arp_packet +``` diff --git a/inputs/arp_packet/dashboard.json b/inputs/arp_packet/dashboard.json new file mode 100644 index 000000000..5753041c2 --- /dev/null +++ b/inputs/arp_packet/dashboard.json @@ -0,0 +1,52 @@ +{ + "title": "ARP Packet", + "uid": "dadb7b1e", + "tags": [ + "arp packet" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "ARP Packet Request Num", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "arp_packet_request_num", + "legendFormat": "{{sourceAddr}}", + "refId": "A" + } + ] + }, + { + "title": "ARP Packet Response Num", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "arp_packet_response_num", + "legendFormat": "{{sourceAddr}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + 
} +} \ No newline at end of file diff --git a/inputs/bind/README.md b/inputs/bind/README.md index 1c69dc2eb..ad3244b35 100644 --- a/inputs/bind/README.md +++ b/inputs/bind/README.md @@ -1,13 +1,39 @@ -forked from [telegraf/snmp](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/bind) +# BIND 9 Input Plugin -配置示例 +This plugin is forked from [telegraf/bind](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/bind). + +This plugin reads statistics from BIND 9's Statistics Channel (XML or JSON format), collecting metrics on DNS queries, server status, and memory context. + +## Configuration + +To use this plugin, you must configure the `statistics-channels` in your BIND 9 `named.conf`: + +```text +statistics-channels { + inet 127.0.0.1 port 8053 allow { 127.0.0.1; }; +}; ``` + +Then configure Categraf as follows: + +```toml [[instances]] +# URL to the BIND 9 statistics channel (XML/JSON supported) urls = [ - #"http://localhost:8053/xml/v3", + "http://localhost:8053/xml/v3", + # "http://localhost:8053/json/v1" ] timeout = "5s" +# Set to true to collect detailed memory context metrics gather_memory_contexts = true +# Set to true to collect metrics per view gather_views = true -``` \ No newline at end of file +``` + +## Metrics + +- `bind_server_*`: Global server metrics, such as total requests, queries, success, nxrrset, failure, recursion, etc. +- `bind_memory_context_*`: Internal memory usage by various BIND modules (requires `gather_memory_contexts = true`). +- `bind_view_*`: Per-view query metrics (requires `gather_views = true`). +- `bind_up`: Whether the statistics channel was reachable. 
diff --git a/inputs/bind/README_CN.md b/inputs/bind/README_CN.md new file mode 100644 index 000000000..40d25b126 --- /dev/null +++ b/inputs/bind/README_CN.md @@ -0,0 +1,39 @@ +# BIND 9 采集插件 + +该插件 fork 自 [telegraf/bind](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/bind)。 + +此插件通过读取 BIND 9 的 XML 或 JSON 状态统计接口(Statistics Channel)来收集 DNS 查询、服务器状态以及内存等指标数据。 + +## 配置说明 + +要使用此插件,您需要在 `named.conf` 中配置统计通道 (`statistics-channels`),例如: + +```text +statistics-channels { + inet 127.0.0.1 port 8053 allow { 127.0.0.1; }; +}; +``` + +然后在 Categraf 中进行如下配置: + +```toml +[[instances]] +# BIND 9 状态接口地址,支持 XML/JSON +urls = [ + "http://localhost:8053/xml/v3", + # "http://localhost:8053/json/v1" +] + +timeout = "5s" +# 是否采集详细的内存上下文指标 +gather_memory_contexts = true +# 是否采集视图 (views) 相关指标 +gather_views = true +``` + +## 采集指标 + +- `bind_server_*`: BIND 服务器的全局请求数、查询数、成功/失败/拒绝的解析数等。 +- `bind_memory_context_*`: BIND 内部各模块的内存使用量(需开启 `gather_memory_contexts`)。 +- `bind_view_*`: 按 DNS View 统计的查询数据(需开启 `gather_views`)。 +- `bind_up`: 目标统计接口是否可达。 \ No newline at end of file diff --git a/inputs/bind/dashboard.json b/inputs/bind/dashboard.json new file mode 100644 index 000000000..5980cd4e0 --- /dev/null +++ b/inputs/bind/dashboard.json @@ -0,0 +1,124 @@ +{ + "title": "BIND 9", + "uid": "6f9a94e0", + "tags": [ + "bind 9" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "BIND Status", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 24, + "h": 4 + }, + "id": 1, + "targets": [ + { + "expr": "bind_up", + "legendFormat": "{{url}}", + "refId": "A" + } + ] + }, + { + "title": "Queries Total", + "type": "timeseries", + "gridPos": { + "x": 8, + "y": 4, + "w": 8, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "bind_server_queries", + "legendFormat": "Queries", + "refId": "A" + } + ] + }, + { + "title": "Requests Total", + "type": "timeseries", + "gridPos": { + "x": 16, + "y": 4, + "w": 8, + "h": 8 + }, + "id": 3, + "targets": [ + { + 
"expr": "bind_server_requests", + "legendFormat": "Requests", + "refId": "A" + } + ] + }, + { + "title": "Responses Total", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 8, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "bind_server_responses", + "legendFormat": "Responses", + "refId": "A" + } + ] + }, + { + "title": "Total Use Memory (Context)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 16, + "w": 12, + "h": 8 + }, + "id": 5, + "targets": [ + { + "expr": "bind_memory_context_TotalUse", + "legendFormat": "Total Use", + "refId": "A" + } + ] + }, + { + "title": "In Use Memory (Context)", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 16, + "w": 12, + "h": 8 + }, + "id": 6, + "targets": [ + { + "expr": "bind_memory_context_InUse", + "legendFormat": "In Use", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/bitbucket/README.md b/inputs/bitbucket/README.md index c0a61ddb7..bed72bf74 100644 --- a/inputs/bitbucket/README.md +++ b/inputs/bitbucket/README.md @@ -1,3 +1,13 @@ -# bitbucket +# Bitbucket -bitbucket 当前可以使用 jolokia_agent 插件来监控,通过读取 jmx 数据的方式获取监控指标,配置文件可以参考:[bitbucket.toml](../../conf/input.jolokia_agent_misc/bitbucket.toml) +Bitbucket can be monitored using the `jolokia_agent` plugin, which retrieves metrics by reading JMX data from Atlassian Bitbucket. + +For configuration details, please refer to: [bitbucket.toml](../../conf/input.jolokia_agent_misc/bitbucket.toml). 
+ +## Metrics + +Once configured via the Jolokia Agent plugin, Categraf will export the following types of metrics: +- **JVM Metrics**: e.g., `bitbucket_jvm_operatingsystem_*`, `bitbucket_jvm_memory_*`, `bitbucket_jvm_thread_*` +- **Webhooks**: e.g., `bitbucket_webhooks_*` +- **Atlassian Bitbucket Metrics**: e.g., `bitbucket_atlassian_*` +- **Thread Pools**: e.g., `bitbucket_thread_pools_*` diff --git a/inputs/bitbucket/README_CN.md b/inputs/bitbucket/README_CN.md new file mode 100644 index 000000000..c0a61ddb7 --- /dev/null +++ b/inputs/bitbucket/README_CN.md @@ -0,0 +1,3 @@ +# bitbucket + +bitbucket 当前可以使用 jolokia_agent 插件来监控,通过读取 jmx 数据的方式获取监控指标,配置文件可以参考:[bitbucket.toml](../../conf/input.jolokia_agent_misc/bitbucket.toml) diff --git a/inputs/bitbucket/dashboard.json b/inputs/bitbucket/dashboard.json new file mode 100644 index 000000000..96de17207 --- /dev/null +++ b/inputs/bitbucket/dashboard.json @@ -0,0 +1,75 @@ +{ + "title": "Bitbucket (Jolokia)", + "uid": "c29859b5", + "tags": [ + "bitbucket (jolokia)" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "JVM Heap Memory Usage", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "bitbucket_jvm_memory_HeapMemoryUsage_used", + "legendFormat": "Used", + "refId": "A" + }, + { + "expr": "bitbucket_jvm_memory_HeapMemoryUsage_max", + "legendFormat": "Max", + "refId": "B" + } + ] + }, + { + "title": "JVM Thread Count", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "bitbucket_jvm_thread_ThreadCount", + "legendFormat": "Threads", + "refId": "A" + } + ] + }, + { + "title": "System Load Average", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "bitbucket_jvm_operatingsystem_SystemLoadAverage", + "legendFormat": "Load", + "refId": "A" + } + ] + } + ], + "refresh": 
"1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/cadvisor/README.md b/inputs/cadvisor/README.md index 56e6070ef..7454103d7 100644 --- a/inputs/cadvisor/README.md +++ b/inputs/cadvisor/README.md @@ -1,6 +1,6 @@ -## cadvisor +# cAdvisor Input Plugin -cadvisor 采集插件, 采集cadvisor 数据,如果是通过kubelet采集,可以附加pod的label和annotation +The cAdvisor input plugin collects metrics from cAdvisor. If it is collected via `kubelet`, it can optionally append pod labels and annotations. ## Configuration @@ -9,34 +9,36 @@ cadvisor 采集插件, 采集cadvisor 数据,如果是通过kubelet采集, # interval = 15 [[instances]] -# 填写kubelet的ip和port +# Specify the kubelet IP and port url = "https://1.2.3.4:10250/metrics/cadvisor" -# 如果path为空, 会自动补齐为/metrics/cadvisor +# If the path is empty, it will be automatically appended as /metrics/cadvisor # url = "https://1.2.3.4:10250" -# 如果是通过kubelet采集,可以附加pod的label和annotation + +# If collecting via kubelet, you can append pod labels and annotations type = "kubelet" -# 直接采集cadvisor , type 设置为cadvisor +# If collecting directly from cAdvisor, set type to "cadvisor" #url = "http://1.2.3.4:8080/metrics" #type = "cadvisor" -# url_label_key 和 url_label_value 用法参加下面说明 +# Usage of url_label_key and url_label_value is explained below url_label_key = "instance" url_label_value = "{{.Host}}" -# # 认证的token 或者token file + +# Authentication token or token file #bearer_token_string = "eyJhblonglongXXX.eyJplonglongYYY.oQsXlonglongZ-Z-Z" bearer_token_file = "/path/to/token/file" -# 需要忽略的label key +# Label keys to ignore ignore_label_keys = ["id","name", "container_label*"] -# 只采集那些label key, 建议保持为空,采集所有的label。 优先级高于ignore_label_keys。 -# 放开choose_label_keys配置时,如果不使用["*"],需要里面包含"pod","namespace",否则采集不到pod标签,例如:["app","pod","namespace"] +# Label keys to explicitly choose. It is recommended to leave this empty to collect all labels. +# This takes precedence over ignore_label_keys. 
#choose_label_keys = ["*"] timeout = "3s" # # Optional TLS Config -# # 想跳过自签证书,use_tls 记得要配置为true +# # To skip verification of self-signed certificates, use_tls must also be set to true use_tls = true # tls_min_version = "1.2" # tls_ca = "/etc/categraf/ca.pem" @@ -46,58 +48,31 @@ use_tls = true insecure_skip_verify = true ``` -## url_label_key 和 url_label_value 用法 +## `url_label_key` and `url_label_value` Usage + ```toml -# 从URL中提取Host部分,放到instance label中 -# 假设 url =https://1.2.3.4:10250/metrics/cadvisor -# 最终附加的label为 instance=1.2.3.4:10250 +# Extract the Host part from the URL and put it into the instance label +# Assuming url = https://1.2.3.4:10250/metrics/cadvisor +# The final appended label will be instance=1.2.3.4:10250 url_label_key = "instance" url_label_value = "{{.Host}}" ``` -如果 scheme 部分和 path 部分都想取,可以这么写: +If you want to include both the scheme and the path, you can format it like this: ```toml url_label_value = "{{.Scheme}}://{{.Host}}{{.Path}}" ``` -相关变量是用这个方法生成的,供大家参考: - -```go -func (ul *UrlLabel) GenerateLabel(u *url.URL) (string, string, error) { - if ul.LabelValue == "" { - return ul.LabelKey, u.String(), nil - } - - dict := map[string]string{ - "Scheme": u.Scheme, - "Host": u.Host, - "Hostname": u.Hostname(), - "Port": u.Port(), - "Path": u.Path, - "Query": u.RawQuery, - "Fragment": u.Fragment, - } - - var buffer bytes.Buffer - err := ul.LabelValueTpl.Execute(&buffer, dict) - if err != nil { - return "", "", err - } - - return ul.LabelKey, buffer.String(), nil -} -``` - -以 `http://1.2.3.4:8080/search?q=keyword#results` 为例, 变量及其值如下: +Taking `http://1.2.3.4:8080/search?q=keyword#results` as an example, the available template variables and their values are: -|variable|value| +| variable | value | |---|---| -|{{.Scheme}}|http| -|{{.Host}} |1.2.3.4:8080| -|{{.Hostname}}|1.2.3.4| -|{{.Port}}|8080| -|{{.Path}}|search| -|{{.Query}}|q=keyword| -|{{.Fragment}}| results| +| `{{.Scheme}}` | http | +| `{{.Host}}` | 1.2.3.4:8080 | +| `{{.Hostname}}` | 1.2.3.4 | +| `{{.Port}}` | 8080 | +| `{{.Path}}` | /search | +| `{{.Query}}` | 
q=keyword | +| `{{.Fragment}}` | results | diff --git a/inputs/cadvisor/README_CN.md b/inputs/cadvisor/README_CN.md new file mode 100644 index 000000000..dfe20e46c --- /dev/null +++ b/inputs/cadvisor/README_CN.md @@ -0,0 +1,102 @@ +## cadvisor + +cadvisor 采集插件, 采集cadvisor 数据,如果是通过kubelet采集,可以附加pod的label和annotation + +## Configuration + +```toml +# # collect interval +# interval = 15 + +[[instances]] +# 填写kubelet的ip和port +url = "https://1.2.3.4:10250/metrics/cadvisor" +# 如果path为空, 会自动补齐为/metrics/cadvisor +# url = "https://1.2.3.4:10250" +# 如果是通过kubelet采集,可以附加pod的label和annotation +type = "kubelet" + +# 直接采集cadvisor , type 设置为cadvisor +#url = "http://1.2.3.4:8080/metrics" +#type = "cadvisor" + +# url_label_key 和 url_label_value 用法参加下面说明 +url_label_key = "instance" +url_label_value = "{{.Host}}" +# # 认证的token 或者token file +#bearer_token_string = "eyJhblonglongXXX.eyJplonglongYYY.oQsXlonglongZ-Z-Z" +bearer_token_file = "/path/to/token/file" + +# 需要忽略的label key +ignore_label_keys = ["id","name", "container_label*"] +# 只采集那些label key, 建议保持为空,采集所有的label。 优先级高于ignore_label_keys。 +#choose_label_keys = ["*"] + +timeout = "3s" + +# # Optional TLS Config +# # 想跳过自签证书,use_tls 记得要配置为true +use_tls = true +# tls_min_version = "1.2" +# tls_ca = "/etc/categraf/ca.pem" +# tls_cert = "/etc/categraf/cert.pem" +# tls_key = "/etc/categraf/key.pem" +## Use TLS but skip chain & host verification +insecure_skip_verify = true +``` + +## url_label_key 和 url_label_value 用法 +```toml +# 从URL中提取Host部分,放到instance label中 +# 假设 url =https://1.2.3.4:10250/metrics/cadvisor +# 最终附加的label为 instance=1.2.3.4:10250 + +url_label_key = "instance" +url_label_value = "{{.Host}}" +``` + +如果 scheme 部分和 path 部分都想取,可以这么写: + +```toml +url_label_value = "{{.Scheme}}://{{.Host}}{{.Path}}" +``` + +相关变量是用这个方法生成的,供大家参考: + +```go +func (ul *UrlLabel) GenerateLabel(u *url.URL) (string, string, error) { + if ul.LabelValue == "" { + return ul.LabelKey, u.String(), nil + } + + dict := map[string]string{ + "Scheme": u.Scheme, 
+ "Host": u.Host, + "Hostname": u.Hostname(), + "Port": u.Port(), + "Path": u.Path, + "Query": u.RawQuery, + "Fragment": u.Fragment, + } + + var buffer bytes.Buffer + err := ul.LabelValueTpl.Execute(&buffer, dict) + if err != nil { + return "", "", err + } + + return ul.LabelKey, buffer.String(), nil +} +``` + +以 `http://1.2.3.4:8080/search?q=keyword#results` 为例, 变量及其值如下: + +|variable|value| +|---|---| +|{{.Scheme}}|http| +|{{.Host}} |1.2.3.4:8080| +|{{.Hostname}}|1.2.3.4| +|{{.Port}}|8080| +|{{.Path}}|search| +|{{.Query}}|q=keyword| +|{{.Fragment}}| results| \ No newline at end of file diff --git a/inputs/cadvisor/dashboard.json b/inputs/cadvisor/dashboard.json new file mode 100644 index 000000000..e833711a4 --- /dev/null +++ b/inputs/cadvisor/dashboard.json @@ -0,0 +1,88 @@ +{ + "title": "cAdvisor", + "uid": "1a468b55", + "tags": [ + "cadvisor" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Container CPU Usage", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "rate(container_cpu_usage_seconds_total[5m])", + "legendFormat": "{{container}}", + "refId": "A" + } + ] + }, + { + "title": "Container Memory Usage", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "container_memory_usage_bytes", + "legendFormat": "{{container}}", + "refId": "A" + } + ] + }, + { + "title": "Container Network Rx Bytes", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "rate(container_network_receive_bytes_total[5m])", + "legendFormat": "{{container}}", + "refId": "A" + } + ] + }, + { + "title": "Container Network Tx Bytes", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "rate(container_network_transmit_bytes_total[5m])", + "legendFormat": "{{container}}", 
+ "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/cassandra/README.md b/inputs/cassandra/README.md index ac0ae655b..17e7f188a 100644 --- a/inputs/cassandra/README.md +++ b/inputs/cassandra/README.md @@ -1,3 +1,14 @@ -# cassandra +# Cassandra -cassandra 当前可以使用 jolokia_agent 插件来监控,通过读取 jmx 数据的方式获取监控指标,配置文件可以参考:[cassandra.toml](../../conf/input.jolokia_agent_misc/cassandra.toml) +Cassandra can be monitored using the `jolokia_agent` plugin by reading JMX metrics exposed by the Apache Cassandra JVM. + +For configuration details, please refer to: [cassandra.toml](../../conf/input.jolokia_agent_misc/cassandra.toml). + +## Metrics + +When configured via the Jolokia Agent plugin, Categraf will export the following metrics: +- **JVM Memory & GC**: e.g., `java_Memory_*`, `java_GarbageCollector_*` +- **Cassandra Cache**: e.g., `cassandra_Cache_*` +- **Cassandra Client & Requests**: e.g., `cassandra_Client_*`, `cassandra_ClientRequest_*` +- **Cassandra Storage & Compaction**: e.g., `cassandra_Storage_*`, `cassandra_Compaction_*` +- **Cassandra Column Family**: e.g., `cassandra_ColumnFamily_*` diff --git a/inputs/cassandra/README_CN.md b/inputs/cassandra/README_CN.md new file mode 100644 index 000000000..ac0ae655b --- /dev/null +++ b/inputs/cassandra/README_CN.md @@ -0,0 +1,3 @@ +# cassandra + +cassandra 当前可以使用 jolokia_agent 插件来监控,通过读取 jmx 数据的方式获取监控指标,配置文件可以参考:[cassandra.toml](../../conf/input.jolokia_agent_misc/cassandra.toml) diff --git a/inputs/cassandra/dashboard.json b/inputs/cassandra/dashboard.json new file mode 100644 index 000000000..ce723720b --- /dev/null +++ b/inputs/cassandra/dashboard.json @@ -0,0 +1,93 @@ +{ + "title": "Cassandra (Jolokia)", + "uid": "57a607e6", + "tags": [ + "cassandra (jolokia)" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "JVM Heap Memory Usage", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + 
"w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "java_Memory_HeapMemoryUsage_used", + "legendFormat": "Used", + "refId": "A" + }, + { + "expr": "java_Memory_HeapMemoryUsage_max", + "legendFormat": "Max", + "refId": "B" + } + ] + }, + { + "title": "Cassandra Compaction Tasks Pending", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "cassandra_Compaction_PendingTasks_Value", + "legendFormat": "Pending Tasks", + "refId": "A" + } + ] + }, + { + "title": "Cassandra Storage Load", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "cassandra_Storage_Load_Count", + "legendFormat": "Load", + "refId": "A" + } + ] + }, + { + "title": "Cassandra Active Clients", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "cassandra_Client_connectedNativeClients_Value", + "legendFormat": "Connected Native Clients", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/clickhouse/README.md b/inputs/clickhouse/README.md index 1727c2288..135307e20 100644 --- a/inputs/clickhouse/README.md +++ b/inputs/clickhouse/README.md @@ -74,9 +74,9 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details. 
# cluster_exclude = [] ## Optional TLS Config - # tls_ca = "/etc/categraf/ca.pem" - # tls_cert = "/etc/categraf/cert.pem" - # tls_key = "/etc/categraf/key.pem" + # tls_ca = "/etc/categraf/ca.pem" + # tls_cert = "/etc/categraf/cert.pem" + # tls_key = "/etc/categraf/key.pem" ## Use TLS but skip chain & host verification # insecure_skip_verify = false diff --git a/inputs/clickhouse/README_CN.md b/inputs/clickhouse/README_CN.md new file mode 100644 index 000000000..35e3c9f64 --- /dev/null +++ b/inputs/clickhouse/README_CN.md @@ -0,0 +1,62 @@ +# ClickHouse 采集插件 + +该插件用于从 [ClickHouse](https://github.com/ClickHouse/ClickHouse) 数据库服务器收集统计数据指标。 + +## 配置说明 + +```toml +# # collect interval +# interval = 15 + +# 从一个或多个 ClickHouse 服务器读取指标 +[[instances]] + ## 用于在 ClickHouse 服务器上进行授权的用户名 + username = "default" + + ## 用于在 ClickHouse 服务器上进行授权的密码 + # password = "" + + ## 获取指标时的 HTTP(s) 超时时间 + ## 包含连接时间、重定向时间以及读取响应体的时间。 + # timeout = 5 + + ## 要抓取指标的服务器列表 + ## 通过 HTTP(s) ClickHouse 接口抓取指标 + servers = ["http://127.0.0.1:8123"] + + ## 如果将 auto_discovery 设置为 true,插件会尝试连接到集群中可用的所有服务器 + ## (使用上面配置的 username 和 password),并通过 system.clusters 系统表获取服务器列表。 + # auto_discovery = true + + ## 当 auto_discovery 为 true 时,使用 cluster_include 过滤要包含的集群名称 + ## (相当于 SQL 里的 WHERE cluster IN (...)) + # cluster_include = [] + + ## 当 auto_discovery 为 true 时,使用 cluster_exclude 排除指定的集群名称 + ## (相当于 SQL 里的 WHERE cluster NOT IN (...)) + # cluster_exclude = [] + + ## 可选的 TLS 配置 + # tls_ca = "/etc/categraf/ca.pem" + # tls_cert = "/etc/categraf/cert.pem" + # tls_key = "/etc/categraf/key.pem" + ## 忽略自签证书的安全校验 + # insecure_skip_verify = false +``` + +## 采集指标 + +所有指标主要来自 ClickHouse 的系统表 (如 `system.metrics`, `system.events` 等),指标分类如下: + +- `clickhouse_events`: 来源于 `system.events` +- `clickhouse_metrics`: 来源于 `system.metrics` +- `clickhouse_asynchronous_metrics`: 来源于 `system.asynchronous_metrics` +- `clickhouse_tables`: 包含数据库、表名、行数、数据大小 (`bytes`, `parts`, `rows`) +- `clickhouse_zookeeper`: ZooKeeper 状态指标 (如 `root_nodes`) +- 
`clickhouse_replication_queue`: 复制队列指标 (如 `too_many_tries_replicas`) +- `clickhouse_detached_parts`: 隔离的分区指标 (`detached_parts`) +- `clickhouse_dictionaries`: 字典信息指标 (`is_loaded`, `bytes_allocated`) +- `clickhouse_mutations`: 数据变更(Mutations)任务信息 (`running`, `failed`, `completed`) +- `clickhouse_disks`: 磁盘容量相关 (`free_space_percent`, `keep_free_space_percent`) +- `clickhouse_processes`: 查询进程耗时百分位数 (`percentile_50`, `percentile_90`, `longest_running`) +- `clickhouse_text_log`: 日志统计 (`messages_last_10_min`) diff --git a/inputs/clickhouse/dashboard.json b/inputs/clickhouse/dashboard.json new file mode 100644 index 000000000..e458a760e --- /dev/null +++ b/inputs/clickhouse/dashboard.json @@ -0,0 +1,88 @@ +{ + "title": "ClickHouse", + "uid": "35c094f4", + "tags": [ + "clickhouse" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "ClickHouse Insert Queries", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "rate(clickhouse_events_InsertQuery[5m])", + "legendFormat": "{{source}}", + "refId": "A" + } + ] + }, + { + "title": "ClickHouse Select Queries", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "rate(clickhouse_events_SelectQuery[5m])", + "legendFormat": "{{source}}", + "refId": "A" + } + ] + }, + { + "title": "ClickHouse Memory Tracking", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "clickhouse_metrics_MemoryTracking", + "legendFormat": "{{source}}", + "refId": "A" + } + ] + }, + { + "title": "ClickHouse Max Part Count", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "clickhouse_asynchronous_metrics_MaxPartCountForPartition", + "legendFormat": "{{source}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": 
"now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/cloudwatch/README_CN.md b/inputs/cloudwatch/README_CN.md new file mode 100644 index 000000000..1f186f60c --- /dev/null +++ b/inputs/cloudwatch/README_CN.md @@ -0,0 +1,83 @@ +# Amazon CloudWatch 采集插件 + +该插件 fork 自 [telegraf/cloudwatch](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/cloudwatch),用于从 Amazon CloudWatch 提取指标统计数据。 + +## 认证方式 + +此插件使用凭据链来与 CloudWatch API 进行认证。插件将按以下顺序尝试进行身份验证: + +1. 如果指定了 `role_arn`,则通过 STS 承担角色(源凭证根据后续规则评估) +2. 如果配置了 `access_key`, `secret_key` 和 `token`,则使用显式凭据 +3. 如果配置了 `profile`,则使用共享配置中的身份凭证 +4. 环境变量(如 `AWS_ACCESS_KEY_ID` 等) +5. 共享凭据文件 (`~/.aws/credentials`) +6. EC2 实例配置文件 (IAM 角色) + +## 配置说明 + +```toml +# 从 Amazon CloudWatch 提取指标统计数据 +[[instances]] + ## Amazon 区域 + region = "us-east-1" + + ## Amazon 凭据配置 (可选) + # access_key = "" + # secret_key = "" + # token = "" + # role_arn = "" + # web_identity_token_file = "" + # role_session_name = "" + # profile = "" + # shared_credential_file = "" + + ## 获取指标的周期 (必需) + ## 必须是 60 秒的倍数。 + period = "5m" + + ## 采集延迟时间 (必需) + ## 用于应对 CloudWatch API 中的指标生成延迟。 + delay = "5m" + + ## 建议: 将 interval 设置为 period 的倍数,以避免数据遗漏或重复抓取。 + interval = "5m" + + ## 指标所在的 Namespace 列表 (必需) + namespaces = ["AWS/ELB"] + + ## 请求 CloudWatch API 的速率限制 + # ratelimit = 25 + + ## CloudWatch HTTP 客户端超时时间 + # timeout = "5s" + + ## 指标配置过滤 + ## 默认拉取整个 Namespace 下的所有指标 + # [[instances.metrics]] + # names = ["Latency", "RequestCount"] + # + # ## 指定指标获取的统计信息 + # # statistic_include = ["average", "sum", "minimum", "maximum", "sample_count"] + # + # ## Dimension (维度) 过滤条件 + # [[instances.metrics.dimensions]] + # name = "LoadBalancerName" + # value = "p-example" +``` + +## 采集指标 + +监控的每个 CloudWatch Namespace 会作为 measurement,并提取相应的统计字段(命名为 `snake_case`): + +- `cloudwatch_{namespace}` + - `{metric}_sum` (总和) + - `{metric}_average` (平均值) + - `{metric}_minimum` (最小值) + - `{metric}_maximum` (最大值) + - `{metric}_sample_count` (采样数) + +### 标签 
(Tags) + +所有的指标都会被打上以下标签: +- `region`: CloudWatch 所在的区域 +- `{dimension-name}`: 维度名称及对应的值 diff --git a/inputs/cloudwatch/dashboard.json b/inputs/cloudwatch/dashboard.json new file mode 100644 index 000000000..85fc6bd9c --- /dev/null +++ b/inputs/cloudwatch/dashboard.json @@ -0,0 +1,88 @@ +{ + "title": "AWS CloudWatch", + "uid": "3be5e187", + "tags": [ + "aws cloudwatch" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "AWS ELB Latency Average", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "cloudwatch_aws_elb_latency_average", + "legendFormat": "{{load_balancer_name}}", + "refId": "A" + } + ] + }, + { + "title": "AWS ELB Request Count", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "cloudwatch_aws_elb_request_count_sum", + "legendFormat": "{{load_balancer_name}}", + "refId": "A" + } + ] + }, + { + "title": "AWS EC2 CPU Utilization Average", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "cloudwatch_aws_ec2_cpu_utilization_average", + "legendFormat": "{{instance_id}}", + "refId": "A" + } + ] + }, + { + "title": "AWS RDS CPU Utilization Average", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "cloudwatch_aws_rds_cpu_utilization_average", + "legendFormat": "{{db_instance_identifier}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/conntrack/README.md b/inputs/conntrack/README.md index ec17df037..62ea27090 100644 --- a/inputs/conntrack/README.md +++ b/inputs/conntrack/README.md @@ -1,16 +1,20 @@ -# conntrack +# Conntrack Input Plugin -运维老鸟应该会遇到 conntrack table full 的报错吧,这个插件就是用于监控 conntrack 的情况, forked from 
[telegraf/conntrack](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/conntrack) +This plugin monitors the connection tracking (conntrack) table on Linux servers. It is forked from [telegraf/conntrack](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/conntrack). -## Measurements & Fields +System administrators often encounter the `nf_conntrack: table full, dropping packet` error. This plugin helps you monitor the usage of the conntrack table in real time so that you can act before the table fills up. -- conntrack - - ip_conntrack_count (int, count): the number of entries in the conntrack table - - ip_conntrack_max (int, size): the max capacity of the conntrack table +## Metrics -## 告警 +All metrics are recorded under the `conntrack` measurement: +- `conntrack_ip_conntrack_count`: The current number of entries in the conntrack table. +- `conntrack_ip_conntrack_max`: The maximum capacity of the conntrack table. + +## Alerting Recommendation + +You can configure an alerting rule in your monitoring system (like Prometheus or Nightingale) to trigger an alert when the conntrack table is close to being full (if your hosts report `conntrack_nf_conntrack_count` / `conntrack_nf_conntrack_max` instead, use that pair in the rule): + +```promql +conntrack_ip_conntrack_count / conntrack_ip_conntrack_max > 0.8 ``` -100 * conntrack_ip_conntrack_count / conntrack_ip_conntrack_max > 0.8 -100 * conntrack_nf_conntrack_count / conntrack_nf_conntrack_max > 0.8 -``` \ No newline at end of file diff --git a/inputs/conntrack/README_CN.md b/inputs/conntrack/README_CN.md new file mode 100644 index 000000000..8de201c57 --- /dev/null +++ b/inputs/conntrack/README_CN.md @@ -0,0 +1,20 @@ +# Conntrack 采集插件 + +该插件用于监控 Linux 服务器上的 connection tracking (conntrack) 表的状态。该插件 fork 自 [telegraf/conntrack](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/conntrack)。 + +运维人员经常会遇到 `nf_conntrack: table full, dropping packet` 的报错,这个插件可以帮助您实时监控 conntrack 表的使用情况。 + +## 采集指标 + +所有指标均归属于 `conntrack` 这个 measurement: + +- `conntrack_ip_conntrack_count`: 当前 conntrack 表中的连接条目数 (count)。 +- `conntrack_ip_conntrack_max`: 当前 conntrack 表的最大容量限制 (size)。 + +## 告警配置建议 + +您可以在夜莺或 Prometheus 中配置如下的告警规则,以便在 conntrack 表即将被填满时收到告警通知(如果主机上报的是 `conntrack_nf_conntrack_count` / `conntrack_nf_conntrack_max`,请在规则中替换为对应的指标名): + +```promql 
+conntrack_ip_conntrack_count / conntrack_ip_conntrack_max > 0.8 +``` \ No newline at end of file diff --git a/inputs/conntrack/dashboard.json b/inputs/conntrack/dashboard.json new file mode 100644 index 000000000..cf061ea21 --- /dev/null +++ b/inputs/conntrack/dashboard.json @@ -0,0 +1,52 @@ +{ + "title": "Conntrack", + "uid": "3fbb5bac", + "tags": [ + "conntrack" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Conntrack Usage (%)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "conntrack_ip_conntrack_count / conntrack_ip_conntrack_max * 100", + "legendFormat": "Usage %", + "refId": "A" + } + ] + }, + { + "title": "Conntrack Count", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "conntrack_ip_conntrack_count", + "legendFormat": "Count", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/consul/README_CN.md b/inputs/consul/README_CN.md new file mode 100644 index 000000000..453b9478e --- /dev/null +++ b/inputs/consul/README_CN.md @@ -0,0 +1,52 @@ +# Consul 采集插件 + +该插件用于收集 Consul 注册的所有健康检查的状态信息以及 Consul 自身的集群状态。插件通过 [Consul API](https://www.consul.io/docs/agent/http/health.html#health_state) 获取数据。 + +## 配置说明 + +```toml +# 采集 Consul 中注册的服务和健康检查状态 +[[instances]] + ## Consul Server 地址 + # address = "localhost:8500" + + ## Consul Server 的 URI 协议,支持 "http" 或 "https" + # scheme = "http" + + ## 发送请求时使用的 ACL token + # token = "" + + ## HTTP 基础认证 (Basic Authentication) 的用户名和密码 + # username = "" + # password = "" + + ## 指定查询的数据中心 (Datacenter) + # datacenter = "" + + ## 可选的 TLS 配置 + # tls_ca = "/etc/categraf/ca.pem" + # tls_cert = "/etc/categraf/cert.pem" + # tls_key = "/etc/categraf/key.pem" + ## 忽略自签证书的安全校验 + # insecure_skip_verify = true +``` + +## 采集指标 + +| 指标名 | 说明 | +| 
----------------------------- | ----------------------------------------------------------------------------------------------------- | +| `consul_up` | 上一次对 Consul 的查询是否成功 (1 为成功,0 为失败)。 | +| `consul_scrape_use_seconds` | 抓取耗时 (秒)。 | +| `consul_raft_peers` | Raft 集群中 Peer (Server) 的数量。 | +| `consul_raft_leader` | 根据当前节点的状态,Raft 集群是否有 Leader。 | +| `consul_serf_lan_members` | LAN 集群中的成员数量。 | +| `consul_serf_lan_member_status` | 集群成员状态。1=Alive, 2=Leaving, 3=Left, 4=Failed。 | +| `consul_serf_wan_member_status` | WAN 集群中的成员状态。1=Alive, 2=Leaving, 3=Left, 4=Failed。 | +| `consul_catalog_services` | 集群中的服务数量。 | +| `consul_service_tag` | 服务的标签 (Tags)。 | +| `consul_health_node_status` | 节点相关的健康检查状态。 | +| `consul_health_service_status` | 服务相关的健康检查状态。 | +| `consul_service_checks` | 链接 Service ID 和 Check Name (如果可用)。 | +| `consul_catalog_kv` | Consul KV (Key/Value) 存储中的数据 (只收集值为数值类型的 Key)。 | + +同时,还会暴露部分 Consul Agent 原生指标,具体详见 [Agent Metrics](https://developer.hashicorp.com/consul/api-docs/agent#view-metrics)。 diff --git a/inputs/consul/dashboard.json b/inputs/consul/dashboard.json new file mode 100644 index 000000000..71ee09455 --- /dev/null +++ b/inputs/consul/dashboard.json @@ -0,0 +1,106 @@ +{ + "title": "Consul", + "uid": "a3fd9a39", + "tags": [ + "consul" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Consul Up", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 4 + }, + "id": 1, + "targets": [ + { + "expr": "consul_up", + "legendFormat": "{{address}}", + "refId": "A" + } + ] + }, + { + "title": "Consul Catalog Services", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 4 + }, + "id": 2, + "targets": [ + { + "expr": "consul_catalog_services", + "legendFormat": "{{address}}", + "refId": "A" + } + ] + }, + { + "title": "Consul LAN Members", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": 
"consul_serf_lan_members", + "legendFormat": "{{address}}", + "refId": "A" + } + ] + }, + { + "title": "Consul Raft Peers", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "consul_raft_peers", + "legendFormat": "{{address}}", + "refId": "A" + } + ] + }, + { + "title": "Consul Health Service Status (Critical)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 16, + "w": 24, + "h": 8 + }, + "id": 5, + "targets": [ + { + "expr": "consul_health_service_status{status=\"critical\"}", + "legendFormat": "{{service_name}} on {{node}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/cpu/README.md b/inputs/cpu/README.md index db6f7b315..d6f967958 100644 --- a/inputs/cpu/README.md +++ b/inputs/cpu/README.md @@ -1,13 +1,28 @@ -# cpu +# CPU Input Plugin -CPU 采集插件很简单,自动采集本机 CPU 的使用率、空闲率等等,默认采集的是整机的,如果想采集单核的,就开启这个配置: +The CPU input plugin automatically collects various CPU metrics of the local machine, such as CPU usage, idle percentage, and system time. -```ini -collect_per_cpu = true +By default, the plugin only collects global (total) CPU metrics. If you want to collect metrics for each individual CPU core, you can enable it in the configuration. + +## Configuration + +```toml +[[instances]] +# Whether to collect metrics for each individual CPU core +collect_per_cpu = false ``` -其中 CPU 使用率的指标名字是 cpu_usage_active +When `collect_per_cpu = true` is enabled, the metrics will include a `cpu` tag (e.g., `cpu="cpu0"`, `cpu="cpu1"`) to distinguish between different cores. The global summary metrics typically use the `cpu="cpu-total"` tag. 
+ +## Metrics + +Common metrics include but are not limited to: +- `cpu_usage_active`: The active CPU time percentage (100 - idle) +- `cpu_usage_user`: CPU time spent in user space +- `cpu_usage_system`: CPU time spent in kernel space +- `cpu_usage_idle`: CPU idle time percentage +- `cpu_usage_iowait`: CPU time spent waiting for I/O operations -## 监控大盘 +## Dashboard -该插件没有单独的监控大盘,OS 的监控大盘统一放到 system 下面了 \ No newline at end of file +It is recommended to integrate OS-level metrics (CPU, Mem, Disk, etc.) into a unified System Dashboard. However, a dedicated CPU reference Dashboard is also provided here for independent viewing. diff --git a/inputs/cpu/README_CN.md b/inputs/cpu/README_CN.md new file mode 100644 index 000000000..777cd9059 --- /dev/null +++ b/inputs/cpu/README_CN.md @@ -0,0 +1,28 @@ +# CPU 采集插件 + +CPU 采集插件主要用于自动收集本机 CPU 的使用率、空闲率等各项指标。 + +默认情况下,插件只采集整机 (Global) 的汇总指标。如果需要采集单个 CPU 核心的独立指标,可以通过配置开启。 + +## 配置说明 + +```toml +[[instances]] +# 是否采集每个独立 CPU 核心的指标 +collect_per_cpu = false +``` + +开启 `collect_per_cpu = true` 后,各项指标会带有 `cpu` 标签(例如 `cpu="cpu0"`, `cpu="cpu1"`),以此来区分不同的核心;整机的汇总指标通常会带 `cpu="cpu-total"` 标签。 + +## 采集指标 + +常见指标包括但不限于: +- `cpu_usage_active`: CPU 活跃时间占比 (100 - idle) +- `cpu_usage_user`: 用户态消耗的 CPU 时间占比 +- `cpu_usage_system`: 内核态消耗的 CPU 时间占比 +- `cpu_usage_idle`: CPU 空闲时间占比 +- `cpu_usage_iowait`: CPU 等待 I/O 的时间占比 + +## 监控大盘 + +建议将 OS 级别的监控 (如 CPU、Mem、Disk 等) 整合到统一的 System Dashboard 中。但为了方便独立查看,这里也提供了一份专门针对 CPU 的参考 Dashboard。 \ No newline at end of file diff --git a/inputs/cpu/dashboard.json b/inputs/cpu/dashboard.json new file mode 100644 index 000000000..060fcf9cf --- /dev/null +++ b/inputs/cpu/dashboard.json @@ -0,0 +1,88 @@ +{ + "title": "CPU Metrics", + "uid": "54db23f5", + "tags": [ + "cpu metrics" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "CPU Active Usage (%)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": 
"cpu_usage_active{cpu=\"cpu-total\"}", + "legendFormat": "Total Active", + "refId": "A" + } + ] + }, + { + "title": "CPU IOWait (%)", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "cpu_usage_iowait{cpu=\"cpu-total\"}", + "legendFormat": "Total IOWait", + "refId": "A" + } + ] + }, + { + "title": "CPU System Time (%)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "cpu_usage_system{cpu=\"cpu-total\"}", + "legendFormat": "Total System", + "refId": "A" + } + ] + }, + { + "title": "CPU User Time (%)", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "cpu_usage_user{cpu=\"cpu-total\"}", + "legendFormat": "Total User", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/dcgm/README.md b/inputs/dcgm/README.md new file mode 100644 index 000000000..c534abffa --- /dev/null +++ b/inputs/dcgm/README.md @@ -0,0 +1,51 @@ +# DCGM Input Plugin + +This plugin collects hardware monitoring metrics for NVIDIA GPUs by integrating the core logic of the official NVIDIA `dcgm-exporter`. Using the Data Center GPU Manager (DCGM), the plugin gathers detailed hardware statistics including GPU temperature, power usage, frame buffer (memory) usage, GPU utilization, and XID errors. + +> Note: This plugin is only included when Categraf is compiled with the `dcgm` build tag (e.g., `go build -tags "dcgm"`). + +## Configuration + +```toml +[[instances]] + # Path to the DCGM collectors CSV configuration file, which defines the FieldIDs to monitor. 
+ # Example: "/etc/categraf/dcgm/default-counters.csv" + collectors = "/etc/categraf/dcgm/default-counters.csv" + + # Whether Categraf is running in a Kubernetes environment + kubernetes = false + # Type of GPU ID resolution in k8s (e.g., "uid") + kubernetes-gpu-id-type = "uid" + + # Device selection string, e.g., "f" (flex, default), "g" (all GPUs), "i" (GPU instances) + devices = "f" + + # Whether to use fake GPUs (useful for testing and development) + fake-gpus = false + + # Optional: Connect to a remote hostengine + # remote-hostengine-info = "localhost:5555" + + # You can declare the collector CSV file inline directly in the config + # [instances.collector_files] + # "/etc/categraf/dcgm/default-counters.csv" = """ + # DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C) + # DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W). + # """ +``` + +## Metrics + +All metrics will be tagged with identifiers such as `gpu`, `UUID`, `device`. Common metrics include: + +- `DCGM_FI_DEV_GPU_TEMP`: GPU temperature (in Celsius) +- `DCGM_FI_DEV_POWER_USAGE`: Real-time power draw (in Watts) +- `DCGM_FI_DEV_GPU_UTIL`: GPU compute utilization (%) +- `DCGM_FI_DEV_MEM_COPY_UTIL`: Frame buffer read/write utilization (%) +- `DCGM_FI_DEV_FB_USED`: Frame buffer memory used (MB) +- `DCGM_FI_DEV_FB_FREE`: Frame buffer memory free (MB) +- `DCGM_FI_DEV_XID_ERRORS`: Value (XID code) of the last XID error encountered by the GPU; this is not an error count + +## Dashboard + +A standard DCGM Dashboard is provided as a reference, covering essential monitoring panels like GPU Utilization, Power Usage, Frame Buffer Memory, and Temperature.
diff --git a/inputs/dcgm/README_CN.md b/inputs/dcgm/README_CN.md new file mode 100644 index 000000000..4a5aad969 --- /dev/null +++ b/inputs/dcgm/README_CN.md @@ -0,0 +1,51 @@ +# DCGM 采集插件 + +该插件用于采集 NVIDIA GPU 的核心监控指标,其底层集成了 NVIDIA 官方的 `dcgm-exporter` 逻辑。利用 Data Center GPU Manager (DCGM),插件能够收集包括 GPU 温度、功率、显存使用率、核心利用率以及 XID 错误等详细的硬件统计数据。 + +> 注意:此插件仅在编译时带上 `dcgm` build tag (例如: `go build -tags "dcgm"`) 时才会生效。 + +## 配置说明 + +```toml +[[instances]] + # 定义要抓取的 DCGM collectors 配置文件路径(用于定义哪些 FieldID 会被抓取) + # 例如:"/etc/categraf/dcgm/default-counters.csv" + collectors = "/etc/categraf/dcgm/default-counters.csv" + + # 是否在 Kubernetes 环境下运行 + kubernetes = false + # k8s gpu id 解析模式 (例如 "uid" 等) + kubernetes-gpu-id-type = "uid" + + # 设置要监控的 GPU 设备范围,例如 "f" (flex), "g" (所有 GPU), "i" (GPU 实例) + devices = "f" + + # 设置是否启用假数据 (常用于测试) + fake-gpus = false + + # 可选:连接到远端的 hostengine + # remote-hostengine-info = "localhost:5555" + + # 直接在配置文件中内联声明 collector 文件内容 + # [instances.collector_files] + # "/etc/categraf/dcgm/default-counters.csv" = """ + # DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C) + # DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W). 
+ # """ +``` + +## 采集指标 + +所有指标将附带如 `gpu`, `UUID`, `device` 等标签,常见的核心指标包括: + +- `DCGM_FI_DEV_GPU_TEMP`: GPU 当前温度 (摄氏度) +- `DCGM_FI_DEV_POWER_USAGE`: GPU 实时功耗 (瓦特) +- `DCGM_FI_DEV_GPU_UTIL`: GPU 核心计算利用率 (%) +- `DCGM_FI_DEV_MEM_COPY_UTIL`: 显存读写利用率 (%) +- `DCGM_FI_DEV_FB_USED`: 已使用的显存大小 (MB) +- `DCGM_FI_DEV_FB_FREE`: 剩余空闲的显存大小 (MB) +- `DCGM_FI_DEV_XID_ERRORS`: GPU 最近一次发生的 XID 错误码 (注意:该值是错误码,而非错误次数) + +## 监控大盘 + +本插件提供了一个标准的 DCGM Dashboard 参考,主要涵盖 GPU 利用率、功耗、显存使用和温度监控。 diff --git a/inputs/dcgm/dashboard.json b/inputs/dcgm/dashboard.json new file mode 100644 index 000000000..474c90364 --- /dev/null +++ b/inputs/dcgm/dashboard.json @@ -0,0 +1,106 @@ +{ + "title": "NVIDIA DCGM Metrics", + "uid": "8605fd74", + "tags": [ + "nvidia dcgm metrics" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "GPU Utilization (%)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "DCGM_FI_DEV_GPU_UTIL", + "legendFormat": "GPU {{gpu}}", + "refId": "A" + } + ] + }, + { + "title": "GPU Power Usage (W)", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "DCGM_FI_DEV_POWER_USAGE", + "legendFormat": "GPU {{gpu}}", + "refId": "A" + } + ] + }, + { + "title": "GPU Memory Used (MB)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "DCGM_FI_DEV_FB_USED", + "legendFormat": "GPU {{gpu}}", + "refId": "A" + } + ] + }, + { + "title": "GPU Temperature (C)", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "DCGM_FI_DEV_GPU_TEMP", + "legendFormat": "GPU {{gpu}}", + "refId": "A" + } + ] + }, + { + "title": "GPU XID Errors", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 16, + "w": 24, + "h": 8 + }, + "id": 5, + "targets": [ + { + "expr": "DCGM_FI_DEV_XID_ERRORS", + "legendFormat": "GPU
{{gpu}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/disk/README.md b/inputs/disk/README.md index 69c9a03e3..1fb2ac078 100644 --- a/inputs/disk/README.md +++ b/inputs/disk/README.md @@ -1,7 +1,32 @@ -# disk +# Disk Input Plugin -该插件采集磁盘利用率、inode利用率等,默认配置就是推荐配置,如果有发现不符合预期的情况再考虑调整。 +The Disk input plugin gathers metrics about disk usage across different partitions. +It collects information such as total disk space, used space, free space, disk usage percentage, and inode usage percentage. -## 监控大盘 +The default configuration is already the recommended setting for most environments and generally does not need to be modified. If you notice unexpected file systems being monitored (e.g., too many virtual file systems), you can adjust the filtering options like `ignore_fs`. -该插件没有单独的监控大盘,OS 的监控大盘统一放到 system 下面了 \ No newline at end of file +## Configuration + +```toml +[[instances]] + # List of filesystem types to ignore + # ignore_fs = [...] +``` + +## Metrics + +Common metrics include but are not limited to: +- `disk_total`: Total disk space on the partition (Bytes) +- `disk_used`: Used disk space on the partition (Bytes) +- `disk_free`: Free available disk space (Bytes) +- `disk_used_percent`: Percentage of used disk space (%) +- `disk_inodes_total`: Total number of inodes +- `disk_inodes_used`: Number of used inodes +- `disk_inodes_free`: Number of free inodes +- `disk_inodes_used_percent`: Percentage of used inodes (%) + +All metrics will include tags such as `device`, `fstype`, `mode`, and `path`. + +## Dashboard + +It is recommended to integrate OS-level metrics (CPU, Mem, Disk, etc.) into a unified System Dashboard. However, a dedicated Disk usage reference Dashboard is also provided here for independent viewing. 
diff --git a/inputs/disk/README_CN.md b/inputs/disk/README_CN.md new file mode 100644 index 000000000..26d4ed8e8 --- /dev/null +++ b/inputs/disk/README_CN.md @@ -0,0 +1,32 @@ +# Disk 采集插件 + +Disk 采集插件主要用于收集操作系统的磁盘分区使用情况。 +该插件能够采集包括磁盘总容量、已用容量、剩余容量、磁盘使用率以及 Inode 的使用率等信息。 + +默认配置已经是推荐的通用配置,一般情况下无需修改。如果您发现收集到的文件系统存在不符合预期的情况(例如收集了太多不必要的虚拟文件系统),可以调整配置中的过滤项(如 `ignore_fs`)。 + +## 配置说明 + +```toml +[[instances]] + # 需要忽略的文件系统类型列表 + # ignore_fs = [...] +``` + +## 采集指标 + +常见指标包括但不限于: +- `disk_total`: 磁盘分区总容量 (Bytes) +- `disk_used`: 磁盘分区已用容量 (Bytes) +- `disk_free`: 磁盘分区剩余可用容量 (Bytes) +- `disk_used_percent`: 磁盘容量使用率 (%) +- `disk_inodes_total`: Inode 总数 +- `disk_inodes_used`: 已使用的 Inode 数量 +- `disk_inodes_free`: 剩余的 Inode 数量 +- `disk_inodes_used_percent`: Inode 使用率 (%) + +所有指标都会带上 `device`, `fstype`, `mode`, `path` 等标签。 + +## 监控大盘 + +建议将 OS 级别的监控 (如 CPU、Mem、Disk 等) 整合到统一的 System Dashboard 中。但为了方便独立查看,这里也提供了一份专门针对 Disk 分区使用情况的参考 Dashboard。 \ No newline at end of file diff --git a/inputs/disk/dashboard.json b/inputs/disk/dashboard.json new file mode 100644 index 000000000..11ee1b874 --- /dev/null +++ b/inputs/disk/dashboard.json @@ -0,0 +1,88 @@ +{ + "title": "Disk Usage Metrics", + "uid": "79722b25", + "tags": [ + "disk usage metrics" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Disk Space Used (%)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "disk_used_percent", + "legendFormat": "{{device}} on {{path}}", + "refId": "A" + } + ] + }, + { + "title": "Disk Inodes Used (%)", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "disk_inodes_used_percent", + "legendFormat": "{{device}} on {{path}}", + "refId": "A" + } + ] + }, + { + "title": "Disk Space Used (Bytes)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id":
3, + "targets": [ + { + "expr": "disk_used", + "legendFormat": "{{device}} on {{path}}", + "refId": "A" + } + ] + }, + { + "title": "Disk Space Total (Bytes)", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "disk_total", + "legendFormat": "{{device}} on {{path}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/diskio/README.md b/inputs/diskio/README.md index dbbd4c77f..3ca0691f5 100644 --- a/inputs/diskio/README.md +++ b/inputs/diskio/README.md @@ -1,7 +1,21 @@ -# diskio +# DiskIO Input Plugin -采集硬盘IO的情况 +The DiskIO input plugin collects block device I/O read and write statistics. +By analyzing these metrics, you can identify disk I/O bottlenecks, measure I/O throughput, and monitor operation latency. -## 监控大盘 +## Metrics -该插件没有单独的监控大盘,OS 的监控大盘统一放到 system 下面了 \ No newline at end of file +Common metrics include but are not limited to: +- `diskio_read_bytes`: Total number of bytes read from the device +- `diskio_write_bytes`: Total number of bytes written to the device +- `diskio_reads`: Total number of completed read operations +- `diskio_writes`: Total number of completed write operations +- `diskio_read_time`: Total time spent in read operations (ms) +- `diskio_write_time`: Total time spent in write operations (ms) +- `diskio_io_time`: Total time spent doing I/O operations (ms) + +All metrics will include the `name` tag (e.g., `sda`, `vda`) to identify the block device. + +## Dashboard + +It is recommended to integrate OS-level metrics (CPU, Mem, Disk, DiskIO, etc.) into a unified System Dashboard. However, a dedicated DiskIO reference Dashboard is also provided here for independent viewing. 
diff --git a/inputs/diskio/README_CN.md b/inputs/diskio/README_CN.md new file mode 100644 index 000000000..7dcd81c99 --- /dev/null +++ b/inputs/diskio/README_CN.md @@ -0,0 +1,21 @@ +# DiskIO 采集插件 + +DiskIO 采集插件主要用于收集硬盘 (Block Devices) 的底层 I/O 读写情况。 +通过分析这些指标,可以了解系统的磁盘读写瓶颈、I/O 吞吐量以及 I/O 操作的延迟。 + +## 采集指标 + +常见指标包括但不限于: +- `diskio_read_bytes`: 从设备读取的总字节数 +- `diskio_write_bytes`: 写入设备的总字节数 +- `diskio_reads`: 成功完成的读取操作总次数 +- `diskio_writes`: 成功完成的写入操作总次数 +- `diskio_read_time`: 读取操作消耗的总时间 (毫秒) +- `diskio_write_time`: 写入操作消耗的总时间 (毫秒) +- `diskio_io_time`: I/O 请求消耗的总时间 (毫秒) + +所有指标都会带上 `name` (如 `sda`, `vda`) 等标签。 + +## 监控大盘 + +建议将 OS 级别的监控 (如 CPU、Mem、Disk、DiskIO 等) 整合到统一的 System Dashboard 中。但为了方便独立查看,这里也提供了一份专门针对 DiskIO 性能的参考 Dashboard。 \ No newline at end of file diff --git a/inputs/diskio/dashboard.json b/inputs/diskio/dashboard.json new file mode 100644 index 000000000..c51a370cf --- /dev/null +++ b/inputs/diskio/dashboard.json @@ -0,0 +1,88 @@ +{ + "title": "DiskIO Metrics", + "uid": "a3ada9b4", + "tags": [ + "diskio metrics" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "DiskIO Read Bytes/s", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "rate(diskio_read_bytes[5m])", + "legendFormat": "{{name}}", + "refId": "A" + } + ] + }, + { + "title": "DiskIO Write Bytes/s", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "rate(diskio_write_bytes[5m])", + "legendFormat": "{{name}}", + "refId": "A" + } + ] + }, + { + "title": "DiskIO Read Time (ms)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "rate(diskio_read_time[5m])", + "legendFormat": "{{name}}", + "refId": "A" + } + ] + }, + { + "title": "DiskIO Write Time (ms)", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, 
+ "targets": [ + { + "expr": "rate(diskio_write_time[5m])", + "legendFormat": "{{name}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/dns_query/README.md b/inputs/dns_query/README.md index d493d393b..403214967 100644 --- a/inputs/dns_query/README.md +++ b/inputs/dns_query/README.md @@ -1,166 +1,48 @@ -# 应用场景 -一般用于对DNS服务器的响应监测,帮助运维快速定位网络问题。 +# DNS Query Input Plugin -# 部署场景 -不需要每台虚拟机都启用此插件,建议是独立或复合的某一台虚拟机启用此插件。 +The DNS Query input plugin is used to continuously monitor the response quality of DNS servers. It helps operators quickly locate network latency and resolution errors caused by DNS queries. -# 配置场景 -``` -本配置启用或数据定义如下功能: -使用本机DNS查询域名解析质量。 -使用外部DNS查询域名解析质量。 -使用不同记录类型进行DNS查询。 -每种查询都设置超时时间5秒。 -增加自定义标签,可通过自定义标签筛选数据及更加精确的告警推送。 -在domains字段处增加自己想要被DNS查询的域名,一般填写公司业务系统的域名或第三方依赖的业务系统。 -``` - -# 修改dns_query.toml文件配置 +**Deployment Recommendation:** +It is not necessary to enable this plugin on every machine. We recommend enabling it on core gateway nodes, specific network probe VMs, or central monitoring nodes to regularly query critical domain names. -``` 以下文件内容配置作为参考 -[root@aliyun input.dns_query]# cat dns_query.toml -# # collect interval -# interval = 15 +## Configuration +```toml [[instances]] -# # append some labels for series -labels = { cloud="huaweicloud", region="huabei-beijing-4",azone="az1", product="n9e" } - -# # interval = global.interval * interval_times -# interval_times = 1 - -# # -auto_detect_local_dns_server = true - -### A record - -## servers to query -servers = ["223.5.5.5","114.114.114.114","119.29.29.29"] - -## Network is the network protocol name. -# network = "udp" - -## Domains or subdomains to query. -domains = ["www.huaweicloud.com", "www.baidu.com", "www.tapd.cn"] - -## Query record type. -## Possible values: A, AAAA, CNAME, MX, NS, PTR, TXT, SOA, SPF, SRV. -record_type = "A" - -## Dns server port. 
-# port = 53 - -## Query timeout in seconds. -timeout = 5 - - -### CNAME record - -[[instances]] -# # append some labels for series -labels = { cloud="huaweicloud", region="huabei-beijing-4",azone="az1", product="n9e" } - -# # interval = global.interval * interval_times -# interval_times = 1 - -# # -auto_detect_local_dns_server = false - -## servers to query -servers = ["223.5.5.5","114.114.114.114","119.29.29.29"] - -## Network is the network protocol name. -# network = "udp" - -## Domains or subdomains to query. -domains = ["www.huaweicloud.com", "www.baidu.com", "www.tapd.cn"] + # Automatically use the DNS servers from the local machine's /etc/resolv.conf + auto_detect_local_dns_server = true -## Query record type. -## Possible values: A, AAAA, CNAME, MX, NS, PTR, TXT, SOA, SPF, SRV. -record_type = "CNAME" + ## Manually specify external DNS servers to query + servers = ["223.5.5.5", "114.114.114.114", "119.29.29.29"] -## Dns server port. -# port = 53 + ## Network protocol to use, such as "udp" or "tcp" + # network = "udp" -## Query timeout in seconds. -timeout = 5 + ## List of domains or subdomains to query + domains = ["www.huaweicloud.com", "www.baidu.com", "api.yourcompany.com"] + ## Query record type (A, AAAA, CNAME, MX, NS, PTR, TXT, SOA, SPF, SRV) + record_type = "A" -### NS record + ## DNS server port + # port = 53 -[[instances]] -# # append some labels for series -labels = { cloud="huaweicloud", region="huabei-beijing-4",azone="az1", product="n9e" } - -# # interval = global.interval * interval_times -# interval_times = 1 - -# # -auto_detect_local_dns_server = false - -## servers to query -servers = ["223.5.5.5","114.114.114.114","119.29.29.29"] - -## Network is the network protocol name. -# network = "udp" - -## Domains or subdomains to query. -domains = ["www.huaweicloud.com", "www.baidu.com", "www.tapd.cn"] - -## Query record type. -## Possible values: A, AAAA, CNAME, MX, NS, PTR, TXT, SOA, SPF, SRV. -record_type = "NS" - -## Dns server port. 
-# port = 53 - -## Query timeout in seconds. -timeout = 5 -``` - -# 测试配置 + ## Query timeout in seconds + timeout = 5 ``` -./categraf --test --inputs dns_query -....... A记录同理就省略 -20:51:34 dns_query_rcode_value agent_hostname=aliyun.tjf.n9e.001 azone=az1 cloud=huaweicloud domain=www.tapd.cn product=n9e record_type=CNAME region=huabei-beijing-4 server=119.29.29.29 0 -20:51:34 dns_query_result_code agent_hostname=aliyun.tjf.n9e.001 azone=az1 cloud=huaweicloud domain=www.tapd.cn product=n9e record_type=CNAME region=huabei-beijing-4 server=119.29.29.29 0 -20:51:34 dns_query_query_time_ms agent_hostname=aliyun.tjf.n9e.001 azone=az1 cloud=huaweicloud domain=www.tapd.cn product=n9e record_type=CNAME region=huabei-beijing-4 server=119.29.29.29 33.500371 - -20:51:34 dns_query_rcode_value agent_hostname=aliyun.tjf.n9e.001 azone=az1 cloud=huaweicloud domain=www.baidu.com product=n9e record_type=CNAME region=huabei-beijing-4 server=119.29.29.29 0 -20:51:34 dns_query_result_code agent_hostname=aliyun.tjf.n9e.001 azone=az1 cloud=huaweicloud domain=www.baidu.com product=n9e record_type=CNAME region=huabei-beijing-4 server=119.29.29.29 0 -20:51:34 dns_query_query_time_ms agent_hostname=aliyun.tjf.n9e.001 azone=az1 cloud=huaweicloud domain=www.baidu.com product=n9e record_type=CNAME region=huabei-beijing-4 server=119.29.29.29 34.328242 - -20:51:34 dns_query_rcode_value agent_hostname=aliyun.tjf.n9e.001 azone=az1 cloud=huaweicloud domain=www.huaweicloud.com product=n9e record_type=CNAME region=huabei-beijing-4 server=119.29.29.29 0 -20:51:34 dns_query_result_code agent_hostname=aliyun.tjf.n9e.001 azone=az1 cloud=huaweicloud domain=www.huaweicloud.com product=n9e record_type=CNAME region=huabei-beijing-4 server=119.29.29.29 0 -20:51:34 dns_query_query_time_ms agent_hostname=aliyun.tjf.n9e.001 azone=az1 cloud=huaweicloud domain=www.huaweicloud.com product=n9e record_type=CNAME region=huabei-beijing-4 server=119.29.29.29 -..... 
-``` -# 重启服务 -``` -重启categraf服务生效 -systemctl daemon-reload && systemctl restart categraf && systemctl status categraf +If you need to query different record types (e.g., `A` records and `CNAME` records), you can configure multiple `[[instances]]` blocks. -查看启动日志是否有错误 -journalctl -f -n 500 -u categraf | grep "E\!" | grep "W\!" -``` +## Metrics -# 检查数据呈现 -等待1-2分钟后数据就会在图表中展示出来,如图: -![image](https://user-images.githubusercontent.com/12181410/220353480-e17a7822-7ccc-4fdf-b18b-a0be84cd5550.png) +- `dns_query_query_time_ms`: The latency/response time of the DNS resolution in milliseconds. +- `dns_query_result_code`: The result code of the probe execution (0 means success, non-zero indicates an exception like timeout or connection failure). +- `dns_query_rcode_value`: The numeric DNS protocol response code (RCODE), e.g., 0 = NOERROR, 2 = SERVFAIL, 3 = NXDOMAIN. -# 监控告警规则配置 -``` -个人经验仅供参考,一般DNS解析延迟时间: -超过2000毫秒,为P2级别,启用企业微信应用推送告警,3分钟内恢复发出恢复告警。 -超过5000毫秒,为P1级别,启用电话语音告警&企业微信应用告警,3分钟内恢复发出恢复告警。 +All metrics include tags such as `server`, `domain`, and `record_type`, allowing for granular analysis per DNS server or domain. -为什么会这么考量设计? -在用到DNS监控时,一般公司业务是遍布全国的,然而全国各个地区在解析DNS存在各种场景因素导致的DNS问题(如DNS被劫持、片区DNS服务器故障等),所以需要以高级别对待。 -从收到告警到恢复告警设置3分钟的意图是防止期间是短暂时间有问题,同时也给SLA(99.99%)给足处理时长。 -``` +## Alerting Recommendations -# 监控图表配置 -``` -先略过 -``` +- **P2 Alert**: Trigger when `dns_query_query_time_ms > 2000` ms. +- **P1 Alert**: Trigger when `dns_query_query_time_ms > 5000` ms. +- **Critical Alert**: Trigger when `dns_query_result_code != 0`, indicating DNS resolution failure.
diff --git a/inputs/dns_query/README_CN.md b/inputs/dns_query/README_CN.md new file mode 100644 index 000000000..c0fb9edff --- /dev/null +++ b/inputs/dns_query/README_CN.md @@ -0,0 +1,48 @@ +# DNS Query 采集插件 + +DNS Query 采集插件用于对 DNS 服务器的响应质量进行持续监测,帮助运维人员快速定位域名解析带来的网络延迟和解析错误问题。 + +**部署建议:** +不需要在所有机器上启用此插件,建议在核心网关节点、特定的网络探针虚拟机或复合监控节点上启用,定期拨测关键依赖的域名即可。 + +## 配置说明 + +```toml +[[instances]] + # 是否自动使用本机的 /etc/resolv.conf 中的 DNS 服务器进行查询 + auto_detect_local_dns_server = true + + ## 手动指定要查询的外部 DNS 服务器 (当上一项为 false 时生效) + servers = ["223.5.5.5", "114.114.114.114", "119.29.29.29"] + + ## 指定查询协议,如 "udp" 或 "tcp" + # network = "udp" + + ## 需要重点监测的域名列表 + domains = ["www.huaweicloud.com", "www.baidu.com", "api.yourcompany.com"] + + ## 查询记录的类型 (A, AAAA, CNAME, MX, NS, PTR, TXT, SOA, SPF, SRV) + record_type = "A" + + ## DNS 服务端口 + # port = 53 + + ## DNS 查询的超时时间 (秒) + timeout = 5 +``` + +如果需要拨测不同类型的记录(如 `A` 记录和 `CNAME` 记录),可以配置多个 `[[instances]]` 块。 + +## 采集指标 + +- `dns_query_query_time_ms`: DNS 解析延迟时间 (毫秒) +- `dns_query_result_code`: 探测过程的结果码 (0 为成功,非 0 为异常,如超时、无法连接等) +- `dns_query_rcode_value`: DNS 协议标准响应码 (RCODE) 的数值 (如 0 = NOERROR, 2 = SERVFAIL, 3 = NXDOMAIN) + +所有指标都会带上 `server`, `domain`, `record_type` 等标签,方便按照特定 DNS 服务器或域名进行聚合分析。 + +## 告警建议 + +- 当 `dns_query_query_time_ms > 2000` 毫秒时,可以作为 P2 级别告警。 +- 当 `dns_query_query_time_ms > 5000` 毫秒时,可以作为 P1 级别告警。 +- 当 `dns_query_result_code != 0` 时,说明 DNS 解析失败,需立即介入。 diff --git a/inputs/dns_query/dashboard.json b/inputs/dns_query/dashboard.json new file mode 100644 index 000000000..1d65dc501 --- /dev/null +++ b/inputs/dns_query/dashboard.json @@ -0,0 +1,52 @@ +{ + "title": "DNS Query Quality", + "uid": "82f9567e", + "tags": [ + "dns query quality" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "DNS Query Time (ms)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "dns_query_query_time_ms", + "legendFormat": "{{domain}} on {{server}}
({{record_type}})", + "refId": "A" + } + ] + }, + { + "title": "DNS Result Code", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "dns_query_result_code", + "legendFormat": "{{domain}} on {{server}} ({{record_type}})", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/docker/README.md b/inputs/docker/README.md index 1609b2900..f8094c343 100644 --- a/inputs/docker/README.md +++ b/inputs/docker/README.md @@ -1,42 +1,68 @@ -# docker +# Docker Input Plugin -forked from [telegraf/inputs.docker](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/docker) +The Docker input plugin collects performance metrics (CPU, Memory, Network, Block I/O, state, etc.) from locally running Docker containers. This plugin is forked from `telegraf/inputs.docker`. +## Differences from Telegraf -## change +1. The `container_id` is exposed as a Tag (Label) instead of a Field to enable granular querying and aggregation. +2. Several less commonly used metrics have been removed to reduce storage pressure on the time-series database. -1. Using `container_id` as label not field -1. 
Some metrics have been deleted +## Configuration -## 容器ID标签 +```toml +[[instances]] + # The API Endpoint for the Docker Daemon + # Supports unix:// or tcp:// protocols + endpoint = "unix:///var/run/docker.sock" -通过下面两个配置来控制 container_id 这个标签: + # Timeout for metrics gathering + timeout = "5s" -```ini -container_id_label_enable = true -container_id_label_short_style = false + # Whether to include the container_id as a tag + container_id_label_enable = true + + # Whether to truncate the container_id to 12 characters + container_id_label_short_style = false ``` -默认 container_id_label_enable 设置为 true,表示启用,即会把容器ID放到标签里,container_id_label_short_style 是短格式,容器ID很长,如果把 short_style 设置为 true,就会只截取前面12位 +### Disabling the Plugin -## 权限问题 +If you wish to disable this plugin, you can do so using either of the following methods: +- **Method 1**: Rename the `conf/input.docker` directory so that it no longer starts with `input.`. +- **Method 2**: Leave the `endpoint` configuration field empty. -Categraf 最好是用 root 账号来运行,否则,请求 docker.sock 可能会遇到权限问题,需要把 Categraf 的运行账号,加到 docker group 中,假设 Categraf 使用 categraf 账号运行: +## FAQ -``` +### 1. Permission Issues + +Categraf requires permission to read the docker socket (`unix:///var/run/docker.sock`). It is recommended to run Categraf as `root`. +If you prefer to run Categraf as a non-root user (e.g., `categraf`), you must add that user to the `docker` group: + +```bash sudo usermod -aG docker categraf ``` -## 运行在容器里 +### 2. Running Categraf Inside a Container + +If Categraf itself is running inside a Docker container, you must mount the host's docker socket into the Categraf container so it can access the Docker Daemon API. -如果 Categraf 运行在容器中,docker 的 unix socket 就需要挂到 Categraf 的容器里,比如通过 `-v /var/run/docker.sock:/var/run/docker.sock` 这样的参数来启动 Categraf 的容器。如果是在 compose 环境下,也可以在 docker compose 配置中加上 volume 的配置: +**Via Docker CLI:** +```bash +docker run -v /var/run/docker.sock:/var/run/docker.sock ... 
+``` +**Via Docker Compose:** ```yaml volumes: - /var/run/docker.sock:/var/run/docker.sock ``` -## 停用该插件 +## Metrics -- 方法一:把 `input.docker` 目录改个别的名字,不用 `input.` 打头 -- 方法二:docker.toml 中的 endpoint 配置留空 \ No newline at end of file +The plugin collects comprehensive container resource usage. Key metrics include: +- `docker_container_cpu_usage_percent`: Container CPU usage percentage +- `docker_container_mem_usage_percent`: Container Memory usage percentage +- `docker_container_mem_limit`: Container Memory limit (Bytes) +- `docker_container_net_rx_bytes`: Container network received bytes +- `docker_container_net_tx_bytes`: Container network transmitted bytes +- `docker_container_status`: The running status of the container diff --git a/inputs/docker/README_CN.md b/inputs/docker/README_CN.md new file mode 100644 index 000000000..c0fac76b3 --- /dev/null +++ b/inputs/docker/README_CN.md @@ -0,0 +1,69 @@ +# Docker 采集插件 + +Docker 采集插件用于收集本地运行的 Docker 容器的状态、CPU、内存、网络及块设备 I/O 等性能指标。该插件基于 `telegraf/inputs.docker` 进行改造 (fork)。 + +## 差异说明 + +与 Telegraf 官方插件的主要差异: +1. 使用了 `container_id` 作为指标的 Tag (Label),而不是 Field,以方便更细粒度的聚合查询。 +2. 精简了部分不常用的指标以降低时序数据库的存储压力。 + +## 配置说明 + +```toml +[[instances]] + # Docker Daemon 的 API Endpoint + # 支持 unix:// 或 tcp:// 协议 + endpoint = "unix:///var/run/docker.sock" + + # 采集超时时间 + timeout = "5s" + + # 控制是否启用 container_id 作为指标的标签 + container_id_label_enable = true + + # 是否截断 container_id (如果设为 true,则只取前 12 位) + container_id_label_short_style = false +``` + +### 停用插件 + +如果你想停用该插件,有以下两种推荐方式: +- **方法一**:将 `conf/input.docker` 目录重命名(去掉 `input.` 前缀)。 +- **方法二**:将配置中的 `endpoint` 字段留空。 + +## 常见问题解答 (FAQ) + +### 1. 权限问题 + +Categraf 在尝试连接 `unix:///var/run/docker.sock` 时通常需要特权。建议使用 `root` 用户运行 Categraf。 +如果您希望使用普通用户 (如 `categraf`) 运行,需要将该用户加入 `docker` 用户组: + +```bash +sudo usermod -aG docker categraf +``` + +### 2. 
在容器内部运行 Categraf + +如果 Categraf 本身也是作为容器运行的,为了使其能够采集宿主机上的 Docker 信息,您必须将宿主机的 docker socket 挂载进容器: + +**使用 Docker CLI:** +```bash +docker run -v /var/run/docker.sock:/var/run/docker.sock ... +``` + +**使用 Docker Compose:** +```yaml +volumes: + - /var/run/docker.sock:/var/run/docker.sock +``` + +## 采集指标 + +该插件支持对容器的资源使用情况进行全方位监控。部分核心指标如下: +- `docker_container_cpu_usage_percent`: 容器 CPU 使用率 (%) +- `docker_container_mem_usage_percent`: 容器内存使用率 (%) +- `docker_container_mem_limit`: 容器内存限制配额 (Bytes) +- `docker_container_net_rx_bytes`: 容器网络接收字节数 (Bytes) +- `docker_container_net_tx_bytes`: 容器网络发送字节数 (Bytes) +- `docker_container_status`: 容器运行状态 (通常以枚举值或 boolean 形式体现) \ No newline at end of file diff --git a/inputs/docker/dashboard.json b/inputs/docker/dashboard.json new file mode 100644 index 000000000..7af01f5c5 --- /dev/null +++ b/inputs/docker/dashboard.json @@ -0,0 +1,88 @@ +{ + "title": "Docker Containers", + "uid": "8fe1ab62", + "tags": [ + "docker containers" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Docker CPU Usage (%)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "docker_container_cpu_usage_percent", + "legendFormat": "{{container_name}}", + "refId": "A" + } + ] + }, + { + "title": "Docker Memory Usage (%)", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "docker_container_mem_usage_percent", + "legendFormat": "{{container_name}}", + "refId": "A" + } + ] + }, + { + "title": "Docker Net RX (Bytes)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "rate(docker_container_net_rx_bytes[5m])", + "legendFormat": "{{container_name}}", + "refId": "A" + } + ] + }, + { + "title": "Docker Net TX (Bytes)", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + 
"targets": [ + { + "expr": "rate(docker_container_net_tx_bytes[5m])", + "legendFormat": "{{container_name}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/elasticsearch/README.md b/inputs/elasticsearch/README.md index fa1f9d28f..3e53bb86b 100644 --- a/inputs/elasticsearch/README.md +++ b/inputs/elasticsearch/README.md @@ -27,7 +27,7 @@ ES 7.x 支持基于角色的访问控制(RBACs)。`elasticsearch` 插件需 ### Metrics -#### `cluster_health = true` 和 `cluster_health_level = "cluster"` +#### `cluster_health = true` 和 `cluster_health_level = "cluster"` | 名称 | 类型 | 描述 | |-----------------------------------------------------------------|------------|--------------------------| @@ -483,20 +483,3 @@ ES 7.x 支持基于角色的访问控制(RBACs)。`elasticsearch` 插件需 | elasticsearch_slm_stats_snapshots_deleted_total | counter | 按策略删除的快照数 | | elasticsearch_slm_stats_snapshot_deletion_failures_total | counter | 按策略快照删除失败次数 | | elasticsearch_slm_stats_operation_mode | gauge | SLM操作模式(运行中,停止中,已停止) | - - -#### `num_most_recent_indices = 0` - - -| 设置日期类动态索引可取前"num_most_recent_indices"个最新index的指标数据 -| 可极大减少历史动态索引导致的大指标量级 -| 可与“indices_include”配置一起使用 - -#### `dynamic_index_matcher_regexp` = ["(?P(?:\\d{4}|\\d{2})[.-]?(?:\\d{2})[.-]?(?:\\d{2})?[.-]?(?:\\d{2})?)$","[\\.-._]\\d+(\\.\\d+){0,2}$"] -| 与num_most_recent_indices 配合使用,用于指定动态索引的匹配逻辑,默认值: -| ["(?P(?:\\d{4}|\\d{2})[.-]?(?:\\d{2})[.-]?(?:\\d{2})?[.-]?(?:\\d{2})?)$","[\\.-._]\\d+(\\.\\d+){0,2}$"] -| 支持匹配 //YYYY.MM.DD 或 YYYY-MM-DD 或 YYYYMMDD 或 YYYY-MM-DD-HH -| //YYYY.MM 或 YYYY-MM 或 YYYYMM 或YYYYMMDDHH -| //YY.MM.DD 或 YY-MM-DD 或 YYMMDD 或YYYY.MM.DD.HH -| //v1_001 v1_002 -->v1* v0.1 v0.2 -->v0* v5.2.3 v5.2.4 -->v5* -| 也可自行扩展 \ No newline at end of file diff --git a/inputs/exec/README.md b/inputs/exec/README.md index d5c4f0c30..389d840be 100644 --- a/inputs/exec/README.md +++ b/inputs/exec/README.md @@ -1,141 +1,62 @@ -# 应用场景 -``` -应用于input插件库exec目录之外的特殊或自定义实现指定业务的监控。 
-监控脚本采集到监控数据之后通过相应的格式输出到stdout,categraf截获stdout内容,解析之后传给服务端, -脚本的输出格式支持3种:influx、falcon、prometheus,通过 exec.toml 的 `data_format` 配置告诉 Categraf。 -data_format有3个值,其用法为: -``` +# Exec Input Plugin -## influx +The Exec plugin runs user-defined monitoring scripts or programs, captures the data output to standard output (stdout), parses it, and sends it to the server. +This is one of Categraf's most flexible plugins, making it ideal for highly customized business monitoring scenarios that are not covered by the official plugin library. -influx 格式的内容规范: -``` +## Output Formats + +The executed script must print the monitoring data to stdout in one of the following 3 supported formats (configured via `data_format`): + +### 1. influx +```text mesurement,labelkey1=labelval1,labelkey2=labelval2 field1=1.2,field2=2.3 ``` -- 首先mesurement,表示一个类别的监控指标,比如 connections; -- mesurement后面是逗号,逗号后面是标签,如果没有标签,则mesurement后面不需要逗号 -- 标签是k=v的格式,多个标签用逗号分隔,比如region=beijing,env=test -- 标签后面是空格 -- 空格后面是属性字段,多个属性字段用逗号分隔 -- 属性字段是字段名=值的格式,在categraf里值只能是数字 -最终,mesurement和各个属性字段名称拼接成metric名字 +- Measurement and tags are separated by a comma. +- Tags are separated by commas. +- A **space** separates the tags section and the fields section. +- The final metric name is usually a combination of the measurement and the field name. -## falcon -Open-Falcon的格式如下,举例: +### 2. prometheus +Directly output the standard Prometheus Exposition format: +```text +# HELP demo_http_requests_total Total number of http api requests +# TYPE demo_http_requests_total counter +demo_http_requests_total{api="add_product"} 4633433 +``` +Lines starting with `#` are ignored by Categraf. +### 3. 
falcon +Open-Falcon JSON format: ```json [ { "endpoint": "test-endpoint", "metric": "test-metric", - "timestamp": 1658490609, - "step": 60, "value": 1, - "counterType": "GAUGE", - "tags": "idc=lg,loc=beijing", - }, - { - "endpoint": "test-endpoint", - "metric": "test-metric2", - "timestamp": 1658490609, - "step": 60, - "value": 2, - "counterType": "GAUGE", - "tags": "idc=lg,loc=beijing", + "tags": "idc=lg,loc=beijing" } ] ``` -timestamp、step、counterType,这三个字段在categraf处理的时候会直接忽略掉,endpoint会放到labels里上报。 - -## prometheus -prometheus 格式大家不陌生了,比如我这里准备一个监控脚本,输出 prometheus 的格式数据: -```shell -#!/bin/sh - -echo '# HELP demo_http_requests_total Total number of http api requests' -echo '# TYPE demo_http_requests_total counter' -echo 'demo_http_requests_total{api="add_product"} 4633433' -``` -其中 `#` 注释的部分,其实会被 categraf 忽略,不要也罢,prometheus 协议的数据具体的格式,请大家参考 prometheus 官方文档 - +Fields like `timestamp`, `step`, and `counterType` are ignored. Categraf will assign the timestamp upon scraping. -# 部署场景 -一般在复合型用途或独立的虚拟机启用此插件。 - -# 前置条件 -``` -1.需使用人解读每个脚本或程序的逻辑,其脚本或程序顶部有大概作用的描述。 -``` - -# 配置场景 -本配置启用或数据定义如下功能: -增加自定义标签,可通过自定义标签筛选数据及更加精确的告警推送。 -响应超时时间为5秒。 -commands字段正确应用脚本所在位置。 - -# 修改exec.toml文件配置 -``` -[root@aliyun input.exec]# vi exec.toml - -# # collect interval -# interval = 15 +## Configuration +```toml [[instances]] -# # commands, support glob +# # Commands or script paths to execute. Shell globs are supported. commands = [ - "/opt/categraf/scripts/*/collect_*.sh" - #"/opt/categraf/scripts/*/collect_*.py" - #"/opt/categraf/scripts/*/collect_*.go" - #"/opt/categraf/scripts/*/collect_*.lua" - #"/opt/categraf/scripts/*/collect_*.java" - #"/opt/categraf/scripts/*/collect_*.bat" - #"/opt/categraf/scripts/*/collect_*.cmd" - #"/opt/categraf/scripts/*/collect_*.ps1" + "/opt/categraf/scripts/*/collect_*.sh", + "/opt/categraf/scripts/*/collect_*.py" ] -# # timeout for each command to complete +# # Timeout for script execution to prevent zombie processes. 
# timeout = 5 -# # interval = global.interval * interval_times -# interval_times = 1 - -# # mesurement,labelkey1=labelval1,labelkey2=labelval2 field1=1.2,field2=2.3 +# # Format to parse the stdout data. Options: influx, prometheus, falcon data_format = "influx" ``` -# 测试配置 -``` -以cert/collect_cert_expiretime.sh为例: -sh /opt/categraf/cert/collect_cert_expiretime.sh 出现: -cert,cloud=huaweicloud,region=huabei-beijing-4,azone=az1,product=cert,domain_name=www.baidu.com expire_days=163 -cert,cloud=huaweicloud,region=huabei-beijing-4,azone=az1,product=cert,domain_name=www.weibo.com expire_days=85 -cert,cloud=huaweicloud,region=huabei-beijing-4,azone=az1,product=cert,domain_name=www.csdn.net expire_days=281 -``` - -# 重启服务 -``` -重启categraf服务生效 -systemctl daemon-reload && systemctl restart categraf && systemctl status categraf - -查看启动日志是否有错误 -journalctl -f -n 500 -u categraf | grep "E\!" | grep "W\!" -``` - -# 检查数据呈现 -如图: -![image](https://user-images.githubusercontent.com/12181410/220940504-04c47faa-790a-42c1-b3dd-1510ae55c217.png) +## Metrics and Dashboards -# 告警规则 -``` -脚本作用不同,规则就不同,先略过。 -``` - -# 监控图表 -``` -脚本作用不同,规则就不同,先略过。 -``` - -# 故障自愈 -``` -脚本作用不同,规则就不同,先略过。 -``` +Since the Exec plugin collects whatever metrics the user's scripts generate, there is no fixed list of metrics and no unified dashboard. +You should create your own dashboards and alert rules in Nightingale or Grafana based on the metric names output by your specific scripts. diff --git a/inputs/exec/README_CN.md b/inputs/exec/README_CN.md new file mode 100644 index 000000000..94cd167f7 --- /dev/null +++ b/inputs/exec/README_CN.md @@ -0,0 +1,62 @@ +# Exec 采集插件 + +Exec 插件主要用于执行用户自定义的监控脚本或程序,并将脚本输出到标准输出 (stdout) 的数据截获下来,解析后上报给服务端。 +这是 Categraf 最灵活的插件之一,适用于 Categraf 官方插件库之外的特殊或高度定制化的业务监控场景。 + +## 脚本输出格式 + +被执行的脚本必须将监控数据输出到标准输出,支持以下 3 种格式 (通过 `data_format` 参数配置): + +### 1. 
influx +```text +measurement,labelkey1=labelval1,labelkey2=labelval2 field1=1.2,field2=2.3 +``` +- 指标名 (measurement) 和标签 (Tags) 之间用逗号分隔 +- 标签之间用逗号分隔 +- 标签和属性字段 (Fields) 之间用**空格**分隔 +- 最终的指标名会根据 `measurement` 和 `field` 组合生成 + +### 2. prometheus +直接输出 Prometheus 的标准 Exposition 格式: +```text +# HELP demo_http_requests_total Total number of http api requests +# TYPE demo_http_requests_total counter +demo_http_requests_total{api="add_product"} 4633433 +``` +以 `#` 开头的行会被 Categraf 忽略。 + +### 3. falcon +Open-Falcon JSON 格式: +```json +[ + { + "endpoint": "test-endpoint", + "metric": "test-metric", + "value": 1, + "tags": "idc=lg,loc=beijing" + } +] +``` +`timestamp`, `step`, `counterType` 等字段会被忽略,Categraf 自身会重新打上时间戳并按照全局规则上报。 + +## 配置说明 + +```toml +[[instances]] +# # 要执行的命令或脚本路径,支持 shell 的 glob 通配符 +commands = [ + "/opt/categraf/scripts/*/collect_*.sh", + "/opt/categraf/scripts/*/collect_*.py" +] + +# # 脚本执行的超时时间,用于防止僵尸进程 +# timeout = 5 + +# # 解析脚本输出的格式,可选值: influx, prometheus, falcon +data_format = "influx" +``` + +## 采集指标与大盘 + +由于 Exec 插件收集的指标完全由用户脚本决定,因此没有固定的采集指标列表和统一的监控大盘。 +您可以根据自己脚本输出的 metric name,在夜莺 (Nightingale) 或 Grafana 中自行绘制 Dashboard 和配置告警规则。 diff --git a/inputs/exec/dashboard.json b/inputs/exec/dashboard.json new file mode 100644 index 000000000..a94b8a8ce --- /dev/null +++ b/inputs/exec/dashboard.json @@ -0,0 +1,28 @@ +{ + "title": "Exec Scrape Information", + "uid": "1023f3d1", + "tags": [ + "exec scrape information" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Exec Plugin Information", + "type": "text", + "gridPos": { + "x": 0, + "y": 0, + "w": 24, + "h": 8 + }, + "id": 1, + "targets": [] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/filecount/README.md b/inputs/filecount/README.md index 9bae8e8ae..2e75fb045 100644 --- a/inputs/filecount/README.md +++ b/inputs/filecount/README.md @@ -1,6 +1,6 @@ # Filecount Input Plugin -forked from 
[telegraf/inputs.filecount](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/filecount) +forked from telegraf/inputs.filecount Reports the number and total size of files in specified directories. diff --git a/inputs/filecount/README_CN.md b/inputs/filecount/README_CN.md new file mode 100644 index 000000000..358995372 --- /dev/null +++ b/inputs/filecount/README_CN.md @@ -0,0 +1,48 @@ +# Filecount 采集插件 + +该插件用于统计指定目录下符合条件的文件的数量和总大小。该插件 fork 自 `telegraf/inputs.filecount`。 + +## 配置说明 + +```toml +[[instances]] +# # 待统计的目标目录 +# 支持标准 glob 匹配规则,并增加 ** 作为超级通配符: +# /var/log/** -> 递归查找 /var/log 下的所有目录,并统计每个目录内的文件 +# /var/log/*/* -> 查找父目录为 /var/log 的所有目录,并统计每个目录内的文件 +# /var/log -> 统计 /var/log 及其所有子目录中的所有文件总和 +directories = ["/tmp", "/root"] + +# # 文件名匹配模式。默认为 "*"。 +file_name = "*" + +# # 是否统计子目录中的文件。默认为 true。 +recursive = true + +# # 是否仅统计普通文件 (排除目录、符号链接、Socket等)。默认为 true。 +regular_only = true + +# # 遍历目录树时是否跟随符号链接。默认为 false。 +follow_symlinks = false + +# # 按文件大小过滤。 +# 只有大于等于此大小的文件才会被统计。 +# 如果配置为负数,则只统计小于其绝对值的文件。 +# 支持的单位有 B, KiB, MiB, KB 等... 
+# 如果不带引号和单位,则默认为字节。 +size = "0B" + +# # 按修改时间过滤。 +# 只有在此时间之前(未被修改的时间超过该值)的文件才会被统计。 +# 如果配置为负数,则只统计在此时长内被修改过的文件。默认为 "0s"。 +mtime = "0s" +``` + +## 采集指标 + +所有指标将附带 `directory` 标签表示具体的目录路径。 + +- `filecount_count`: 匹配到的文件总数 +- `filecount_size_bytes`: 匹配到的文件总大小 (Bytes) +- `filecount_oldest_file_timestamp`: 最早创建/修改的文件的 Unix 时间戳 (纳秒) +- `filecount_newest_file_timestamp`: 最新创建/修改的文件的 Unix 时间戳 (纳秒) diff --git a/inputs/filecount/dashboard.json b/inputs/filecount/dashboard.json new file mode 100644 index 000000000..e534fb7e1 --- /dev/null +++ b/inputs/filecount/dashboard.json @@ -0,0 +1,52 @@ +{ + "title": "Directory File Count", + "uid": "bb9176a4", + "tags": [ + "directory file count" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "File Count", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "filecount_count", + "legendFormat": "{{directory}}", + "refId": "A" + } + ] + }, + { + "title": "Total File Size (Bytes)", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "filecount_size_bytes", + "legendFormat": "{{directory}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/gnmi/README_CN.md b/inputs/gnmi/README_CN.md new file mode 100644 index 000000000..22a925cbb --- /dev/null +++ b/inputs/gnmi/README_CN.md @@ -0,0 +1,93 @@ +# gNMI (gRPC Network Management Interface) 采集插件 + +该插件基于 [gNMI](https://github.com/openconfig/reference/blob/master/rpc/gnmi/gnmi-specification.md) 协议的 Subscribe 方法,订阅并采集网络设备的遥测 (Telemetry) 数据。 +插件支持 TLS 认证和加密,与设备供应商无关,支持任何兼容 gNMI 规范的平台。 + +该插件 fork 自 `telegraf/inputs.gnmi`。对于 Cisco 设备,它特别针对 Cisco IOS XR (64-bit) 6.5.1, Cisco NX-OS 9.3 以及 Cisco IOS XE 16.12 及以上版本产生的遥测数据进行了优化。 + +## 配置说明 + +```toml +# gNMI 遥测插件配置 +[[instances]] + ## gNMI gRPC 服务器的地址和端口 + addresses = 
["1.2.3.4:5678"] + + ## 设备的认证凭据 + username = "admin" + password = "admin" + + ## 请求的 gNMI 编码格式 (可选: "proto", "json", "json_ietf", "bytes") + encoding = "proto" + + ## 发生故障后重新连接的等待时间 + redial = "10s" + + ## gRPC 的最大消息大小限制,默认 4MB + max_msg_size = "4194304" + + ## TLS 认证配置 (如果设备启用了 TLS) + # enable_tls = false + # tls_ca = "/etc/pki/ca.pem" + # tls_min_version = "TLS12" + # insecure_skip_verify = true # 跳过证书链和主机名验证 + + ## 如果在更新消息中没有前缀路径,是否尝试推断路径标签。 + ## 如果启用,则会使用更新中所有元素的公共路径。 + # guess_path_tag = false + + ## 定义额外的别名,用于将响应的路径映射到 measurement 的名称 + # [instances.aliases] + # ifcounters = "openconfig:/interfaces/interface/state/counters" + + ## 配置要订阅的遥测路径 + [[instances.subscription]] + ## 产生的数据将使用的 measurement 名称 (也就是指标前缀) + name = "ifcounters" + + ## 订阅的起源(Origin)和路径(Path) + ## origin 通常指设备实现的 YANG 数据模型,path 是类似于 XPath 的结构路径 + origin = "openconfig-interfaces" + path = "/interfaces/interface/state/counters" + + ## 订阅模式: "target_defined", "sample" (周期采样), "on_change" (变更时推送) + subscription_mode = "sample" + sample_interval = "10s" + + ## 如果你想把某个订阅路径的值作为其他指标的 Tag (标签),可以使用 tag_subscription + # [[instances.tag_subscription]] + # name = "descr" + # origin = "openconfig-interfaces" + # path = "/interfaces/interface/state" + # subscription_mode = "on_change" +``` + +## 采集指标 + +每配置一个 `[[instances.subscription]]`,插件就会生成对应的 Measurement。 +gNMI `SubscribeResponse` 的 Update 消息中,每个叶子节点 (Leaf) 的值都会转化为指标的值 (Field),路径键值对会被转化为标签 (Tag)。 + +## 监控大盘 + +由于 gNMI 的指标完全依赖于您订阅的 YANG 模型路径,指标名称不固定。因此没有提供统一的默认大盘。您需要根据具体的 `name` 配置在 Grafana/Nightingale 中自定义大盘。 + +## 故障排查 + +某些设备 (比如 Juniper) 可能会返回与订阅路径不对应的杂散数据路径。在这种情况下,Categraf 无法确定响应应属于哪个 `name`,您会看到 `empty metric-name warning` 警告。 + +为了避免这个问题,您可以使用 `[instances.aliases]` 将响应路径映射回正确的名称: + +```toml +[[instances]] + addresses = ["..."] + + [instances.aliases] + memory = "/components" + + [[instances.subscription]] + name = "memory" + origin = "openconfig" + path = "/junos/system/linecard/cpu/memory" + subscription_mode = "sample" + sample_interval = 
"60s" +``` diff --git a/inputs/gnmi/dashboard.json b/inputs/gnmi/dashboard.json new file mode 100644 index 000000000..2da88eed4 --- /dev/null +++ b/inputs/gnmi/dashboard.json @@ -0,0 +1,28 @@ +{ + "title": "gNMI Telemetry", + "uid": "b2faee7e", + "tags": [ + "gnmi telemetry" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "gNMI Plugin Information", + "type": "text", + "gridPos": { + "x": 0, + "y": 0, + "w": 24, + "h": 8 + }, + "id": 1, + "targets": [] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/googlecloud/README.md b/inputs/googlecloud/README.md index 8744ed1fa..8579b8e4b 100644 --- a/inputs/googlecloud/README.md +++ b/inputs/googlecloud/README.md @@ -1,40 +1,55 @@ +# Google Cloud Monitoring Input Plugin +This plugin pulls cloud resource monitoring metrics from the Google Cloud Platform (GCP) Cloud Monitoring API (formerly Stackdriver). -# GCP 指标获取插件 -需要权限 -```toml -https://www.googleapis.com/auth/monitoring.read -``` +## Prerequisites + +Before using this plugin, ensure that the provided GCP Service Account credentials are allowed to read Cloud Monitoring data: +- OAuth scope `https://www.googleapis.com/auth/monitoring.read` (granted, for example, by the Monitoring Viewer role `roles/monitoring.viewer`) + +## Configuration -配置 ```toml -#采集周期,建议 >= 1分钟 -interval=60 +# Scrape interval. Since calling cloud provider APIs may incur costs and hit rate limits, it's recommended to set >= 60 seconds. 
+interval = 60 + [[instances]] -#配置 project_id -project_id="your-project-id" -#配置认证的key文件 -credentials_file="/path/to/your/key.json" -#或者配置认证的JSON -credentials_json="xxx" - -# 指标的end time = now - delay -#delay="2m" -# 指标的start time = now - deley - period -#period="1m" -# 过滤器 -#filter="metric.type=\"compute.googleapis.com/instance/cpu/utilization\" AND resource.labels.zone=\"asia-northeast1-a\"" -# 请求超时时间 -#timeout="5s" -# 指标列表的缓存时长 ,filter为空时 启用 -#cache_ttl="1h" - -# 给gce的instance_name 取个别名,放到label中 -#gce_host_tag="xxx" -# 每次最多有多少请求同时发起 -#request_inflight=30 - -# request_inflight 取值(0,100] -# 想配置更大的值 ,前提是你知道你在做什么 -force_request_inflight= 200 +# # Your GCP Project ID +project_id = "your-project-id" + +# # Absolute path to your service account credentials JSON file +credentials_file = "/path/to/your/key.json" + +# # Or configure the JSON string directly here +# credentials_json = "{...}" + +# # Data delay time: Actual query End Time = Now - delay (prevents fetching empty data due to GCP write delays) +# delay = "2m" + +# # Time window span for the query: Start Time = Now - delay - period +# period = "1m" + +# # Metric filter (If left empty, it will default to fetching most metrics in the project, which can be very expensive) +# filter = "metric.type=\"compute.googleapis.com/instance/cpu/utilization\" AND resource.labels.zone=\"asia-northeast1-a\"" + +# # HTTP request timeout +# timeout = "5s" + +# # Cache TTL for GCP metric metadata (effective when filter is empty, avoids frequent metadata requests) +# cache_ttl = "1h" + +# # Extract the GCE (Google Compute Engine) instance_name as an alias and append it as a new Tag to the metrics +# gce_host_tag = "gce_host" + +# # Maximum concurrent requests (default is 30, valid range is 1 to 100) +# request_inflight = 30 + +# # Force setting a larger concurrent request limit (only if you know what you are doing and won't hit GCP quotas) +# force_request_inflight = 200 ``` + +## Metrics and Dashboards + +Because this plugin is 
essentially a proxy for the Google Cloud Monitoring API, the collected metrics depend entirely on your `filter` rules and the GCP services you use. Therefore: +- Metric names will be automatically mapped from GCP's Metric Types. +- There is no single predefined dashboard. You will need to build custom dashboards in your monitoring system tailored to your specific workloads (e.g., GCE, Cloud SQL). diff --git a/inputs/googlecloud/README_CN.md b/inputs/googlecloud/README_CN.md new file mode 100644 index 000000000..14e8892e6 --- /dev/null +++ b/inputs/googlecloud/README_CN.md @@ -0,0 +1,55 @@ +# Google Cloud 监控指标采集插件 + +该插件用于从 Google Cloud Platform (GCP) 的 Cloud Monitoring API (原 Stackdriver) 中拉取云资源的监控指标。 + +## 前置条件 + +使用该插件前,您需要确保提供的 GCP 服务账号 (Service Account) 凭证拥有以下权限: +- `monitoring.read` (监控查看者) + +## 配置说明 + +```toml +# 采集周期,因为调用云厂商 API 可能存在计费和限流限制,建议设置 >= 60 秒 +interval = 60 + +[[instances]] +# # 您的 GCP 项目 ID +project_id = "your-project-id" + +# # 认证凭据 JSON 文件的绝对路径 +credentials_file = "/path/to/your/key.json" + +# # 或者直接在此处配置 JSON 字符串内容 +# credentials_json = "{...}" + +# # 数据延迟时间:实际查询的 End Time = Now - delay (防止由于 GCP 数据写入延迟导致拉空数据) +# delay = "2m" + +# # 查询的时间窗口跨度:Start Time = Now - delay - period +# period = "1m" + +# # 指标过滤器 (如果留空,默认会拉取项目中大部分指标,可能消耗极大) +# filter = "metric.type=\"compute.googleapis.com/instance/cpu/utilization\" AND resource.labels.zone=\"asia-northeast1-a\"" + +# # HTTP 请求超时时间 +# timeout = "5s" + +# # GCP 指标元数据的缓存时长 (当 filter 为空时生效,避免频繁请求元数据) +# cache_ttl = "1h" + +# # 将 GCE (Google Compute Engine) 的 instance_name 提取为别名,并作为一个新的 Tag 追加到指标上 +# gce_host_tag = "gce_host" + +# # 最大并发请求数 (默认为 30,范围是 1 到 100) +# request_inflight = 30 + +# # 强制设置更大的并发请求数 (前提是您知道您在做什么并且确信不会触发 GCP 限流) +# force_request_inflight = 200 +``` + +## 采集指标与监控大盘 + +由于此插件本质上是 Google Cloud Monitoring API 的一个透传代理,采集到的指标完全取决于您的 `filter` 规则以及您使用的 GCP 服务。因此: +- 指标名称会根据 GCP 的 Metric Type 自动映射。 +- 没有统一的预置监控大盘,您需要在监控系统中针对具体的业务(如 GCE, Cloud SQL 等)自行配置 Dashboard。 diff --git 
a/inputs/googlecloud/dashboard.json b/inputs/googlecloud/dashboard.json new file mode 100644 index 000000000..8ecb33fb4 --- /dev/null +++ b/inputs/googlecloud/dashboard.json @@ -0,0 +1,28 @@ +{ + "title": "Google Cloud Monitoring", + "uid": "0bc7b69a", + "tags": [ + "google cloud monitoring" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Google Cloud Plugin Information", + "type": "text", + "gridPos": { + "x": 0, + "y": 0, + "w": 24, + "h": 8 + }, + "id": 1, + "targets": [] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/greenplum/README.md b/inputs/greenplum/README.md new file mode 100644 index 000000000..9a8d68f06 --- /dev/null +++ b/inputs/greenplum/README.md @@ -0,0 +1,32 @@ +# Greenplum Input Plugin + +The Greenplum input plugin monitors the mirror node status of a Greenplum database cluster. + +> Note: This plugin relies on the `gpstate` command-line tool being available in the system PATH. + +## How it works + +The plugin periodically executes the `gpstate -m` command in the background and parses the `Status` and `Data Status` fields from its output. Because it invokes the official Greenplum management tool directly, **Categraf must be run as a user with permissions to execute `gpstate` (e.g., `gpadmin`)**, or the environment must be configured properly. + +## Configuration + +```toml +# # Collect interval +# interval = 15 + +[[instances]] +# There is no instance-specific configuration for this plugin. Just ensure gpstate is in the PATH. +# You can append labels to distinguish different clusters: +# labels = { cluster="gp-cluster-1" } +``` + +## Metrics + +All metrics will include `Mirror`, `Datadir`, and `Port` as tags. + +- `greenplum_Status`: Node state. `1` indicates `Passive`, otherwise `0`. +- `greenplum_Data_Status`: Data synchronization status. `1` indicates `Synchronized`, otherwise `0`. 
+ +## Dashboard and Alerts + +It is recommended to incorporate these status metrics into your overall Greenplum dashboard. If `greenplum_Data_Status` remains `0` for an extended period, it indicates that primary-mirror synchronization is abnormal and an alert should be triggered. diff --git a/inputs/greenplum/README_CN.md b/inputs/greenplum/README_CN.md new file mode 100644 index 000000000..4a18e23bc --- /dev/null +++ b/inputs/greenplum/README_CN.md @@ -0,0 +1,32 @@ +# Greenplum 采集插件 + +Greenplum 采集插件用于监控 Greenplum 数据库集群的镜像节点 (Mirror) 状态。 + +> 注意:该插件依赖于运行环境中存在 `gpstate` 命令行工具。 + +## 采集原理 + +插件在后台会定期执行 `gpstate -m` 命令,并解析其输出中的 `Status` (运行状态) 和 `Data Status` (数据同步状态)。由于它直接调用 Greenplum 官方的管理工具,因此**必须使用有权限执行 `gpstate` 的用户(如 `gpadmin`)来运行 Categraf**,或者配置合适的免密环境。 + +## 配置说明 + +```toml +# # 采集周期 +# interval = 15 + +[[instances]] +# 该插件没有实例级别的特殊配置。只需确保环境中有 gpstate 即可。 +# 可以加一些标签来区分不同集群 +# labels = { cluster="gp-cluster-1" } +``` + +## 采集指标 + +所有指标将附带 `Mirror` (镜像名称), `Datadir` (数据目录) 和 `Port` (端口) 作为标签。 + +- `greenplum_Status`: 节点状态。`1` 表示状态为 `Passive`,否则为 `0`。 +- `greenplum_Data_Status`: 数据同步状态。`1` 表示状态为 `Synchronized` (已同步),否则为 `0`。 + +## 监控大盘 + +建议将这些状态指标放入 Greenplum 的整体监控大盘中进行监控,当 `greenplum_Data_Status` 长时间为 `0` 时,说明主备数据同步可能存在异常,应触发告警。 diff --git a/inputs/greenplum/dashboard.json b/inputs/greenplum/dashboard.json new file mode 100644 index 000000000..ef2ffac47 --- /dev/null +++ b/inputs/greenplum/dashboard.json @@ -0,0 +1,52 @@ +{ + "title": "Greenplum Mirror Status", + "uid": "47a0c77f", + "tags": [ + "greenplum mirror status" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Greenplum Mirror Status (1=Passive)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "greenplum_Status", + "legendFormat": "Mirror: {{Mirror}} on {{Datadir}}", + "refId": "A" + } + ] + }, + { + "title": "Greenplum Data Status (1=Synchronized)", + "type": "timeseries", + "gridPos": { + 
"x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "greenplum_Data_Status", + "legendFormat": "Mirror: {{Mirror}} on {{Datadir}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/hadoop_hdfs/README.md b/inputs/hadoop_hdfs/README.md index 4b5b5f3c8..b1579563b 100644 --- a/inputs/hadoop_hdfs/README.md +++ b/inputs/hadoop_hdfs/README.md @@ -1,3 +1,18 @@ -# hadoop-hdfs +# Hadoop HDFS Monitoring Plugin -hadoop-hdfs 当前可以使用 jolokia_agent 插件来监控,通过读取 jmx 数据的方式获取监控指标,配置文件可以参考:[hadoop-hdfs.toml](../../conf/input.jolokia_agent_misc/hadoop-hdfs.toml) +Categraf does not require a dedicated, standalone native plugin to monitor Hadoop HDFS. HDFS exposes its monitoring data via JMX, so it is highly recommended to use Categraf's built-in `jolokia_agent` plugin to fetch these metrics. + +## Configuration + +To monitor HDFS, please configure the `jolokia_agent` plugin directly. We have already prepared a template configuration suitable for Hadoop HDFS in the example configuration directory. + +Please refer to: [hadoop-hdfs.toml](../../conf/example.input.jolokia_agent/hadoop-hdfs.toml) + +Steps: +1. Copy the reference configuration above into your Categraf `conf/input.jolokia_agent/` directory. +2. Ensure that Jolokia Agent is enabled on your Hadoop NameNode or DataNode. +3. Modify the `urls` in the configuration file to point to your real Jolokia JMX HTTP Endpoint (e.g., `http://localhost:8778/jolokia/`). + +## Metrics and Dashboards + +Because the actual metric collection is handled by the Jolokia Agent, the metrics collected depend entirely on the `metrics` blocks defined in your configuration file. In your Grafana or Nightingale dashboards, simply query metrics starting with `jolokia_` or whatever `name_prefix` you defined in the configuration. 
diff --git a/inputs/hadoop_hdfs/README_CN.md b/inputs/hadoop_hdfs/README_CN.md new file mode 100644 index 000000000..72041cb42 --- /dev/null +++ b/inputs/hadoop_hdfs/README_CN.md @@ -0,0 +1,18 @@ +# Hadoop HDFS 监控插件 + +Categraf 监控 Hadoop HDFS 时,不需要专门的独立的二进制原生插件。HDFS 提供 JMX 接口暴露监控数据,因此推荐使用 Categraf 自带的 `jolokia_agent` 插件来抓取这些指标。 + +## 配置说明 + +要配置 HDFS 的监控,请直接修改 `jolokia_agent` 的配置文件。我们在配置示例目录中已经准备好了一份适用于 Hadoop HDFS 的模板。 + +请参考:[hadoop-hdfs.toml](../../conf/example.input.jolokia_agent/hadoop-hdfs.toml) + +具体步骤: +1. 将上述参考配置复制到您的 Categraf `conf/input.jolokia_agent/` 目录中。 +2. 确保您的 Hadoop NameNode 或 DataNode 启用了 Jolokia Agent。 +3. 修改配置文件中的 `urls`,指向真实的 Jolokia JMX HTTP Endpoint (例如: `http://localhost:8778/jolokia/`)。 + +## 采集指标与大盘 + +由于实际上使用的是 Jolokia Agent,采集到的指标完全取决于配置文件中配置的 `metrics`。请在您的 Grafana 或夜莺监控大盘中直接使用 `jolokia_` 或者配置中指定的 `name_prefix` 作为前缀来查询指标。 diff --git a/inputs/hadoop_hdfs/dashboard.json b/inputs/hadoop_hdfs/dashboard.json new file mode 100644 index 000000000..c41d3d994 --- /dev/null +++ b/inputs/hadoop_hdfs/dashboard.json @@ -0,0 +1,28 @@ +{ + "title": "Hadoop HDFS Monitoring", + "uid": "d866f4f8", + "tags": [ + "hadoop hdfs monitoring" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Hadoop HDFS Plugin Information", + "type": "text", + "gridPos": { + "x": 0, + "y": 0, + "w": 24, + "h": 8 + }, + "id": 1, + "targets": [] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/http_response/README.md b/inputs/http_response/README.md index 45eb9fcf0..55084db79 100644 --- a/inputs/http_response/README.md +++ b/inputs/http_response/README.md @@ -43,6 +43,23 @@ targets = [ method = "POST" ``` +## 指标说明 + +- `http_response_dns_request` DNS 解析耗时,单位毫秒 +- `http_response_tcp_connect` TCP 建连耗时,单位毫秒 +- `http_response_tls_handshake` TLS 握手耗时,单位毫秒 +- `http_response_first_byte` 首包响应耗时,单位毫秒 +- `http_response_total_cost` 请求总耗时,单位毫秒 +- `http_response_response_time` 
响应耗时,单位秒,保留为兼容旧指标 +- `http_response_response_code` HTTP 响应码 +- `http_response_result_code` 探测结果码 +- `http_response_cert_expire_timestamp` HTTPS 证书过期时间戳 + +说明: + +- 使用 IP 直连或连接复用时(HTTP/HTTPS 均可能),部分阶段指标可能为 `-1` +- `http_response_cert_expire_timestamp` 仅在 HTTPS 目标且成功建立 TLS 连接时输出 + ## 监控大盘和告警规则 该 README 的同级目录下,提供了 dashboard.json 就是监控大盘的配置,alerts.json 是告警规则,可以导入夜莺使用。 \ No newline at end of file diff --git a/inputs/huatuo/dashboard.json b/inputs/huatuo/dashboard.json new file mode 100644 index 000000000..be438bd2d --- /dev/null +++ b/inputs/huatuo/dashboard.json @@ -0,0 +1,28 @@ +{ + "title": "Huatuo eBPF Metrics", + "uid": "b758018e", + "tags": [ + "huatuo ebpf metrics" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Huatuo Agent Information", + "type": "text", + "gridPos": { + "x": 0, + "y": 0, + "w": 24, + "h": 8 + }, + "id": 1, + "targets": [] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/ipvs/README.md b/inputs/ipvs/README.md index b8eacb92e..045e3c944 100644 --- a/inputs/ipvs/README.md +++ b/inputs/ipvs/README.md @@ -1,83 +1,55 @@ -# ipvs +# IPVS Input Plugin + +Forked from Telegraf. The IPVS input plugin uses the Linux kernel netlink socket interface to gather metrics about IPVS virtual and real servers. -Forked from [telegraf/ipvs](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/ipvs). The IPVS input plugin uses the linux kernel netlink socket interface to gather metrics about ipvs virtual and real servers. **Supported Platforms:** Linux -### Permissions +## Permissions -Assuming you installed the telegraf package via one of the published packages, -the process will be running as the `telegraf` user. However, in order for this -plugin to communicate over netlink sockets it needs the telegraf process to be -running as `root` (or some user with `CAP_NET_ADMIN` and `CAP_NET_RAW`). 
Be sure -to ensure these permissions before running telegraf with this plugin included. +In order for this plugin to communicate over netlink sockets, the Categraf process needs to be running as `root` (or as a user with `CAP_NET_ADMIN` and `CAP_NET_RAW` capabilities). Make sure these permissions are in place before running Categraf with this plugin enabled. ## Configuration -``` + +```toml # Collect virtual and real server stats from Linux IPVS - # no configuration +[[instances]] +# No specific configuration is required. ``` ## Metrics -Server will contain tags identifying how it was configured, using one of -`address` + `port` + `protocol` *OR* `fwmark`. This is how one would normally -configure a virtual server using `ipvsadm`. - -- ipvs_virtual_server - - tags: - - sched (the scheduler in use) - - netmask (the mask used for determining affinity) - - address_family (inet/inet6) - - address - - port - - protocol - - fwmark - - fields: - - connections - - pkts_in - - pkts_out - - bytes_in - - bytes_out - - pps_in - - pps_out - - cps - -- ipvs_real_server - - tags: - - address - - port - - address_family (inet/inet6) - - virtual_address - - virtual_port - - virtual_protocol - - virtual_fwmark - - fields: - - active_connections - - inactive_connections - - connections - - pkts_in - - pkts_out - - bytes_in - - bytes_out - - pps_in - - pps_out - - cps - -## Example Output - -Virtual server is configured using `fwmark` and backed by 2 real servers: - -```shell -ipvs_virtual_server,address=172.18.64.234,address_family=inet,netmask=32,port=9000,protocol=tcp,sched=rr bytes_in=0i,bytes_out=0i,pps_in=0i,pps_out=0i,cps=0i,connections=0i,pkts_in=0i,pkts_out=0i 1541019340000000000 -ipvs_real_server,address=172.18.64.220,address_family=inet,port=9000,virtual_address=172.18.64.234,virtual_port=9000,virtual_protocol=tcp active_connections=0i,inactive_connections=0i,pkts_in=0i,bytes_out=0i,pps_out=0i,connections=0i,pkts_out=0i,bytes_in=0i,pps_in=0i,cps=0i 1541019340000000000 
-ipvs_real_server,address=172.18.64.219,address_family=inet,port=9000,virtual_address=172.18.64.234,virtual_port=9000,virtual_protocol=tcp active_connections=0i,inactive_connections=0i,pps_in=0i,pps_out=0i,connections=0i,pkts_in=0i,pkts_out=0i,bytes_in=0i,bytes_out=0i,cps=0i 1541019340000000000 -``` - -Virtual server is configured using `proto+addr+port` and backed by 2 real -servers: - -```shell -ipvs_virtual_server,address_family=inet,fwmark=47,netmask=32,sched=rr cps=0i,connections=0i,pkts_in=0i,pkts_out=0i,bytes_in=0i,bytes_out=0i,pps_in=0i,pps_out=0i 1541019340000000000 -ipvs_real_server,address=172.18.64.220,address_family=inet,port=9000,virtual_fwmark=47 inactive_connections=0i,pkts_out=0i,bytes_out=0i,pps_in=0i,cps=0i,active_connections=0i,pkts_in=0i,bytes_in=0i,pps_out=0i,connections=0i 1541019340000000000 -ipvs_real_server,address=172.18.64.219,address_family=inet,port=9000,virtual_fwmark=47 cps=0i,active_connections=0i,inactive_connections=0i,connections=0i,pkts_in=0i,bytes_out=0i,pkts_out=0i,bytes_in=0i,pps_in=0i,pps_out=0i 1541019340000000000 -``` \ No newline at end of file +Servers will contain tags identifying how they were configured, using either `address` + `port` + `protocol` *OR* `fwmark`. This corresponds to how you would normally configure a virtual server using `ipvsadm`. + +### 1. ipvs_virtual_server +- **Tags:** + - `sched` (the scheduler in use) + - `netmask` (the mask used for determining affinity) + - `address_family` (inet/inet6) + - `address` + - `port` + - `protocol` + - `fwmark` +- **Fields:** + - `connections` + - `pkts_in` / `pkts_out` + - `bytes_in` / `bytes_out` + - `pps_in` / `pps_out` + - `cps` + +### 2. 
ipvs_real_server +- **Tags:** + - `address` + - `port` + - `address_family` (inet/inet6) + - `virtual_address` + - `virtual_port` + - `virtual_protocol` + - `virtual_fwmark` +- **Fields:** + - `active_connections` + - `inactive_connections` + - `connections` + - `pkts_in` / `pkts_out` + - `bytes_in` / `bytes_out` + - `pps_in` / `pps_out` + - `cps` diff --git a/inputs/ipvs/README_CN.md b/inputs/ipvs/README_CN.md new file mode 100644 index 000000000..6410bdd51 --- /dev/null +++ b/inputs/ipvs/README_CN.md @@ -0,0 +1,55 @@ +# IPVS 采集插件 + +该插件用于采集 Linux IPVS (IP Virtual Server) 的虚拟服务器 (Virtual Server) 和真实服务器 (Real Server) 的状态和网络流量指标。 +它通过底层的 netlink socket 接口与 Linux 内核直接通信来获取数据。该插件 fork 自 telegraf。 + +**支持平台:** Linux + +## 权限要求 + +为了通过 netlink socket 接口与内核通信,运行 Categraf 的进程需要 root 权限,或者至少具备 `CAP_NET_ADMIN` 和 `CAP_NET_RAW` 能力 (Capabilities)。在使用此插件前,请务必确保 Categraf 拥有足够的权限。 + +## 配置说明 + +```toml +# 采集 Linux IPVS 的虚拟和真实服务器指标 +[[instances]] +# 无需任何特殊配置,只需启用即可 +``` + +## 采集指标 + +采集的指标会自动打上标签,以标识虚拟服务器的配置方式(例如,使用 `address` + `port` + `protocol` 或者使用 `fwmark` 配置)。这与您平时使用 `ipvsadm` 配置虚拟服务器的方式一致。 + +### 1. ipvs_virtual_server +表示虚拟服务器 (负载均衡前端)。 +- **Tags:** + - `sched`: 使用的调度算法 (如 rr, wrr) + - `netmask`: 掩码 + - `address_family`: inet 或 inet6 + - `address`: VIP 地址 + - `port`: 端口 + - `protocol`: 协议 (tcp/udp) + - `fwmark`: 防火墙标记 +- **Fields (指标):** + - `connections`: 总连接数 + - `pkts_in` / `pkts_out`: 收发数据包总数 + - `bytes_in` / `bytes_out`: 收发字节总数 + - `pps_in` / `pps_out`: 每秒收发数据包速率 + - `cps`: 每秒新建连接数 + +### 2. 
ipvs_real_server +表示真实服务器 (后端的真实节点)。 +- **Tags:** + - `address`: Real Server IP + - `port`: Real Server 端口 + - `address_family`: inet 或 inet6 + - `virtual_address` / `virtual_port` / `virtual_protocol` / `virtual_fwmark`: 其所属的虚拟服务器的信息 +- **Fields (指标):** + - `active_connections`: 活跃连接数 + - `inactive_connections`: 非活跃连接数 + - `connections`: 总连接数 + - `pkts_in` / `pkts_out`: 收发数据包总数 + - `bytes_in` / `bytes_out`: 收发字节总数 + - `pps_in` / `pps_out`: 每秒收发数据包速率 + - `cps`: 每秒新建连接数 \ No newline at end of file diff --git a/inputs/ipvs/dashboard.json b/inputs/ipvs/dashboard.json new file mode 100644 index 000000000..8ca371191 --- /dev/null +++ b/inputs/ipvs/dashboard.json @@ -0,0 +1,88 @@ +{ + "title": "Linux IPVS Metrics", + "uid": "653fa525", + "tags": [ + "linux ipvs metrics" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "IPVS Real Server Active Connections", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "ipvs_real_server_active_connections", + "legendFormat": "RS: {{address}}:{{port}}", + "refId": "A" + } + ] + }, + { + "title": "IPVS Real Server Inactive Connections", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "ipvs_real_server_inactive_connections", + "legendFormat": "RS: {{address}}:{{port}}", + "refId": "A" + } + ] + }, + { + "title": "IPVS Virtual Server Connections", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "ipvs_virtual_server_connections", + "legendFormat": "VS: {{address}}:{{port}}", + "refId": "A" + } + ] + }, + { + "title": "IPVS Virtual Server CPS", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "ipvs_virtual_server_cps", + "legendFormat": "VS: {{address}}:{{port}}", + "refId": "A" + } + ] + } + ], + "refresh": 
"1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/jboss/README.md b/inputs/jboss/README.md index 5baa7272e..a7cba428b 100644 --- a/inputs/jboss/README.md +++ b/inputs/jboss/README.md @@ -1,3 +1,19 @@ -# jboss +# JBoss Monitoring Plugin -jboss 当前可以使用 jolokia_agent 插件来监控,通过读取 jmx 数据的方式获取监控指标,配置文件可以参考:[jboss.toml](../../conf/input.jolokia_agent_misc/jboss.toml) +Categraf does not require a dedicated native plugin to monitor JBoss (WildFly). Since JBoss runs on the JVM and exposes rich monitoring data via JMX, it is highly recommended to use Categraf's built-in `jolokia_agent` plugin to fetch these metrics. + +## Configuration + +To monitor JBoss, please configure the `jolokia_agent_misc` plugin directly. We have already prepared a template configuration suitable for JBoss in the example configuration directory. + +Please refer to: [jboss.toml](../../conf/input.jolokia_agent_misc/jboss.toml) + +Steps: +1. Copy the reference configuration above into your Categraf `conf/input.jolokia_agent_misc/` directory. +2. Ensure that Jolokia Agent is deployed and enabled on your JBoss Server (usually by deploying `jolokia.war`). +3. Modify the `urls` in the configuration file to point to your real Jolokia JMX HTTP Endpoint (e.g., `http://localhost:8080/jolokia/`). + +## Metrics and Dashboards + +Because the actual metric collection is handled by the Jolokia Agent, the metrics collected depend entirely on the `metrics` blocks defined in your configuration file. Common metrics include JVM memory, thread counts, JBoss Web connections, and sessions. +In your Grafana or Nightingale dashboards, simply query metrics starting with `jolokia_` or whatever `name_prefix` you defined in the configuration. 
diff --git a/inputs/jboss/README_CN.md b/inputs/jboss/README_CN.md new file mode 100644 index 000000000..93ba3fdad --- /dev/null +++ b/inputs/jboss/README_CN.md @@ -0,0 +1,19 @@ +# JBoss 监控插件 + +Categraf 监控 JBoss (WildFly) 时,不需要专门的独立原生插件。JBoss 运行在 JVM 之上,通过 JMX 接口可以获取到丰富的监控数据,因此推荐使用 Categraf 自带的 `jolokia_agent` 插件来抓取这些指标。 + +## 配置说明 + +要配置 JBoss 的监控,请直接使用并修改 `jolokia_agent_misc` 插件。我们在配置示例目录中已经准备好了一份适用于 JBoss 的模板。 + +请参考:[jboss.toml](../../conf/input.jolokia_agent_misc/jboss.toml) + +具体步骤: +1. 将上述参考配置复制到您的 Categraf `conf/input.jolokia_agent_misc/` 目录中。 +2. 确保您的 JBoss 服务器上部署并启用了 Jolokia Agent (通常是部署 jolokia.war)。 +3. 修改配置文件中的 `urls`,指向真实的 Jolokia JMX HTTP Endpoint (例如: `http://localhost:8080/jolokia/`)。 + +## 采集指标与大盘 + +由于实际上使用的是 Jolokia Agent,采集到的指标完全取决于配置文件中配置的 `metrics`。常见的指标包括 JVM 内存、线程数、JBoss Web 连接数、会话数等。 +请在您的 Grafana 或夜莺监控大盘中直接使用 `jolokia_` 或者配置中指定的 `name_prefix` 作为前缀来查询指标。 diff --git a/inputs/jboss/dashboard.json b/inputs/jboss/dashboard.json new file mode 100644 index 000000000..3098b9b9e --- /dev/null +++ b/inputs/jboss/dashboard.json @@ -0,0 +1,28 @@ +{ + "title": "JBoss Monitoring", + "uid": "09d272ef", + "tags": [ + "jboss monitoring" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "JBoss Plugin Information", + "type": "text", + "gridPos": { + "x": 0, + "y": 0, + "w": 24, + "h": 8 + }, + "id": 1, + "targets": [] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/jenkins/README.md b/inputs/jenkins/README.md new file mode 100644 index 000000000..360bbafbb --- /dev/null +++ b/inputs/jenkins/README.md @@ -0,0 +1,61 @@ +# Jenkins Input Plugin + +This plugin collects node (Computer) status and job (Build) status from a Jenkins Continuous Integration server. +It retrieves data by directly querying the Jenkins JSON API. 
+ +## Configuration + +```toml +# Collection interval +# interval = 60 + +[[instances]] +# The root URL of your Jenkins service +jenkins_url = "http://localhost:8080" + +# Authentication credentials (if Jenkins does not allow anonymous read access, you must provide a valid account/token) +jenkins_username = "admin" +jenkins_password = "password_or_token" + +# Maximum idle connections in the TCP connection pool +# max_connections = 5 +# HTTP Request Timeout +# response_timeout = "5s" + +# ===== Job Filtering Options ===== +# Maximum depth to scan folders/subjobs +# max_subjob_depth = 0 +# Maximum number of subjobs to fetch per layer +# max_subjob_per_layer = 10 +# Jobs that haven't been built within this age will be ignored +# max_build_age = "24h" + +# Job name filtering, supports wildcards +# job_include = [] +# job_exclude = [] + +# ===== Node Filtering Options ===== +# Node name filtering, supports wildcards +# node_include = [] +# node_exclude = [] +``` + +## Metrics + +**Global and Node Metrics:** +- `jenkins_up`: Whether the node is online (1: online, 0: offline) +- `jenkins_busy_executors`: Number of busy executors across the Jenkins cluster +- `jenkins_total_executors`: Total number of executors across the Jenkins cluster +- `jenkins_node_num_executors`: Number of executors on a specific node +- `jenkins_node_response_time`: Response time of the node +- `jenkins_node_disk_available`: Available disk space on the node +- `jenkins_node_temp_available`: Available temporary directory space on the node +- `jenkins_node_swap_available`: Available swap space on the node +- `jenkins_node_memory_available`: Available physical memory on the node +- `jenkins_node_swap_total`: Total swap on the node +- `jenkins_node_memory_total`: Total physical memory on the node + +**Job Metrics:** +- `jenkins_job_duration`: Job build duration +- `jenkins_job_number`: Build number +- `jenkins_job_result_code`: Result status code of the build (0: Success, 1: Failure, 2: Not_built, 3: 
Unstable, 4: Aborted) diff --git a/inputs/jenkins/README_CN.md b/inputs/jenkins/README_CN.md new file mode 100644 index 000000000..39a5a0ca7 --- /dev/null +++ b/inputs/jenkins/README_CN.md @@ -0,0 +1,61 @@ +# Jenkins 采集插件 + +该插件用于采集 Jenkins 持续集成服务器的节点状态 (Node/Computer) 以及任务构建 (Job/Build) 状态。 +它通过直接请求 Jenkins 的 JSON API 获取相关数据。 + +## 配置说明 + +```toml +# 采集周期 +# interval = 60 + +[[instances]] +# Jenkins 服务的根 URL +jenkins_url = "http://localhost:8080" + +# 认证凭据 (如果 Jenkins 未开启匿名访问,请务必提供有对应权限的账户密码/Token) +jenkins_username = "admin" +jenkins_password = "password_or_token" + +# TCP 连接池的最大空闲连接数 +# max_connections = 5 +# HTTP 请求超时时间 +# response_timeout = "5s" + +# ===== Job/任务 过滤配置 ===== +# 最大获取任务的历史层级 (控制扫描所有文件夹和子任务的深度) +# max_subjob_depth = 0 +# 每层最多获取的任务数 +# max_subjob_per_layer = 10 +# 超过多长时间未构建的任务将被忽略 +# max_build_age = "24h" + +# 任务名称过滤,支持通配符 +# job_include = [] +# job_exclude = [] + +# ===== Node/节点 过滤配置 ===== +# 节点名称过滤,支持通配符 +# node_include = [] +# node_exclude = [] +``` + +## 采集指标 + +**全局与节点 (Node) 指标:** +- `jenkins_up`: 节点是否在线 (1:在线, 0:离线) +- `jenkins_busy_executors`: 整个 Jenkins 正在工作的执行器数量 +- `jenkins_total_executors`: 整个 Jenkins 的总执行器数量 +- `jenkins_node_num_executors`: 单个节点的执行器数 +- `jenkins_node_response_time`: 单个节点的响应时间 +- `jenkins_node_disk_available`: 节点剩余磁盘空间 +- `jenkins_node_temp_available`: 节点剩余临时目录空间 +- `jenkins_node_swap_available`: 节点可用 Swap 空间 +- `jenkins_node_memory_available`: 节点可用物理内存 +- `jenkins_node_swap_total`: 节点总 Swap +- `jenkins_node_memory_total`: 节点总内存 + +**任务 (Job) 指标:** +- `jenkins_job_duration`: 任务构建耗时 +- `jenkins_job_number`: 任务构建的编号 +- `jenkins_job_result_code`: 任务构建结果的状态码 (0: Success, 1: Failure, 2: Not_built, 3: Unstable, 4: Aborted) diff --git a/inputs/jenkins/dashboard.json b/inputs/jenkins/dashboard.json new file mode 100644 index 000000000..1752b1f42 --- /dev/null +++ b/inputs/jenkins/dashboard.json @@ -0,0 +1,124 @@ +{ + "title": "Jenkins CI/CD Metrics", + "uid": "409a0c3f", + "tags": [ + "jenkins ci/cd metrics" + ], + "timezone": 
"browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Jenkins Up Status", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "jenkins_up", + "legendFormat": "Node: {{node_name}}", + "refId": "A" + } + ] + }, + { + "title": "Jenkins Job Result (0=Success,1=Fail)", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "jenkins_job_result_code", + "legendFormat": "Job: {{name}}", + "refId": "A" + } + ] + }, + { + "title": "Jenkins Job Duration (ms)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "jenkins_job_duration", + "legendFormat": "Job: {{name}}", + "refId": "A" + } + ] + }, + { + "title": "Jenkins Busy Executors", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "jenkins_busy_executors", + "legendFormat": "Busy", + "refId": "A" + } + ] + }, + { + "title": "Jenkins Node Disk Available (Bytes)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 16, + "w": 12, + "h": 8 + }, + "id": 5, + "targets": [ + { + "expr": "jenkins_node_disk_available", + "legendFormat": "Node: {{node_name}}", + "refId": "A" + } + ] + }, + { + "title": "Jenkins Node Response Time (ms)", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 16, + "w": 12, + "h": 8 + }, + "id": 6, + "targets": [ + { + "expr": "jenkins_node_response_time", + "legendFormat": "Node: {{node_name}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/jolokia/README.md b/inputs/jolokia/README.md new file mode 100644 index 000000000..f27351b8e --- /dev/null +++ b/inputs/jolokia/README.md @@ -0,0 +1,10 @@ +# Jolokia (Shared Library) + +This directory contains the shared client code and gatherer logic for 
the Jolokia protocol. +It is **NOT** a standalone Categraf plugin that can be enabled directly. + +For the actual plugins, please refer to: +- `jolokia_agent`: Used to directly connect to Jolokia Agents deployed inside individual Java applications (Recommended). +- `jolokia_proxy`: Used to collect metrics from multiple Java applications centrally via a Jolokia Proxy. + +Please refer to the documentation in those respective plugin directories for more details. diff --git a/inputs/jolokia/README_CN.md b/inputs/jolokia/README_CN.md new file mode 100644 index 000000000..797b80b92 --- /dev/null +++ b/inputs/jolokia/README_CN.md @@ -0,0 +1,10 @@ +# Jolokia (Shared Library) + +该目录存放了 Jolokia 协议的共享客户端代码和采集器逻辑。 +它**不是**一个可以直接使用的 Categraf 插件。 + +直接使用的采集插件为: +- `jolokia_agent`: 适用于直连各个 Java 应用内部署的 Jolokia Agent (推荐)。 +- `jolokia_proxy`: 适用于通过 Jolokia Proxy 集中采集多台 Java 应用的场景。 + +详情请参考上述两个插件的文档。 diff --git a/inputs/jolokia/dashboard.json b/inputs/jolokia/dashboard.json new file mode 100644 index 000000000..ef71f966c --- /dev/null +++ b/inputs/jolokia/dashboard.json @@ -0,0 +1,28 @@ +{ + "title": "Jolokia Shared Lib", + "uid": "3679844a", + "tags": [ + "jolokia shared lib" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Jolokia Shared Library", + "type": "text", + "gridPos": { + "x": 0, + "y": 0, + "w": 24, + "h": 8 + }, + "id": 1, + "targets": [] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/jolokia_agent/README.md b/inputs/jolokia_agent/README.md new file mode 100644 index 000000000..a4099cf24 --- /dev/null +++ b/inputs/jolokia_agent/README.md @@ -0,0 +1,52 @@ +# Jolokia Agent Input Plugin + +This plugin collects Java JMX metrics by connecting to the Jolokia Agent (`jolokia.war` or `javaagent:jolokia-jvm.jar`) embedded in the target Java application, querying data via Jolokia's HTTP JSON API. 
+ +This is the most common way to collect Java application metrics over the network. While direct JMX is also possible, Jolokia is often preferred due to its firewall-friendly nature and lower overhead. + +## Configuration + +```toml +# Collection interval +# interval = 60 + +[[instances]] +# URLs of the Jolokia Agents. Multiple URLs are supported. +urls = ["http://localhost:8080/jolokia"] + +# Basic Auth credentials (if Jolokia is secured) +# username = "admin" +# password = "password" + +# HTTP Request Timeout +# response_timeout = "5s" + +# ===== Metrics Collection Configuration ===== +# You can define multiple [[instances.metric]] blocks, each mapping to a set of JMX MBean attributes. +[[instances.metric]] +# Prefix for the generated metric names +name = "java_memory" +# The JMX MBean ObjectName to query +mbean = "java.lang:type=Memory" +# List of attributes to fetch. If empty, fetches all attributes. +# paths = ["HeapMemoryUsage", "NonHeapMemoryUsage", "ObjectPendingFinalizationCount"] + +[[instances.metric]] +name = "java_garbage_collector" +mbean = "java.lang:name=*,type=GarbageCollector" +# Use specific MBean property values as tags +tag_keys = ["name"] +``` + +## Metrics + +The metric names and structures depend entirely on your `[[instances.metric]]` configurations. By default: +- The measurement/metric name will use the specified `name` prefix. +- JMX MBean attributes will be mapped as fields (e.g., `java_memory_HeapMemoryUsage_used`). +- Any property defined in `tag_keys` will be attached as tags to the data points. + +## Dashboards + +Because the metrics collected via Jolokia are highly customizable (can be used to monitor Tomcat, JBoss, Kafka, or any custom business JMX metrics), you will need to build your own dashboards in Grafana or Nightingale based on the `name` prefixes and specific scenarios you configured. 
+ +If you are using our provided example configurations (such as `tomcat.toml`, `kafka.toml`, `activemq.toml`, etc.), you can directly import the default dashboards provided for those specific components. diff --git a/inputs/jolokia_agent/README_CN.md b/inputs/jolokia_agent/README_CN.md new file mode 100644 index 000000000..54107db0d --- /dev/null +++ b/inputs/jolokia_agent/README_CN.md @@ -0,0 +1,52 @@ +# Jolokia Agent 采集插件 + +该插件用于连接目标 Java 应用程序内嵌部署的 Jolokia Agent (`jolokia.war` 或 `javaagent:jolokia-jvm.jar`),通过 Jolokia 的 HTTP JSON API 采集 Java JMX 指标。 + +这是通过网络采集 Java 应用程序指标最常用的方式。对于直接暴露了 JMX 端口的 Java 程序,虽然也可以用 jmx 采集,但 Jolokia 方式往往更容易穿透防火墙且开销更小。 + +## 配置说明 + +```toml +# 采集周期 +# interval = 60 + +[[instances]] +# Jolokia Agent 的访问地址,支持配置多个地址 +urls = ["http://localhost:8080/jolokia"] + +# Basic Auth 认证 (如果目标 Jolokia 配置了认证) +# username = "admin" +# password = "password" + +# HTTP 请求超时时间 +# response_timeout = "5s" + +# ===== 指标采集配置 ===== +# 可以配置多个 [[instances.metric]] 块,每个块对应一组 JMX MBean 属性 +[[instances.metric]] +# 生成的 metric 前缀 +name = "java_memory" +# JMX MBean 的 ObjectName +mbean = "java.lang:type=Memory" +# 需要采集的属性列表。如果不填则采集该 MBean 的所有属性。 +# paths = ["HeapMemoryUsage", "NonHeapMemoryUsage", "ObjectPendingFinalizationCount"] + +[[instances.metric]] +name = "java_garbage_collector" +mbean = "java.lang:name=*,type=GarbageCollector" +# 使用 MBean 中的属性值作为生成的 tags +tag_keys = ["name"] +``` + +## 采集指标 + +指标的名称和结构完全取决于 `[[instances.metric]]` 中配置的内容。默认情况下: +- Measurement / Metric 名称会带上 `name` 前缀。 +- JMX MBean 的属性将被映射为对应的 Field (如 `java_memory_HeapMemoryUsage_used`)。 +- 配置了 `tag_keys` 的属性,将作为 Tag 附加在数据中。 + +## 监控大盘 + +由于 Jolokia 采集的指标高度自定义(可采集 Tomcat, JBoss, Kafka, 乃至任何自定义业务的 JMX 指标),您需要根据您配置的 `name` 前缀和具体业务场景,在 Grafana 或夜莺中构建自己的监控大盘。 + +如果您使用的是我们提供的示例配置(如 `tomcat.toml`, `kafka.toml`, `activemq.toml` 等),则可以直接导入对应组件的默认大盘。 diff --git a/inputs/jolokia_agent/dashboard.json b/inputs/jolokia_agent/dashboard.json new file mode 100644 index 000000000..0038c7e88 --- /dev/null +++ 
b/inputs/jolokia_agent/dashboard.json @@ -0,0 +1,28 @@ +{ + "title": "Jolokia Agent Custom Metrics", + "uid": "3866c107", + "tags": [ + "jolokia agent custom metrics" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Jolokia Agent Information", + "type": "text", + "gridPos": { + "x": 0, + "y": 0, + "w": 24, + "h": 8 + }, + "id": 1, + "targets": [] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/jolokia_proxy/README.md b/inputs/jolokia_proxy/README.md new file mode 100644 index 000000000..bf61992b1 --- /dev/null +++ b/inputs/jolokia_proxy/README.md @@ -0,0 +1,46 @@ +# Jolokia Proxy Input Plugin + +This plugin collects JMX metrics from multiple target Java applications by using a centralized Jolokia Proxy. + +If you have a large number of Java services and deploying a Jolokia Agent or opening network ports on every single instance is difficult, you can deploy a centralized Jolokia Proxy service. Categraf can then issue requests to the Proxy, which forwards them to the target JMX endpoints. 
+ +## Configuration + +```toml +# Collection interval +# interval = 60 + +[[instances]] +# The URL of the Jolokia Proxy service (Only one proxy URL per instance) +url = "http://localhost:8080/jolokia" + +# Credentials for accessing the Proxy service itself +# username = "proxyadmin" +# password = "proxypassword" + +# ===== Target Configurations ===== +# Default credentials for accessing target services +# default_target_username = "admin" +# default_target_password = "password" + +# List of target JMX URLs to proxy requests to +[[instances.target]] +url = "service:jmx:rmi:///jndi/rmi://target-host-1:9010/jmxrmi" +# username = "custom_user" +# password = "custom_password" + +[[instances.target]] +url = "service:jmx:rmi:///jndi/rmi://target-host-2:9010/jmxrmi" + +# ===== Metrics Collection Configuration ===== +# Identical to jolokia_agent, configure the MBeans you want to collect +[[instances.metric]] +name = "java_memory" +mbean = "java.lang:type=Memory" +``` + +## Metrics and Dashboards + +Because this plugin collects the exact same kind of data as the `jolokia_agent` plugin, the metric names and structure depend entirely on your `[[instances.metric]]` configurations. + +Therefore, it does not come with a single predefined dashboard. You must customize your dashboard based on the specific business logic (e.g., Tomcat / JBoss / Kafka) you are querying, or reuse existing Jolokia Agent dashboards. 
diff --git a/inputs/jolokia_proxy/README_CN.md b/inputs/jolokia_proxy/README_CN.md new file mode 100644 index 000000000..03cc6f220 --- /dev/null +++ b/inputs/jolokia_proxy/README_CN.md @@ -0,0 +1,46 @@ +# Jolokia Proxy 采集插件 + +该插件通过 Jolokia Proxy 集中采集多台目标 Java 应用程序的 JMX 指标。 + +如果您有大量的 Java 服务,但在各个业务进程中部署 Jolokia Agent 或打通网络端口存在困难,您可以部署一个集中的 Jolokia Proxy 服务,让 Categraf 通过该 Proxy 代理请求各个目标服务的 JMX 端口。 + +## 配置说明 + +```toml +# 采集周期 +# interval = 60 + +[[instances]] +# Jolokia Proxy 服务的访问地址 (只有一个代理服务 URL) +url = "http://localhost:8080/jolokia" + +# 访问 Proxy 服务的认证凭据 +# username = "proxyadmin" +# password = "proxypassword" + +# ===== 目标服务 (Target) 配置 ===== +# 默认的访问目标服务的凭据 +# default_target_username = "admin" +# default_target_password = "password" + +# 配置需要代理采集的目标服务 URL 列表 +[[instances.target]] +url = "service:jmx:rmi:///jndi/rmi://target-host-1:9010/jmxrmi" +# username = "custom_user" +# password = "custom_password" + +[[instances.target]] +url = "service:jmx:rmi:///jndi/rmi://target-host-2:9010/jmxrmi" + +# ===== 指标采集配置 ===== +# 与 jolokia_agent 一致,配置您想采集的 MBean +[[instances.metric]] +name = "java_memory" +mbean = "java.lang:type=Memory" +``` + +## 采集指标与监控大盘 + +由于此插件采集的内容与 `jolokia_agent` 一致,指标名称和结构均取决于 `[[instances.metric]]` 的配置。 + +因此,它没有一个统一的预置大盘,您需要基于具体业务逻辑 (如 Tomcat / JBoss / Kafka) 进行自定义,或复用其他 Jolokia Agent 的大盘。 diff --git a/inputs/jolokia_proxy/dashboard.json b/inputs/jolokia_proxy/dashboard.json new file mode 100644 index 000000000..4634de6e1 --- /dev/null +++ b/inputs/jolokia_proxy/dashboard.json @@ -0,0 +1,28 @@ +{ + "title": "Jolokia Proxy Custom Metrics", + "uid": "d1f799e7", + "tags": [ + "jolokia proxy custom metrics" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Jolokia Proxy Information", + "type": "text", + "gridPos": { + "x": 0, + "y": 0, + "w": 24, + "h": 8 + }, + "id": 1, + "targets": [] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git 
a/inputs/kafka/README.md b/inputs/kafka/README.md index 8d1270243..398e69775 100644 --- a/inputs/kafka/README.md +++ b/inputs/kafka/README.md @@ -1,6 +1,6 @@ # kafka -kafka 监控采集插件,由 [kafka-exporter](https://github.com/davidmparrott/kafka_exporter) 封装而来。 +kafka 监控采集插件,由 kafka-exporter(<https://github.com/davidmparrott/kafka_exporter>)封装而来。 ## Configuration @@ -17,7 +17,7 @@ davidmparrott版本 fork自 https://github.com/danielqsj/kafka_exporter (以 -danielqsj版本作为原始版本, github版本也相对活跃, prometheus生态使用较多 +danielqsj版本作为原始版本, github版本也相对活跃, prometheus生态使用较多 categraf kafka plugin基于davidmparrott版本, 以下配置可以对danielqsj版本做一些兼容 1. 增加metric: kafka_broker_info diff --git a/inputs/kafka_connect/README.md b/inputs/kafka_connect/README.md index 6af5655a8..41c32020f 100644 --- a/inputs/kafka_connect/README.md +++ b/inputs/kafka_connect/README.md @@ -1,3 +1,19 @@ -# kafka-connect +# Kafka Connect Monitoring Plugin -kafka-connect 当前可以使用 jolokia_agent 插件来监控,通过读取 jmx 数据的方式获取监控指标,配置文件可以参考:[kafka-connect.toml](../../conf/input.jolokia_agent_misc/kafka-connect.toml) +Categraf does not require a dedicated native plugin to monitor Kafka Connect. Since Kafka Connect is a Java application and exposes comprehensive monitoring data via JMX, it is highly recommended to use Categraf's built-in `jolokia_agent` or `jolokia_agent_misc` plugin to fetch these metrics. + +## Configuration + +To monitor Kafka Connect, please configure the `jolokia_agent_misc` plugin directly. We have already prepared a template configuration suitable for Kafka Connect in the example configuration directory. + +Please refer to: [kafka-connect.toml](../../conf/input.jolokia_agent_misc/kafka-connect.toml) + +Steps: +1. Copy the reference configuration above into your Categraf `conf/input.jolokia_agent_misc/` directory. +2. Ensure that Jolokia Agent is enabled on your Kafka Connect Worker node. +3. Modify the `urls` in the configuration file to point to your real Jolokia JMX HTTP Endpoint (e.g., `http://localhost:8778/jolokia/`). 
+ +## Metrics and Dashboards + +Because the actual metric collection is handled by the Jolokia Agent, the metrics collected depend entirely on the `metrics` blocks defined in your configuration file. Common metrics include Source/Sink Task status, commit latency, and throughput. +In your Grafana or Nightingale dashboards, simply query the metrics using the prefix defined in your configuration. diff --git a/inputs/kafka_connect/README_CN.md b/inputs/kafka_connect/README_CN.md new file mode 100644 index 000000000..74f9d3655 --- /dev/null +++ b/inputs/kafka_connect/README_CN.md @@ -0,0 +1,19 @@ +# Kafka Connect 监控插件 + +Categraf 监控 Kafka Connect 时,不需要专门的独立原生插件。Kafka Connect 作为 Java 应用程序,通过 JMX 接口暴露了完整的监控数据,因此推荐使用 Categraf 自带的 `jolokia_agent` 或 `jolokia_agent_misc` 插件来抓取这些指标。 + +## 配置说明 + +要配置 Kafka Connect 的监控,请直接修改 `jolokia_agent_misc` 的配置文件。我们在配置示例目录中已经准备好了一份适用于 Kafka Connect 的模板。 + +请参考:[kafka-connect.toml](../../conf/input.jolokia_agent_misc/kafka-connect.toml) + +具体步骤: +1. 将上述参考配置复制到您的 Categraf `conf/input.jolokia_agent_misc/` 目录中。 +2. 确保您的 Kafka Connect Worker 节点上启用了 Jolokia Agent。 +3. 
修改配置文件中的 `urls`,指向真实的 Jolokia JMX HTTP Endpoint (例如: `http://localhost:8778/jolokia/`)。 + +## 采集指标与大盘 + +由于实际上使用的是 Jolokia Agent,采集到的指标完全取决于配置文件中配置的 `metrics`。常见的指标包括 Source/Sink Task 的运行状态、提交延迟、吞吐量等。 +请在您的 Grafana 或夜莺监控大盘中直接使用对应的 JMX 映射前缀查询指标。 diff --git a/inputs/kafka_connect/dashboard.json b/inputs/kafka_connect/dashboard.json new file mode 100644 index 000000000..db1760d04 --- /dev/null +++ b/inputs/kafka_connect/dashboard.json @@ -0,0 +1,28 @@ +{ + "title": "Kafka Connect Monitoring", + "uid": "fb2a0e76", + "tags": [ + "kafka connect monitoring" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Kafka Connect Plugin Information", + "type": "text", + "gridPos": { + "x": 0, + "y": 0, + "w": 24, + "h": 8 + }, + "id": 1, + "targets": [] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/kernel/README.md b/inputs/kernel/README.md index 20bc2126c..31a98f9e7 100644 --- a/inputs/kernel/README.md +++ b/inputs/kernel/README.md @@ -1,7 +1,27 @@ -# kernel +# Kernel Input Plugin -采集本机的内核信息,比如 OS 启动时间,上下文切换的次数等 +This plugin collects status information from the host machine's Linux kernel. +The data is typically sourced from `/proc/stat` and `/proc/vmstat`. -## 监控大盘 +**Supported Platforms:** Linux -该插件没有单独的监控大盘,OS 的监控大盘统一放到 system 下面了 \ No newline at end of file +## Configuration + +```toml +# Collect Linux Kernel metrics +[[instances]] +# No specific configuration is required. 
+``` + +## Metrics + +- `kernel_boot_time`: System boot time (seconds since Epoch) +- `kernel_context_switches`: Total context switches since boot +- `kernel_interrupts`: Total interrupts since boot +- `kernel_processes_forked`: Total processes created via fork() since boot +- `kernel_entropy_avail`: Available entropy pool size (used for generating random numbers) + +## Dashboards + +Kernel metrics collected by this plugin are usually considered part of basic server monitoring and are often combined with CPU and memory metrics in a global `System` dashboard. +For convenience and testing, a simple standalone Kernel monitoring dashboard is also provided here. diff --git a/inputs/kernel/README_CN.md b/inputs/kernel/README_CN.md new file mode 100644 index 000000000..b4d373f17 --- /dev/null +++ b/inputs/kernel/README_CN.md @@ -0,0 +1,27 @@ +# Kernel 采集插件 + +该插件用于采集本机的 Linux 内核状态信息。 +数据通常来源于 `/proc/stat` 和 `/proc/vmstat`。 + +**支持平台:** Linux + +## 配置说明 + +```toml +# 采集 Linux 系统的 Kernel 指标 +[[instances]] +# 无需任何特殊配置,只需启用即可 +``` + +## 采集指标 + +- `kernel_boot_time`: 系统启动时间 (Epoch 秒数) +- `kernel_context_switches`: 系统启动以来的上下文切换总次数 +- `kernel_interrupts`: 系统启动以来的中断总次数 +- `kernel_processes_forked`: 系统启动以来的 fork() 创建的进程总数 +- `kernel_entropy_avail`: 系统当前可用的熵池大小 (通常用于衡量生成随机数的速度) + +## 监控大盘 + +该插件采集的 Kernel 指标通常属于服务器基础监控的一部分,因此在实际应用中往往会与 CPU、内存等指标一起放在全局的 `System` 大盘中。 +为方便单独查看测试,这里也提供了一个简单的 Kernel 专属监控大盘。 \ No newline at end of file diff --git a/inputs/kernel/dashboard.json b/inputs/kernel/dashboard.json new file mode 100644 index 000000000..4f816aeb3 --- /dev/null +++ b/inputs/kernel/dashboard.json @@ -0,0 +1,88 @@ +{ + "title": "Linux Kernel Metrics", + "uid": "bdfb682d", + "tags": [ + "linux kernel metrics" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Kernel Context Switches/s", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "rate(kernel_context_switches[5m])", + 
"legendFormat": "{{agent_hostname}}", + "refId": "A" + } + ] + }, + { + "title": "Kernel Interrupts/s", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "rate(kernel_interrupts[5m])", + "legendFormat": "{{agent_hostname}}", + "refId": "A" + } + ] + }, + { + "title": "Kernel Processes Forked/s", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "rate(kernel_processes_forked[5m])", + "legendFormat": "{{agent_hostname}}", + "refId": "A" + } + ] + }, + { + "title": "Kernel Entropy Available", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "kernel_entropy_avail", + "legendFormat": "{{agent_hostname}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/kernel_vmstat/README.md b/inputs/kernel_vmstat/README.md index af44e2f09..dd10099f2 100644 --- a/inputs/kernel_vmstat/README.md +++ b/inputs/kernel_vmstat/README.md @@ -1,126 +1,24 @@ -# kernel_vmstat +# Kernel VMStat Input Plugin -该监控插件采集的是 `/proc/vmstat` 的指标数据,需要较高版本的 kernel,`/proc/vmstat`内容较多,配置文件中给了一个白名单的配置,大家按需启用,只有启用了才会采集。 +This plugin collects metrics from `/proc/vmstat` on Linux. It requires a relatively modern Linux kernel. + +Since `/proc/vmstat` contains a large number of metrics, we use a whitelist mechanism in the configuration file. Only the metrics explicitly enabled (set to `1` or `true`) in the whitelist will be collected. + +## Configuration + +```toml +# Collect kernel vmstat metrics from /proc/vmstat +[[instances]] +# No other settings are needed, the white_list below controls which fields are collected. 
-```ini [white_list] oom_kill = 1 nr_free_pages = 0 nr_alloc_batch = 0 -nr_inactive_anon = 0 -nr_active_anon = 0 -nr_inactive_file = 0 -nr_active_file = 0 -nr_unevictable = 0 -nr_mlock = 0 -nr_anon_pages = 0 -nr_mapped = 0 -nr_file_pages = 0 -nr_dirty = 0 -nr_writeback = 0 -nr_slab_reclaimable = 0 -nr_slab_unreclaimable = 0 -nr_page_table_pages = 0 -nr_kernel_stack = 0 -nr_unstable = 0 -nr_bounce = 0 -nr_vmscan_write = 0 -nr_vmscan_immediate_reclaim = 0 -nr_writeback_temp = 0 -nr_isolated_anon = 0 -nr_isolated_file = 0 -nr_shmem = 0 -nr_dirtied = 0 -nr_written = 0 -numa_hit = 0 -numa_miss = 0 -numa_foreign = 0 -numa_interleave = 0 -numa_local = 0 -numa_other = 0 -workingset_refault = 0 -workingset_activate = 0 -workingset_nodereclaim = 0 -nr_anon_transparent_hugepages = 0 -nr_free_cma = 0 -nr_dirty_threshold = 0 -nr_dirty_background_threshold = 0 -pgpgin = 0 -pgpgout = 0 -pswpin = 0 -pswpout = 0 -pgalloc_dma = 0 -pgalloc_dma32 = 0 -pgalloc_normal = 0 -pgalloc_movable = 0 -pgfree = 0 -pgactivate = 0 -pgdeactivate = 0 -pgfault = 0 -pgmajfault = 0 -pglazyfreed = 0 -pgrefill_dma = 0 -pgrefill_dma32 = 0 -pgrefill_normal = 0 -pgrefill_movable = 0 -pgsteal_kswapd_dma = 0 -pgsteal_kswapd_dma32 = 0 -pgsteal_kswapd_normal = 0 -pgsteal_kswapd_movable = 0 -pgsteal_direct_dma = 0 -pgsteal_direct_dma32 = 0 -pgsteal_direct_normal = 0 -pgsteal_direct_movable = 0 -pgscan_kswapd_dma = 0 -pgscan_kswapd_dma32 = 0 -pgscan_kswapd_normal = 0 -pgscan_kswapd_movable = 0 -pgscan_direct_dma = 0 -pgscan_direct_dma32 = 0 -pgscan_direct_normal = 0 -pgscan_direct_movable = 0 -pgscan_direct_throttle = 0 -zone_reclaim_failed = 0 -pginodesteal = 0 -slabs_scanned = 0 -kswapd_inodesteal = 0 -kswapd_low_wmark_hit_quickly = 0 -kswapd_high_wmark_hit_quickly = 0 -pageoutrun = 0 -allocstall = 0 -pgrotated = 0 -drop_pagecache = 0 -drop_slab = 0 -numa_pte_updates = 0 -numa_huge_pte_updates = 0 -numa_hint_faults = 0 -numa_hint_faults_local = 0 -numa_pages_migrated = 0 -pgmigrate_success = 0 -pgmigrate_fail = 
0 -compact_migrate_scanned = 0 -compact_free_scanned = 0 -compact_isolated = 0 -compact_stall = 0 -compact_fail = 0 -compact_success = 0 -htlb_buddy_alloc_success = 0 -htlb_buddy_alloc_fail = 0 -unevictable_pgs_culled = 0 -unevictable_pgs_scanned = 0 -unevictable_pgs_rescued = 0 -unevictable_pgs_mlocked = 0 -unevictable_pgs_munlocked = 0 -unevictable_pgs_cleared = 0 -unevictable_pgs_stranded = 0 -thp_fault_alloc = 0 -thp_fault_fallback = 0 -thp_collapse_alloc = 0 -thp_collapse_alloc_failed = 0 -thp_split = 0 -thp_zero_page_alloc = 0 -thp_zero_page_alloc_failed = 0 -balloon_inflate = 0 -balloon_deflate = 0 -balloon_migrate = 0 -``` \ No newline at end of file +# ... (see conf/input.kernel_vmstat/kernel_vmstat.toml for the full list) +``` + +## Dashboards + +By default, each collected metric is named `kernel_vmstat_` followed by the field name (for example, `kernel_vmstat_oom_kill`). +Since these are deep kernel memory-management and paging statistics (like `oom_kill`, `pgpgin`, `pgfault`), they are generally visualized in custom advanced system dashboards or troubleshooting dashboards. 
diff --git a/inputs/kernel_vmstat/README_CN.md b/inputs/kernel_vmstat/README_CN.md new file mode 100644 index 000000000..af44e2f09 --- /dev/null +++ b/inputs/kernel_vmstat/README_CN.md @@ -0,0 +1,126 @@ +# kernel_vmstat + +该监控插件采集的是 `/proc/vmstat` 的指标数据,需要较高版本的 kernel,`/proc/vmstat`内容较多,配置文件中给了一个白名单的配置,大家按需启用,只有启用了才会采集。 + +```ini +[white_list] +oom_kill = 1 +nr_free_pages = 0 +nr_alloc_batch = 0 +nr_inactive_anon = 0 +nr_active_anon = 0 +nr_inactive_file = 0 +nr_active_file = 0 +nr_unevictable = 0 +nr_mlock = 0 +nr_anon_pages = 0 +nr_mapped = 0 +nr_file_pages = 0 +nr_dirty = 0 +nr_writeback = 0 +nr_slab_reclaimable = 0 +nr_slab_unreclaimable = 0 +nr_page_table_pages = 0 +nr_kernel_stack = 0 +nr_unstable = 0 +nr_bounce = 0 +nr_vmscan_write = 0 +nr_vmscan_immediate_reclaim = 0 +nr_writeback_temp = 0 +nr_isolated_anon = 0 +nr_isolated_file = 0 +nr_shmem = 0 +nr_dirtied = 0 +nr_written = 0 +numa_hit = 0 +numa_miss = 0 +numa_foreign = 0 +numa_interleave = 0 +numa_local = 0 +numa_other = 0 +workingset_refault = 0 +workingset_activate = 0 +workingset_nodereclaim = 0 +nr_anon_transparent_hugepages = 0 +nr_free_cma = 0 +nr_dirty_threshold = 0 +nr_dirty_background_threshold = 0 +pgpgin = 0 +pgpgout = 0 +pswpin = 0 +pswpout = 0 +pgalloc_dma = 0 +pgalloc_dma32 = 0 +pgalloc_normal = 0 +pgalloc_movable = 0 +pgfree = 0 +pgactivate = 0 +pgdeactivate = 0 +pgfault = 0 +pgmajfault = 0 +pglazyfreed = 0 +pgrefill_dma = 0 +pgrefill_dma32 = 0 +pgrefill_normal = 0 +pgrefill_movable = 0 +pgsteal_kswapd_dma = 0 +pgsteal_kswapd_dma32 = 0 +pgsteal_kswapd_normal = 0 +pgsteal_kswapd_movable = 0 +pgsteal_direct_dma = 0 +pgsteal_direct_dma32 = 0 +pgsteal_direct_normal = 0 +pgsteal_direct_movable = 0 +pgscan_kswapd_dma = 0 +pgscan_kswapd_dma32 = 0 +pgscan_kswapd_normal = 0 +pgscan_kswapd_movable = 0 +pgscan_direct_dma = 0 +pgscan_direct_dma32 = 0 +pgscan_direct_normal = 0 +pgscan_direct_movable = 0 +pgscan_direct_throttle = 0 +zone_reclaim_failed = 0 +pginodesteal = 0 +slabs_scanned = 0 
+kswapd_inodesteal = 0 +kswapd_low_wmark_hit_quickly = 0 +kswapd_high_wmark_hit_quickly = 0 +pageoutrun = 0 +allocstall = 0 +pgrotated = 0 +drop_pagecache = 0 +drop_slab = 0 +numa_pte_updates = 0 +numa_huge_pte_updates = 0 +numa_hint_faults = 0 +numa_hint_faults_local = 0 +numa_pages_migrated = 0 +pgmigrate_success = 0 +pgmigrate_fail = 0 +compact_migrate_scanned = 0 +compact_free_scanned = 0 +compact_isolated = 0 +compact_stall = 0 +compact_fail = 0 +compact_success = 0 +htlb_buddy_alloc_success = 0 +htlb_buddy_alloc_fail = 0 +unevictable_pgs_culled = 0 +unevictable_pgs_scanned = 0 +unevictable_pgs_rescued = 0 +unevictable_pgs_mlocked = 0 +unevictable_pgs_munlocked = 0 +unevictable_pgs_cleared = 0 +unevictable_pgs_stranded = 0 +thp_fault_alloc = 0 +thp_fault_fallback = 0 +thp_collapse_alloc = 0 +thp_collapse_alloc_failed = 0 +thp_split = 0 +thp_zero_page_alloc = 0 +thp_zero_page_alloc_failed = 0 +balloon_inflate = 0 +balloon_deflate = 0 +balloon_migrate = 0 +``` \ No newline at end of file diff --git a/inputs/kernel_vmstat/dashboard.json b/inputs/kernel_vmstat/dashboard.json new file mode 100644 index 000000000..c5c7a12b8 --- /dev/null +++ b/inputs/kernel_vmstat/dashboard.json @@ -0,0 +1,57 @@ +{ + "title": "Linux Kernel VMStat Metrics", + "uid": "edaa554c", + "tags": [ + "linux kernel vmstat metrics" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Kernel VMStat OOM Kills", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "kernel_vmstat_oom_kill", + "legendFormat": "{{agent_hostname}}", + "refId": "A" + } + ] + }, + { + "title": "Kernel VMStat Page Faults", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "rate(kernel_vmstat_pgfault[5m])", + "legendFormat": "Minor: {{agent_hostname}}", + "refId": "A" + }, + { + "expr": "rate(kernel_vmstat_pgmajfault[5m])", + "legendFormat": "Major: 
{{agent_hostname}}", + "refId": "B" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/kube_proxy/README.md b/inputs/kube_proxy/README.md new file mode 100644 index 000000000..d3f2322ae --- /dev/null +++ b/inputs/kube_proxy/README.md @@ -0,0 +1,23 @@ +# Kubernetes Kube-Proxy Input Plugin + +This component is not an independent Go native input plugin. Instead, it leverages Categraf's `prometheus` scraping capabilities to collect metrics exposed directly by the Kubernetes Kube-Proxy component (via its `/metrics` endpoint). + +## Configuration + +To scrape Kube-Proxy metrics, you should configure the `prometheus` plugin. We have prepared a dedicated scraping template for Kube-Proxy in the example configuration directory. + +Reference configuration: [kube_proxy.toml](../../conf/input.prometheus/kube_proxy.toml) + +Steps: +1. Copy the reference configuration `kube_proxy.toml` to your Categraf `conf/input.prometheus/` directory. +2. Ensure that Categraf can access the kube-proxy metrics endpoint (typically `127.0.0.1:10249/metrics` or `NodeIP:10249`). When running as a DaemonSet, this is usually accessed via the Node IP. +3. Modify the `urls` in the configuration to point to the correct address. + +## Metrics and Dashboards + +Key metrics exposed by Kube-Proxy include: +- Sync proxy rules count and duration (`kubeproxy_sync_proxy_rules_duration_seconds`) +- Network programming latency (`kubeproxy_network_programming_duration_seconds`) +- REST client request status + +A matched Dashboard (`dashboard.json`) is provided in this directory. You can import this dashboard into Grafana or Nightingale to monitor the operational status of your Kube-Proxy instances. 
diff --git a/inputs/kube_proxy/README_CN.md b/inputs/kube_proxy/README_CN.md new file mode 100644 index 000000000..34e2845fc --- /dev/null +++ b/inputs/kube_proxy/README_CN.md @@ -0,0 +1,23 @@ +# Kubernetes Kube-Proxy 采集插件 + +该组件并非独立的 Go 原生采集插件,而是利用 Categraf 的 `prometheus` 抓取能力来采集 Kubernetes Kube-Proxy 组件本身暴露的 metrics 数据 (`/metrics` 接口)。 + +## 配置说明 + +要采集 Kube-Proxy 的指标,请使用并修改 Categraf 的 `prometheus` 插件配置。我们在示例配置中已经准备好了一个专用于 Kube-Proxy 的抓取模板。 + +参考配置:[kube_proxy.toml](../../conf/input.prometheus/kube_proxy.toml) + +具体步骤: +1. 将参考配置文件 `kube_proxy.toml` 复制到您的 Categraf `conf/input.prometheus/` 目录下。 +2. 确保您的 Kubernetes 集群中,kube-proxy 的 metrics 接口 (通常是 `127.0.0.1:10249/metrics` 或者节点 IP 的 `10249` 端口) 可以被 Categraf 访问到。如果在 DaemonSet 模式下,通常通过 Node IP 访问。 +3. 修改配置中的 `urls` 指向正确的地址。 + +## 采集指标与监控大盘 + +Kube-Proxy 暴露的指标主要包含: +- 同步规则次数和耗时 (`kubeproxy_sync_proxy_rules_duration_seconds`) +- 网络编程延迟 (`kubeproxy_network_programming_duration_seconds`) +- REST 客户端请求状态 + +本目录下提供了一个配套的 Dashboard (`dashboard.json`),您可以在 Grafana 或夜莺中导入该看板来观测您的 Kube-Proxy 运行状态。 diff --git a/inputs/kube_proxy/dashboard.json b/inputs/kube_proxy/dashboard.json new file mode 100644 index 000000000..7cf4d5cf7 --- /dev/null +++ b/inputs/kube_proxy/dashboard.json @@ -0,0 +1,610 @@ +{ + "name": "Kubernetes / Proxy", + "tags": "", + "ident": "", + "configs": { + "version": "2.0.0", + "links": [], + "var": [ + { + "name": "ident", + "type": "query", + "definition": "label_values(kubeproxy_sync_proxy_rules_iptables_restore_failures_total, ident)", + "reg": "", + "multi": true, + "allOption": true, + "allValue": ".*" + } + ], + "panels": [ + { + "version": "2.0.0", + "id": "c0305f2f-68a1-4e60-9713-4d83f9cfd98c", + "type": "stat", + "name": "Up", + "links": [], + "layout": { + "h": 5, + "w": 4, + "x": 0, + "y": 0, + "i": "c0305f2f-68a1-4e60-9713-4d83f9cfd98c", + "isResizable": true + }, + "targets": [ + { + "refId": "A", + "expr": "sum(up{job=\"kube-proxy\"})", + "legend": "" + } + ], + "options": {}, + "custom": { + 
"version": "2.0.0", + "textMode": "value", + "colorMode": "value" + } + }, + { + "type": "timeseries", + "id": "54bd34da-ea68-441d-ac6f-cd74b2130e14", + "layout": { + "h": 5, + "w": 10, + "x": 4, + "y": 0, + "i": "54bd34da-ea68-441d-ac6f-cd74b2130e14", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "refId": "A", + "expr": "sum(rate(kubeproxy_sync_proxy_rules_duration_seconds_count{job=\"kube-proxy\", ident=~\"$ident\"}[5m]))", + "legend": "rate" + } + ], + "name": "Rules Sync Rate", + "links": [], + "options": { + "tooltip": { + "mode": "all", + "sort": "none" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": { + "steps": [ + { + "color": "#634CD9", + "value": null, + "type": "base" + } + ] + } + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "linear", + "spanNulls": false, + "lineWidth": 1, + "fillOpacity": 0.5, + "gradientMode": "none", + "stack": "off", + "scaleDistribution": { + "type": "linear" + } + } + }, + { + "type": "timeseries", + "id": "20979f64-9057-4645-9328-3f4d1eb5e90f", + "layout": { + "h": 5, + "w": 10, + "x": 14, + "y": 0, + "i": "20979f64-9057-4645-9328-3f4d1eb5e90f", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "refId": "A", + "expr": "histogram_quantile(0.99,rate(kubeproxy_sync_proxy_rules_duration_seconds_bucket{job=\"kube-proxy\", ident=~\"$ident\"}[5m]))", + "legend": "{{ident}}" + } + ], + "name": "Rule Sync Latency 99th Quantile", + "links": [], + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "util": "seconds" + }, + "thresholds": { + "steps": [ + { + "color": "#634CD9", + "value": null, + "type": "base" + } + ] + } + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "linear", + "spanNulls": false, + "lineWidth": 1, + "fillOpacity": 0.5, + "gradientMode": "none", + 
"stack": "off", + "scaleDistribution": { + "type": "linear" + } + } + }, + { + "type": "timeseries", + "id": "e15d3349-b056-4df2-a6a2-52ddc7bed913", + "layout": { + "h": 5, + "w": 12, + "x": 0, + "y": 5, + "i": "e15d3349-b056-4df2-a6a2-52ddc7bed913", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "refId": "A", + "expr": "sum(rate(kubeproxy_network_programming_duration_seconds_count{job=\"kube-proxy\", ident=~\"$ident\"}[5m]))", + "legend": "rate" + } + ], + "name": "Network Programming Rate", + "links": [], + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": { + "steps": [ + { + "color": "#634CD9", + "value": null, + "type": "base" + } + ] + } + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "linear", + "spanNulls": false, + "lineWidth": 1, + "fillOpacity": 0.5, + "gradientMode": "none", + "stack": "off", + "scaleDistribution": { + "type": "linear" + } + } + }, + { + "type": "timeseries", + "id": "4d63cd62-3e4f-489a-87cc-fea18ee9e4fe", + "layout": { + "h": 5, + "w": 12, + "x": 12, + "y": 5, + "i": "4d63cd62-3e4f-489a-87cc-fea18ee9e4fe", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "refId": "A", + "expr": "histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket{job=\"kube-proxy\", ident=~\"$ident\"}[5m])) by (ident, le))", + "legend": "{{ident}}" + } + ], + "name": "Network Programming Latency 99th Quantile", + "links": [], + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": { + "steps": [ + { + "color": "#634CD9", + "value": null, + "type": "base" + } + ] + } + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "linear", + "spanNulls": false, + "lineWidth": 1, + "fillOpacity": 0.5, + 
"gradientMode": "none", + "stack": "off", + "scaleDistribution": { + "type": "linear" + } + } + }, + { + "type": "timeseries", + "id": "8ddc5ec1-4ccb-421c-9976-1a3c0cd416b8", + "layout": { + "h": 5, + "w": 12, + "x": 0, + "y": 10, + "i": "8ddc5ec1-4ccb-421c-9976-1a3c0cd416b8", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "refId": "A", + "expr": "sum(rate(rest_client_requests_total{job=\"kube-proxy\", ident=~\"$ident\",code=~\"2..\"}[5m]))", + "legend": "2xx" + }, + { + "refId": "B", + "expr": "sum(rate(rest_client_requests_total{job=\"kube-proxy\", ident=~\"$ident\",code=~\"3..\"}[5m]))", + "legend": "3xx" + }, + { + "refId": "C", + "expr": "sum(rate(rest_client_requests_total{job=\"kube-proxy\", ident=~\"$ident\",code=~\"4..\"}[5m]))", + "legend": "4xx" + }, + { + "refId": "D", + "expr": "sum(rate(rest_client_requests_total{job=\"kube-proxy\", ident=~\"$ident\",code=~\"5..\"}[5m]))", + "legend": "5xx" + } + ], + "name": "Kube API Request Rate", + "links": [], + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": { + "steps": [ + { + "color": "#634CD9", + "value": null, + "type": "base" + } + ] + } + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "linear", + "spanNulls": false, + "lineWidth": 1, + "fillOpacity": 0.5, + "gradientMode": "none", + "stack": "off", + "scaleDistribution": { + "type": "linear" + } + } + }, + { + "type": "timeseries", + "id": "ed034ce6-e6ec-4b5b-ad81-660d087c5cf9", + "layout": { + "h": 5, + "w": 12, + "x": 12, + "y": 10, + "i": "ed034ce6-e6ec-4b5b-ad81-660d087c5cf9", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "refId": "A", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-proxy\",ident=~\"$ident\",verb=\"POST\"}[5m])) by (verb, url, le))", + "legend": 
"{{verb}} {{url}}" + } + ], + "name": "Post Request Latency 99th Quantile", + "links": [], + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": { + "steps": [ + { + "color": "#634CD9", + "value": null, + "type": "base" + } + ] + } + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "linear", + "spanNulls": false, + "lineWidth": 1, + "fillOpacity": 0.5, + "gradientMode": "none", + "stack": "off", + "scaleDistribution": { + "type": "linear" + } + } + }, + { + "type": "timeseries", + "id": "f40ae278-17d1-45e3-848a-06f14ad0423d", + "layout": { + "h": 5, + "w": 24, + "x": 0, + "y": 15, + "i": "f40ae278-17d1-45e3-848a-06f14ad0423d", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "refId": "A", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-proxy\", ident=~\"$ident\", verb=\"GET\"}[5m])) by (verb, url, le))", + "legend": "{{verb}} {{url}}" + } + ], + "name": "Get Request Latency 99th Quantile", + "links": [], + "options": { + "tooltip": { + "mode": "all", + "sort": "none" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": { + "steps": [ + { + "color": "#634CD9", + "value": null, + "type": "base" + } + ] + } + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "linear", + "spanNulls": false, + "lineWidth": 1, + "fillOpacity": 0.5, + "gradientMode": "none", + "stack": "off", + "scaleDistribution": { + "type": "linear" + } + } + }, + { + "type": "timeseries", + "id": "047b27f1-7ff9-43ff-bc94-396688422071", + "layout": { + "h": 5, + "w": 8, + "x": 0, + "y": 20, + "i": "047b27f1-7ff9-43ff-bc94-396688422071", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "refId": "A", + "expr": "process_resident_memory_bytes{job=\"kube-proxy\",ident=~\"$ident\"}", 
+ "legend": "{{ident}}" + } + ], + "name": "Memory", + "links": [], + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "util": "bytesIEC", + "decimals": 1 + }, + "thresholds": { + "steps": [ + { + "color": "#634CD9", + "value": null, + "type": "base" + } + ] + } + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "linear", + "spanNulls": false, + "lineWidth": 1, + "fillOpacity": 0.5, + "gradientMode": "none", + "stack": "off", + "scaleDistribution": { + "type": "linear" + } + } + }, + { + "type": "timeseries", + "id": "5103d5ee-2245-45be-ad70-f5410dd02544", + "layout": { + "h": 5, + "w": 8, + "x": 8, + "y": 20, + "i": "5103d5ee-2245-45be-ad70-f5410dd02544", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "refId": "A", + "expr": "rate(process_cpu_seconds_total{job=\"kube-proxy\",ident=~\"$ident\"}[5m])", + "legend": "{{ident}}" + } + ], + "name": "CPU usage", + "links": [], + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": { + "steps": [ + { + "color": "#634CD9", + "value": null, + "type": "base" + } + ] + } + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "linear", + "spanNulls": false, + "lineWidth": 1, + "fillOpacity": 0.5, + "gradientMode": "none", + "stack": "off", + "scaleDistribution": { + "type": "linear" + } + } + }, + { + "type": "timeseries", + "id": "1d15c866-c83b-408b-98e7-e26ada5e7cfc", + "layout": { + "h": 5, + "w": 8, + "x": 16, + "y": 20, + "i": "1d15c866-c83b-408b-98e7-e26ada5e7cfc", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "refId": "A", + "expr": "go_goroutines{job=\"kube-proxy\",ident=~\"$ident\"}", + "legend": "{{ident}}" + } + ], + "name": "Goroutines", + "links": [], + "options": { + "tooltip": { + "mode": 
"all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": { + "steps": [ + { + "color": "#634CD9", + "value": null, + "type": "base" + } + ] + } + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "linear", + "spanNulls": false, + "lineWidth": 1, + "fillOpacity": 0.5, + "gradientMode": "none", + "stack": "off", + "scaleDistribution": { + "type": "linear" + } + } + } + ] + } +} \ No newline at end of file diff --git a/inputs/kubelet/README.md b/inputs/kubelet/README.md new file mode 100644 index 000000000..4370e69d5 --- /dev/null +++ b/inputs/kubelet/README.md @@ -0,0 +1,24 @@ +# Kubernetes Kubelet Input Plugin + +This component is not an independent Go native input plugin. Instead, it leverages Categraf's `prometheus` scraping capabilities to collect metrics exposed directly by the Kubernetes node component Kubelet (via its `/metrics` and `/metrics/cadvisor` endpoints). + +## Configuration + +To scrape Kubelet metrics, you should configure the `prometheus` plugin. We have prepared a dedicated scraping template for Kubelet in the example configuration directory. + +Reference configuration: [kubelet.toml](../../conf/input.prometheus/kubelet.toml) + +Steps: +1. Copy the reference configuration `kubelet.toml` to your Categraf `conf/input.prometheus/` directory. +2. Ensure that Categraf (usually deployed as a DaemonSet on each Node) can access the Kubelet API on the current node. This often involves using the Node IP and a service account token. +3. Configure the correct authentication in `kubelet.toml` according to your Kubernetes cluster's security setup (e.g., TLS settings, token file paths). 
+ +## Metrics and Dashboards + +Key metrics exposed by Kubelet include: +- Pod running status and volume mount status on the node +- Latency of Kubelet's own API operations (`kubelet_runtime_operations_duration_seconds`) +- OOM events, PLEG latency +- Built-in cAdvisor container metrics (like `container_cpu_usage_seconds_total`) + +A matched Dashboard (`dashboard.json`) is provided in this directory. You can import this dashboard into Grafana or Nightingale to monitor the operational status of your Kubelet instances and containers. diff --git a/inputs/kubelet/README_CN.md b/inputs/kubelet/README_CN.md new file mode 100644 index 000000000..a7261ae91 --- /dev/null +++ b/inputs/kubelet/README_CN.md @@ -0,0 +1,24 @@ +# Kubernetes Kubelet 采集插件 + +该组件并非独立的 Go 原生采集插件,而是利用 Categraf 的 `prometheus` 抓取能力来采集 Kubernetes 节点组件 Kubelet 暴露的 metrics 数据 (`/metrics` 和 `/metrics/cadvisor` 等接口)。 + +## 配置说明 + +要采集 Kubelet 的指标,请使用并修改 Categraf 的 `prometheus` 插件配置。我们在示例配置中已经准备好了一个专用于 Kubelet 的抓取模板。 + +参考配置:[kubelet.toml](../../conf/input.prometheus/kubelet.toml) + +具体步骤: +1. 将参考配置文件 `kubelet.toml` 复制到您的 Categraf `conf/input.prometheus/` 目录下。 +2. 确保 Categraf 作为 DaemonSet 部署在每个 Node 上时,可以访问到当前节点的 Kubelet API(通常通过挂载 Node 的 IP 和相应的认证 Token 获取)。 +3. 
根据您的 Kubernetes 集群的安全配置(如是否需要 TLS,Token 文件路径),在 `kubelet.toml` 中配置正确的认证信息。 + +## 采集指标与监控大盘 + +Kubelet 暴露的指标主要包含: +- 节点的 Pod 运行状态、卷挂载状态 +- Kubelet 自身的 API 操作延迟 (`kubelet_runtime_operations_duration_seconds`) +- OOM 记录、PLEG 延迟 +- 内置的 cAdvisor 容器指标 (`container_cpu_usage_seconds_total` 等) + +本目录下提供了一个配套的 Dashboard (`dashboard.json`),您可以在 Grafana 或夜莺中导入该看板来观测您的 Kubelet 和容器运行状态。 diff --git a/inputs/kubelet/dashboard.json b/inputs/kubelet/dashboard.json new file mode 100644 index 000000000..1f03985f7 --- /dev/null +++ b/inputs/kubelet/dashboard.json @@ -0,0 +1,444 @@ +{ + "name": "Kubelet metrics by ident", + "tags": "", + "ident": "", + "configs": { + "var": [ + { + "name": "ident", + "type": "query", + "definition": "label_values(kubelet_running_pods, ident)", + "multi": true, + "allOption": true + } + ], + "panels": [ + { + "type": "stat", + "id": "d3caf396-b3a1-449b-acec-f550967889e6", + "layout": { + "h": 3, + "w": 4, + "x": 0, + "y": 0, + "i": "d3caf396-b3a1-449b-acec-f550967889e6", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "refId": "A", + "expr": "sum(up{job=\"kubelet\"})" + } + ], + "name": "Kubelet UP", + "custom": { + "textMode": "value", + "colorMode": "background", + "calc": "lastNotNull", + "valueField": "Value", + "colSpan": 1, + "textSize": {} + }, + "options": { + "standardOptions": {}, + "thresholds": { + "steps": [ + { + "color": "#634CD9", + "value": null, + "type": "base" + } + ] + } + } + }, + { + "targets": [ + { + "refId": "A", + "expr": "sum(kubelet_running_pods{ident=~\"$ident\"})" + } + ], + "name": "Running Pods", + "custom": { + "textMode": "valueAndName", + "colorMode": "value", + "calc": "lastNotNull", + "colSpan": 1, + "textSize": {} + }, + "options": { + "standardOptions": {} + }, + "version": "2.0.0", + "type": "stat", + "layout": { + "h": 3, + "w": 4, + "x": 4, + "y": 0, + "i": "38c38b23-a7e3-4177-8c41-3ce955ea0434", + "isResizable": true + }, + "id": 
"38c38b23-a7e3-4177-8c41-3ce955ea0434" + }, + { + "targets": [ + { + "refId": "A", + "expr": "sum(kubelet_running_containers{ident=~\"$ident\", container_state=\"running\"})" + } + ], + "name": "Running Containers", + "custom": { + "textMode": "valueAndName", + "colorMode": "value", + "calc": "lastNotNull", + "colSpan": 1, + "textSize": {} + }, + "options": { + "standardOptions": {} + }, + "version": "2.0.0", + "type": "stat", + "layout": { + "h": 3, + "w": 4, + "x": 8, + "y": 0, + "i": "26bf2320-fcff-48f8-a6fc-aa9076bb9329", + "isResizable": true + }, + "id": "525859b9-91d7-4180-b363-bf8ceec977d8" + }, + { + "targets": [ + { + "refId": "A", + "expr": "sum(volume_manager_total_volumes{ident=~\"$ident\", state=\"desired_state_of_world\"})" + } + ], + "name": "Desired Volumes", + "custom": { + "textMode": "valueAndName", + "colorMode": "value", + "calc": "lastNotNull", + "colSpan": 1, + "textSize": {} + }, + "options": { + "standardOptions": {} + }, + "version": "2.0.0", + "type": "stat", + "layout": { + "h": 3, + "w": 4, + "x": 12, + "y": 0, + "i": "54ae4ab3-e932-418c-a637-f2f515cce1b9", + "isResizable": true + }, + "id": "84af4617-2ae0-4b30-a82a-6e8586342224" + }, + { + "targets": [ + { + "refId": "A", + "expr": "sum(volume_manager_total_volumes{ident=~\"$ident\", state=\"actual_state_of_world\"})" + } + ], + "name": "Actual Volumes", + "custom": { + "textMode": "valueAndName", + "colorMode": "value", + "calc": "lastNotNull", + "colSpan": 1, + "textSize": {} + }, + "options": { + "standardOptions": {} + }, + "version": "2.0.0", + "type": "stat", + "layout": { + "h": 3, + "w": 4, + "x": 16, + "y": 0, + "i": "d9de76d7-2203-40e7-a792-9888ec869e82", + "isResizable": true + }, + "id": "d431f4bd-9115-41d2-a494-1d680bdd1e0f" + }, + { + "targets": [ + { + "refId": "A", + "expr": "sum(increase(kubelet_runtime_operations_errors_total{ident=~\"$ident\"}[5m]))" + } + ], + "name": "OP Errors in 5min", + "custom": { + "textMode": "value", + "colorMode": "value", + "calc": 
"lastNotNull", + "colSpan": 1, + "textSize": {} + }, + "options": { + "valueMappings": [ + { + "type": "range", + "match": { + "from": 1 + }, + "result": { + "color": "#d0021b" + } + }, + { + "type": "range", + "match": { + "to": 1 + }, + "result": { + "color": "#417505" + } + } + ], + "standardOptions": {} + }, + "version": "2.0.0", + "type": "stat", + "layout": { + "h": 3, + "w": 4, + "x": 20, + "y": 0, + "i": "bf2bbd15-347d-404c-9b8f-e524875befe2", + "isResizable": true + }, + "id": "54de62bc-8af3-4c27-8b8e-1af567b363fc" + }, + { + "type": "row", + "id": "730d4a9b-791f-4aaf-a042-668f66e73814", + "name": "Operations", + "collapsed": true, + "layout": { + "h": 1, + "w": 24, + "x": 0, + "y": 3, + "i": "730d4a9b-791f-4aaf-a042-668f66e73814", + "isResizable": false + }, + "panels": [] + }, + { + "targets": [ + { + "refId": "A", + "expr": "increase(kubelet_runtime_operations_total{ident=~\"$ident\"}[5m])" + } + ], + "name": "Operations in 5min", + "options": { + "tooltip": { + "mode": "all", + "sort": "none" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 12, + "x": 0, + "y": 4, + "i": "d26e6818-6704-492a-8cbf-58473dd85716", + "isResizable": true + }, + "id": "d26e6818-6704-492a-8cbf-58473dd85716" + }, + { + "targets": [ + { + "refId": "A", + "expr": "increase(kubelet_runtime_operations_errors_total{ident=~\"$ident\"}[5m])" + } + ], + "name": "Operation Errors in 5min", + "options": { + "tooltip": { + "mode": "all", + "sort": "none" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 12, + "x": 
12, + "y": 4, + "i": "4e585d2f-c61c-4350-86ec-dca7ddc34ceb", + "isResizable": true + }, + "id": "09a6ad5b-8c0e-4f17-b17f-3ebc514f7d20" + }, + { + "targets": [ + { + "refId": "A", + "expr": "increase(kubelet_runtime_operations_duration_seconds_sum{ident=~\"$ident\"}[1h])/increase(kubelet_runtime_operations_duration_seconds_count{ident=~\"$ident\"}[1h])" + } + ], + "name": "Average Operation duration in 1 hour (Unit: Second)", + "options": { + "tooltip": { + "mode": "all", + "sort": "none" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 24, + "x": 0, + "y": 8, + "i": "b5e56f3e-fa20-4c19-8578-c0610fa0a7e7", + "isResizable": true + }, + "id": "b5e56f3e-fa20-4c19-8578-c0610fa0a7e7" + }, + { + "type": "row", + "id": "dd7e84c5-03ce-467c-871a-aa110fe051f4", + "name": "PLEG relist", + "collapsed": true, + "layout": { + "h": 1, + "w": 24, + "x": 0, + "y": 12, + "i": "dd7e84c5-03ce-467c-871a-aa110fe051f4", + "isResizable": false + } + }, + { + "targets": [ + { + "refId": "A", + "expr": "rate(kubelet_pleg_relist_duration_seconds_count{ident=~\"$ident\"}[1h])" + } + ], + "name": "relist rate", + "options": { + "tooltip": { + "mode": "all", + "sort": "none" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 12, + "x": 0, + "y": 13, + "i": "f3822da8-a9c9-4db1-ba12-465d3ece823e", + "isResizable": true + }, + "id": "f3822da8-a9c9-4db1-ba12-465d3ece823e" + }, + { + "targets": [ + { + "refId": "A", + "expr": 
"increase(kubelet_pleg_relist_duration_seconds_sum{ident=~\"$ident\"}[1h])/increase(kubelet_pleg_relist_duration_seconds_count{ident=~\"$ident\"}[1h])" + } + ], + "name": "relist duration (Unit: Second)", + "options": { + "tooltip": { + "mode": "all", + "sort": "none" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 12, + "x": 12, + "y": 13, + "i": "2b4ada76-6c30-42cd-9bd3-c939b4c0139c", + "isResizable": true + }, + "id": "a6e4c914-bfca-4419-a264-f5b1cbab261a" + } + ], + "version": "2.0.0" + } +} diff --git a/inputs/kubernetes/README.md b/inputs/kubernetes/README.md index 17bead332..bd67a5ca7 100644 --- a/inputs/kubernetes/README.md +++ b/inputs/kubernetes/README.md @@ -1,6 +1,6 @@ # kubernetes -forked from [telegraf/kubernetes](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/kubernetes). 这个插件的作用是通过kubelet提供的API获取监控数据,包括系统容器的监控数据、node的、pod数据卷的、pod网络的、pod容器的 +forked from telegraf/kubernetes. 这个插件的作用是通过kubelet提供的API获取监控数据,包括系统容器的监控数据、node的、pod数据卷的、pod网络的、pod容器的 ## Change diff --git a/inputs/ldap/README.md b/inputs/ldap/README.md index d075d5687..a2d05b491 100644 --- a/inputs/ldap/README.md +++ b/inputs/ldap/README.md @@ -1,114 +1,53 @@ # LDAP Input Plugin -This plugin gathers metrics from LDAP servers' monitoring (`cn=Monitor`) -backend. Currently this plugin supports [OpenLDAP](https://www.openldap.org/) -and [389ds](https://www.port389.org/) servers. +This plugin gathers metrics from LDAP servers' monitoring (`cn=Monitor`) backend. +Currently, this plugin supports [OpenLDAP](https://www.openldap.org/) and [389ds](https://www.port389.org/) servers. -To use this plugin you must enable the monitoring backend/plugin of your LDAP -server. 
See -[OpenLDAP](https://www.openldap.org/devel/admin/monitoringslapd.html) or 389ds -documentation for details. +To use this plugin, you **must** enable the monitoring backend/plugin of your LDAP server. +See [OpenLDAP Monitoring](https://www.openldap.org/devel/admin/monitoringslapd.html) or 389ds documentation for details. -## Metrics - -Depending on the server dialect, different metrics are produced. The metrics -are usually named according to the selected dialect. - -### Tags +## Configuration -- server -- Server name or IP -- port -- Port used for connecting +```toml +# Collect LDAP monitoring metrics +[[instances]] +# LDAP server host and port +server = "localhost" +port = 389 -## Example Output +# SSL/TLS options +# insecure_skip_verify = false +# starttls = false -Using the `openldap` dialect - -```text -openldap_modify_operations_completed agent_hostname=zy-fat port=389 server=localhost 0 -openldap_referrals_statistics agent_hostname=zy-fat port=389 server=localhost 0 -openldap_unbind_operations_initiated agent_hostname=zy-fat port=389 server=localhost 0 -openldap_delete_operations_completed agent_hostname=zy-fat port=389 server=localhost 0 -openldap_extended_operations_completed agent_hostname=zy-fat port=389 server=localhost 0 -openldap_pdu_statistics agent_hostname=zy-fat port=389 server=localhost 42 -openldap_starting_threads agent_hostname=zy-fat port=389 server=localhost 0 -openldap_active_threads agent_hostname=zy-fat port=389 server=localhost 1 -openldap_uptime_time agent_hostname=zy-fat port=389 server=localhost 102 -openldap_bytes_statistics agent_hostname=zy-fat port=389 server=localhost 3176 -openldap_compare_operations_completed agent_hostname=zy-fat port=389 server=localhost 0 -openldap_bind_operations_completed agent_hostname=zy-fat port=389 server=localhost 1 -openldap_total_connections agent_hostname=zy-fat port=389 server=localhost 1002 -openldap_search_operations_completed agent_hostname=zy-fat port=389 server=localhost 1 
-openldap_abandon_operations_initiated agent_hostname=zy-fat port=389 server=localhost 0 -openldap_add_operations_initiated agent_hostname=zy-fat port=389 server=localhost 0 -openldap_open_threads agent_hostname=zy-fat port=389 server=localhost 1 -openldap_add_operations_completed agent_hostname=zy-fat port=389 server=localhost 0 -openldap_operations_initiated agent_hostname=zy-fat port=389 server=localhost 3 -openldap_write_waiters agent_hostname=zy-fat port=389 server=localhost 0 -openldap_entries_statistics agent_hostname=zy-fat port=389 server=localhost 41 -openldap_modrdn_operations_completed agent_hostname=zy-fat port=389 server=localhost 0 -openldap_pending_threads agent_hostname=zy-fat port=389 server=localhost 0 -openldap_max_pending_threads agent_hostname=zy-fat port=389 server=localhost 0 -openldap_bind_operations_initiated agent_hostname=zy-fat port=389 server=localhost 1 -openldap_max_file_descriptors_connections agent_hostname=zy-fat port=389 server=localhost 1024 -openldap_compare_operations_initiated agent_hostname=zy-fat port=389 server=localhost 0 -openldap_search_operations_initiated agent_hostname=zy-fat port=389 server=localhost 2 -openldap_modrdn_operations_initiated agent_hostname=zy-fat port=389 server=localhost 0 -openldap_read_waiters agent_hostname=zy-fat port=389 server=localhost 1 -openldap_backload_threads agent_hostname=zy-fat port=389 server=localhost 1 -openldap_current_connections agent_hostname=zy-fat port=389 server=localhost 1 -openldap_unbind_operations_completed agent_hostname=zy-fat port=389 server=localhost 0 -openldap_delete_operations_initiated agent_hostname=zy-fat port=389 server=localhost 0 -openldap_extended_operations_initiated agent_hostname=zy-fat port=389 server=localhost 0 -openldap_modify_operations_initiated agent_hostname=zy-fat port=389 server=localhost 0 -openldap_max_threads agent_hostname=zy-fat port=389 server=localhost 16 -openldap_abandon_operations_completed agent_hostname=zy-fat port=389 
server=localhost 0 -openldap_operations_completed agent_hostname=zy-fat port=389 server=localhost 2 -openldap_database_2_databases agent_hostname=zy-fat port=389 server=localhost 0 +# Bind DN and password (must have read access to the cn=Monitor tree) +# bind_dn = "" +# bind_password = "" ``` -Using the `389ds` dialect +## Metrics -```text -389ds_current_connections_at_max_threads agent_hostname=zy-fat port=389 server=localhost 0 -389ds_connections_max_threads agent_hostname=zy-fat port=389 server=localhost 0 -389ds_add_operations agent_hostname=zy-fat port=389 server=localhost 0 -389ds_dtablesize agent_hostname=zy-fat port=389 server=localhost 63936 -389ds_strongauth_binds agent_hostname=zy-fat port=389 server=localhost 13 -389ds_modrdn_operations agent_hostname=zy-fat port=389 server=localhost 0 -389ds_maxthreads_per_conn_hits agent_hostname=zy-fat port=389 server=localhost 0 -389ds_current_connections agent_hostname=zy-fat port=389 server=localhost 2 -389ds_security_errors agent_hostname=zy-fat port=389 server=localhost 0 -389ds_entries_sent agent_hostname=zy-fat port=389 server=localhost 13 -389ds_cache_entries agent_hostname=zy-fat port=389 server=localhost 0 -389ds_backends agent_hostname=zy-fat port=389 server=localhost 0 -389ds_threads agent_hostname=zy-fat port=389 server=localhost 17 -389ds_connections agent_hostname=zy-fat port=389 server=localhost 2 -389ds_read_operations agent_hostname=zy-fat port=389 server=localhost 0 -389ds_entries_returned agent_hostname=zy-fat port=389 server=localhost 13 -389ds_unauth_binds agent_hostname=zy-fat port=389 server=localhost 0 -389ds_search_operations agent_hostname=zy-fat port=389 server=localhost 14 -389ds_simpleauth_binds agent_hostname=zy-fat port=389 server=localhost 0 -389ds_operations_completed agent_hostname=zy-fat port=389 server=localhost 51 -389ds_connections_in_max_threads agent_hostname=zy-fat port=389 server=localhost 0 -389ds_modify_operations agent_hostname=zy-fat port=389 server=localhost 0 
-389ds_wholesubtree_search_operations agent_hostname=zy-fat port=389 server=localhost 1 -389ds_read_waiters agent_hostname=zy-fat port=389 server=localhost 0 -389ds_compare_operations agent_hostname=zy-fat port=389 server=localhost 0 -389ds_errors agent_hostname=zy-fat port=389 server=localhost 13 -389ds_in_operations agent_hostname=zy-fat port=389 server=localhost 52 -389ds_total_connections agent_hostname=zy-fat port=389 server=localhost 15 -389ds_cache_hits agent_hostname=zy-fat port=389 server=localhost 0 -389ds_list_operations agent_hostname=zy-fat port=389 server=localhost 0 -389ds_referrals_returned agent_hostname=zy-fat port=389 server=localhost 0 -389ds_copy_entries agent_hostname=zy-fat port=389 server=localhost 0 -389ds_operations_initiated agent_hostname=zy-fat port=389 server=localhost 52 -389ds_chainings agent_hostname=zy-fat port=389 server=localhost 0 -389ds_bind_security_errors agent_hostname=zy-fat port=389 server=localhost 0 -389ds_onelevel_search_operations agent_hostname=zy-fat port=389 server=localhost 0 -389ds_bytes_sent agent_hostname=zy-fat port=389 server=localhost 1702 -389ds_bytes_received agent_hostname=zy-fat port=389 server=localhost 0 -389ds_referrals agent_hostname=zy-fat port=389 server=localhost 0 -389ds_delete_operations agent_hostname=zy-fat port=389 server=localhost 0 -389ds_anonymous_binds agent_hostname=zy-fat port=389 server=localhost 0 -``` +Depending on the server dialect, different metrics are produced. 
+### Tags +All metrics will be tagged with the following: +- `server`: Server name or IP +- `port`: Port used for connecting + +### OpenLDAP Metrics +Metrics start with `openldap_`, such as: +- `openldap_active_threads` +- `openldap_total_connections` +- `openldap_current_connections` +- `openldap_bytes_statistics` +- `openldap_bind_operations_completed` +- `openldap_search_operations_completed` +- `openldap_uptime_time` + +### 389ds Metrics +Metrics start with `389ds_`, such as: +- `389ds_current_connections` +- `389ds_threads` +- `389ds_operations_completed` +- `389ds_search_operations` +- `389ds_errors` +- `389ds_bytes_sent` diff --git a/inputs/ldap/README_CN.md b/inputs/ldap/README_CN.md new file mode 100644 index 000000000..1d4b428fa --- /dev/null +++ b/inputs/ldap/README_CN.md @@ -0,0 +1,53 @@ +# LDAP 采集插件 + +该插件通过查询 LDAP 服务器的监控后端 (`cn=Monitor`) 来采集指标数据。 +目前,此插件支持采集 **OpenLDAP** 和 **389ds** 两种 LDAP 服务器。 + +在使用此插件之前,您**必须**在您的 LDAP 服务器上开启相应的监控后端或监控插件。 +详细步骤可参考 [OpenLDAP Monitor 说明](https://www.openldap.org/devel/admin/monitoringslapd.html) 或 389ds 的相关文档。 + +## 配置说明 + +```toml +# 采集 LDAP 监控指标 +[[instances]] +# LDAP 服务器的连接地址和端口 +server = "localhost" +port = 389 + +# 是否使用 SSL/TLS 加密 +# insecure_skip_verify = false +# starttls = false + +# LDAP 绑定的账户名与密码 (需具有读取 cn=Monitor 树的权限) +# bind_dn = "" +# bind_password = "" +``` + +## 采集指标 + +根据所连接的 LDAP 服务器的底层实现(方言 dialect),插件会生成不同命名的指标。 + +### Tags +所有的指标都会带上以下两个默认标签: +- `server`: 连接的服务器名称或 IP +- `port`: 连接的端口 + +### OpenLDAP 指标 +前缀通常为 `openldap_`,常见的有: +- `openldap_active_threads`: 活跃线程数 +- `openldap_total_connections`: 累计建立的连接总数 +- `openldap_current_connections`: 当前并发的连接数 +- `openldap_bytes_statistics`: 字节统计 +- `openldap_bind_operations_completed`: 成功的绑定操作数 +- `openldap_search_operations_completed`: 成功的查询操作数 +- `openldap_uptime_time`: 正常运行时间 (秒) + +### 389ds 指标 +前缀通常为 `389ds_`,常见的有: +- `389ds_current_connections`: 当前连接数 +- `389ds_threads`: 当前线程数 +- `389ds_operations_completed`: 完成的操作总数 +- `389ds_search_operations`: 
查询操作数 +- `389ds_errors`: 错误数 +- `389ds_bytes_sent`: 发送的字节数 diff --git a/inputs/ldap/dashboard.json b/inputs/ldap/dashboard.json new file mode 100644 index 000000000..c16e8e0ad --- /dev/null +++ b/inputs/ldap/dashboard.json @@ -0,0 +1,106 @@ +{ + "title": "LDAP Server Metrics", + "uid": "2c8ba34a", + "tags": [ + "ldap server metrics" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "OpenLDAP Current Connections", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "openldap_current_connections", + "legendFormat": "{{server}}:{{port}}", + "refId": "A" + } + ] + }, + { + "title": "OpenLDAP Bind Operations/s", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "rate(openldap_bind_operations_completed[5m])", + "legendFormat": "{{server}}:{{port}}", + "refId": "A" + } + ] + }, + { + "title": "OpenLDAP Search Operations/s", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "rate(openldap_search_operations_completed[5m])", + "legendFormat": "{{server}}:{{port}}", + "refId": "A" + } + ] + }, + { + "title": "389ds Current Connections", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "389ds_current_connections", + "legendFormat": "{{server}}:{{port}}", + "refId": "A" + } + ] + }, + { + "title": "389ds Errors/s", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 16, + "w": 12, + "h": 8 + }, + "id": 5, + "targets": [ + { + "expr": "rate(389ds_errors[5m])", + "legendFormat": "{{server}}:{{port}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/linux_sysctl_fs/README.md b/inputs/linux_sysctl_fs/README.md index a2e0e1aee..f236d8514 100644 --- 
a/inputs/linux_sysctl_fs/README.md +++ b/inputs/linux_sysctl_fs/README.md @@ -1,3 +1,32 @@ -# linux_sysctl_fs +# Linux Sysctl FS Input Plugin -采集一些 /proc/sys/fs 下的内容 +This plugin collects Linux kernel filesystem-level parameter metrics, directly sourced from the `/proc/sys/fs/` directory. +It is highly recommended for monitoring system-wide file descriptor limits (file-max) and kernel inode/dentry cache usage. + +**Supported Platforms:** Linux + +## Configuration + +```toml +# Collect Linux system file descriptor and inode status limits +[[instances]] +# This plugin requires no special configuration. Just enable it. +``` + +## Metrics + +All collected metrics are prefixed with `linux_sysctl_fs_`. +Key metrics include: + +- `linux_sysctl_fs_file-nr`: Number of allocated file handles +- `linux_sysctl_fs_file-max`: Maximum number of allowed file handles +- `linux_sysctl_fs_inode-nr`: Number of allocated inodes +- `linux_sysctl_fs_inode-free-nr`: Number of free inodes +- `linux_sysctl_fs_dentry-nr`: Number of dentry cache entries +- `linux_sysctl_fs_dentry-unused-nr`: Number of unused dentry cache entries +- `linux_sysctl_fs_aio-nr`: Current number of asynchronous I/O (AIO) requests +- `linux_sysctl_fs_aio-max-nr`: Maximum allowed number of AIO requests + +## Dashboards + +These metrics reflect critical system-level limits, especially the ratio between `file-nr` and `file-max` (File Descriptor Usage Rate). We have provided a default Dashboard to help you track these core limitations. 
diff --git a/inputs/linux_sysctl_fs/README_CN.md b/inputs/linux_sysctl_fs/README_CN.md new file mode 100644 index 000000000..737ff3fd3 --- /dev/null +++ b/inputs/linux_sysctl_fs/README_CN.md @@ -0,0 +1,32 @@ +# Linux Sysctl FS 采集插件 + +该插件用于采集 Linux 内核文件系统级别的参数指标,这些指标直接来源于 `/proc/sys/fs/` 目录。 +它非常适合用来监控系统级的文件描述符限制 (file-max) 以及内核 inode/dentry 缓存状态。 + +**支持平台:** Linux + +## 配置说明 + +```toml +# 采集 Linux 系统文件句柄与 Inode 等限制状态 +[[instances]] +# 该插件无需任何特殊配置,启用即可。 +``` + +## 采集指标 + +所有收集到的指标名称前缀为 `linux_sysctl_fs_`。 +主要指标如下: + +- `linux_sysctl_fs_file-nr`: 系统当前已经分配的文件句柄数 +- `linux_sysctl_fs_file-max`: 系统允许分配的最大文件句柄数 +- `linux_sysctl_fs_inode-nr`: 当前分配的 inode 数量 +- `linux_sysctl_fs_inode-free-nr`: 当前空闲的 inode 数量 +- `linux_sysctl_fs_dentry-nr`: dentry 缓存的数量 +- `linux_sysctl_fs_dentry-unused-nr`: 未使用的 dentry 缓存数量 +- `linux_sysctl_fs_aio-nr`: 当前的异步 I/O (AIO) 请求数量 +- `linux_sysctl_fs_aio-max-nr`: 允许的最大异步 I/O 请求数量 + +## 监控大盘 + +这些指标反映了极其重要的系统级限制,特别是 `file-nr` 和 `file-max` 的比例(文件描述符使用率)。我们为您准备了默认的 Dashboard 来追踪这几个核心限制。 diff --git a/inputs/linux_sysctl_fs/dashboard.json b/inputs/linux_sysctl_fs/dashboard.json new file mode 100644 index 000000000..dac1f9389 --- /dev/null +++ b/inputs/linux_sysctl_fs/dashboard.json @@ -0,0 +1,103 @@ +{ + "title": "Linux Sysctl FS Limits", + "uid": "46858063", + "tags": [ + "linux sysctl fs limits" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "File Descriptor Usage %", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "linux_sysctl_fs_file\\-nr / linux_sysctl_fs_file\\-max * 100", + "legendFormat": "{{agent_hostname}}", + "refId": "A" + } + ] + }, + { + "title": "File Descriptors Allocated", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "linux_sysctl_fs_file\\-nr", + "legendFormat": "Allocated", + "refId": "A" + }, + { + "expr": "linux_sysctl_fs_file\\-max", + 
"legendFormat": "Max", + "refId": "B" + } + ] + }, + { + "title": "Inode Cache", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "linux_sysctl_fs_inode\\-nr", + "legendFormat": "Total Inodes", + "refId": "A" + }, + { + "expr": "linux_sysctl_fs_inode\\-free\\-nr", + "legendFormat": "Free Inodes", + "refId": "B" + } + ] + }, + { + "title": "AIO Usage", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "linux_sysctl_fs_aio\\-nr", + "legendFormat": "AIO Allocated", + "refId": "A" + }, + { + "expr": "linux_sysctl_fs_aio\\-max\\-nr", + "legendFormat": "AIO Max", + "refId": "B" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/mem/README.md b/inputs/mem/README.md index 3df923af1..ee68d3b7b 100644 --- a/inputs/mem/README.md +++ b/inputs/mem/README.md @@ -1,7 +1,32 @@ -# mem +# Mem (Memory) Input Plugin -内存采集插件,维持默认配置即可。 +This plugin collects host-level memory metrics, including total memory, available memory, usage percentage, and caches. -## 监控大盘 +**Supported Platforms:** Windows, Linux, macOS, BSD, etc. -该插件没有单独的监控大盘,OS 的监控大盘统一放到 system 下面了 \ No newline at end of file +## Configuration + +```toml +# Collect host physical memory metrics +[[instances]] +# Usually requires no specific configuration. Just leave it enabled. +``` + +## Metrics + +All collected metrics are prefixed with `mem_`. 
+Key metrics include: + +- `mem_total`: Total amount of physical memory in bytes +- `mem_available`: Available memory in bytes (The most important metric for evaluating memory pressure) +- `mem_used`: Used memory in bytes +- `mem_used_percent`: Memory usage percentage (%) +- `mem_free`: Absolute free memory in bytes +- `mem_cached`: Memory used by page cache (Linux) +- `mem_buffers`: Memory used by block device buffers (Linux) +- `mem_swap_total` / `mem_swap_free` / `mem_swap_used_percent`: Swap-related metrics + +## Dashboards + +Metrics collected by this plugin are among the most fundamental server monitoring data. Typically, OS memory monitoring is unified under a global **System** dashboard alongside CPU and disk metrics. +For convenience in standalone viewing, a basic Dashboard containing only memory dimensions is also provided in this directory. diff --git a/inputs/mem/README_CN.md b/inputs/mem/README_CN.md new file mode 100644 index 000000000..7b06e6170 --- /dev/null +++ b/inputs/mem/README_CN.md @@ -0,0 +1,32 @@ +# Mem (内存) 采集插件 + +内存采集插件用于收集主机级别的内存使用率、空闲内存、缓存等物理内存相关指标。 + +**支持平台:** Windows, Linux, macOS, BSD 等 + +## 配置说明 + +```toml +# 采集主机物理内存指标 +[[instances]] +# 通常无需任何特殊配置,保持默认启用即可。 +``` + +## 采集指标 + +所有收集到的指标名称前缀为 `mem_`。 +部分核心指标如下: + +- `mem_total`: 总物理内存字节数 +- `mem_available`: 可用内存字节数 (评估系统是否有内存压力的最重要指标) +- `mem_used`: 已用内存字节数 +- `mem_used_percent`: 内存使用率 (%) +- `mem_free`: 绝对空闲的内存字节数 +- `mem_cached`: 页面缓存占用的内存字节数 (Linux) +- `mem_buffers`: 块设备缓存占用的内存字节数 (Linux) +- `mem_swap_total` / `mem_swap_free` / `mem_swap_used_percent`: Swap 相关指标 + +## 监控大盘 + +该插件采集的指标是服务器最基础的监控数据之一。通常,OS 的内存监控大盘会与 CPU、磁盘等指标统一放置在全局的 **System (主机系统)** 大盘下面。 +为方便单独查看,本目录也提供了一个仅包含内存维度的基础 Dashboard。 \ No newline at end of file diff --git a/inputs/mem/dashboard.json b/inputs/mem/dashboard.json new file mode 100644 index 000000000..9a9792294 --- /dev/null +++ b/inputs/mem/dashboard.json @@ -0,0 +1,93 @@ +{ + "title": "System Memory Metrics", + "uid": "c30cc572", + "tags": [ + "system 
memory metrics" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Memory Usage Percent", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "mem_used_percent", + "legendFormat": "{{agent_hostname}}", + "refId": "A" + } + ] + }, + { + "title": "Memory Available (Bytes)", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "mem_available", + "legendFormat": "{{agent_hostname}}", + "refId": "A" + } + ] + }, + { + "title": "Memory Swap Usage Percent", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "mem_swap_used_percent", + "legendFormat": "{{agent_hostname}}", + "refId": "A" + } + ] + }, + { + "title": "Memory Cached/Buffers (Bytes)", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "mem_cached", + "legendFormat": "Cached: {{agent_hostname}}", + "refId": "A" + }, + { + "expr": "mem_buffers", + "legendFormat": "Buffers: {{agent_hostname}}", + "refId": "B" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/mongodb/README.md b/inputs/mongodb/README.md index da6fd9b9e..848f72054 100644 --- a/inputs/mongodb/README.md +++ b/inputs/mongodb/README.md @@ -1,11 +1,11 @@ # mongodb -mongodb 监控采集插件,由mongodb-exporter(https://github.com/percona/mongodb_exporter) 封装而来。v0.3.30-v0.3.42从 [telegraf/mongodb](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/mongodb) fork。 +mongodb 监控采集插件,由mongodb-exporter(https://github.com/percona/mongodb_exporter) 封装而来。v0.3.30-v0.3.42从telegraf/mongodb fork。 ## Configuration - + - 配置文件,[参考示例](../../conf/input.mongodb/mongodb.toml) - 配置权限,至少授予以下权限给配置文件中用于连接 MongoDB 的 user 才能收集指标: ``` @@ -19,8 +19,6 @@ mongodb 
监控采集插件,由mongodb-exporter(https://github.com/percona/mo } ``` - - 一个简单配置 ``` mongo -h xxx -u xxx -p xxx --authenticationDatabase admin @@ -29,25 +27,6 @@ mongodb 监控采集插件,由mongodb-exporter(https://github.com/percona/mo ``` 更详细的权限配置请参考[官方文档](https://www.mongodb.com/docs/manual/reference/built-in-roles/#mongodb-authrole-clusterMonitor) - -### 注意事项 - > 如果MongoDB 开启了operationProfiling仅用以上权限会出现system.profile无权限错误,默认的mongo角色中对system.profile集合只有find权限,要解决这个问题需要创建一个新的角色。 - ``` - db.createRole({ - role: "StatsReader", - privileges: [ - { - resource: { db: "", collection: "system.profile" }, - actions: [ "collStats", "indexStats" ] - } - ], - roles: [] - }) - - # 给categraf用户授权 - db.grantRolesToUser("categraf",[{ role: "StatsReader", db: "admin" }]) - ``` - ## 监控大盘和告警规则 同级目录下的 dashboard.json、alerts.json 是大盘和告警规则, dashboard2.json 是v0.3.30版本以后的大盘。 diff --git a/inputs/mtail/README_CN.md b/inputs/mtail/README_CN.md new file mode 100644 index 000000000..085dfbd8b --- /dev/null +++ b/inputs/mtail/README_CN.md @@ -0,0 +1,283 @@ +# mtail插件 + +## 简介 +功能:提取日志内容,转换为监控metrics + ++ 输入: 日志 ++ 输出: metrics 按照mtail语法输出, 仅支持counter、gauge、histogram ++ 处理: 本质是golang的正则提取+表达式计算 + +## 启动 +编辑mtail.toml文件, 一般每个instance需要指定不同的progs参数(不同的progs文件或者目录),否则指标会相互干扰。 +**注意**: 如果不同instance使用相同progs, 可以通过给每个instance增加labels做区分, +```toml +labels = { k1="v1" } +``` +或 +```toml +[instances.labels] +k1="v1" +``` + +1. conf/input.mtail/mtail.toml中指定instance +```toml + +[[instances]] +## 指定mtail prog的目录 +progs = "/path/to/prog1" +## 指定mtail要读取的日志 +logs = ["/path/to/a.log", "path/to/b.log"] +## 指定时区 +# override_timezone = "Asia/Shanghai" +## metrics是否带时间戳,注意,这里是"true" +# emit_metric_timestamp = "true" +## 日志编码,支持 gbk, gb18030, gb2312, big5, utf-8, 默认 utf-8 +# encoding = "gbk" + +... +``` +2. 在/path/to/prog1 目录下编写规则文件 +``` +gauge xxx_errors +/ERROR.*/ { + xxx_errors++ +} +``` + +3. 一个tab中执行 `categraf --test --inputs mtail`,用于测试 +4. 另一个tab中,"/path/to/a.log" 或者 "path/to/b.log" 追加一行 ERROR,看看categraf的输出 +5. 
测试通过后,启动categraf + +### 输入 +logs参数指定要处理的日志源, 支持模糊匹配, 支持多个log文件。 + +### 处理规则 +`progs`指定具体的规则文件目录(或文件) + + +## 处理规则与语法 + +### 处理流程 +```python +for line in lines: + for regex in regexes: + if match: + do something +``` + +### 语法 + +``` golang +exported variable + +pattern { + action statements +} + +def decorator { + pattern and action statements +} +``` + +#### 定义指标名称 +前面也提过,指标仅支持 counter gauge histogram 三种类型。 +一个🌰 +```golang +counter lines +/INFO.*/ { + lines++ +} +``` + +注意,定义的名称只支持 C类型的命名方式(字母/数字/下划线),**如果想使用"-" 要使用"as"导出别名**。例如, +```golang +counter lines_total as "line-count" +``` +这样获取到的就是line-count这个指标名称了 + +#### 匹配与计算(pattern/action) + +```golang +PATTERN { +ACTION +} +``` + +例子 +```golang +/foo/ { + ACTION1 +} + +variable > 0 { + ACTION2 +} + +/foo/ && variable > 0 { + ACTION3 +} +``` +支持RE2正则匹配 +```golang +const PREFIX /^\w+\W+\d+ / + +PREFIX { + ACTION1 +} + +PREFIX + /foo/ { + ACTION2 +} +``` + +这样,ACTION1 是匹配以单词字符+非单词字符+数字+空格开头的行,ACTION2 是匹配以单词字符+非单词字符+数字+空格+foo开头的行。 + +#### 关系运算符 ++ `<` 小于 `<=` 小于等于 ++ `>` 大于 `>=` 大于等于 ++ `==` 相等 `!=` 不等 ++ `=~` 匹配(模糊) `!~` 不匹配(模糊) ++ `||` 逻辑或 `&&` 逻辑与 `!` 逻辑非 + +#### 数学运算符 ++ `|` 按位或 ++ `&` 按位与 ++ `^` 按位异或 ++ `+ - * /` 四则运算 ++ `<<` 按位左移 ++ `>>` 按位右移 ++ `**` 指数运算 ++ `=` 赋值 ++ `++` 自增运算 ++ `--` 自减运算 ++ `+=` 加且赋值 + +#### 支持else与otherwise +```golang +/foo/ { +ACTION1 +} else { +ACTION2 +} +``` +支持嵌套 +```golang +/foo/ { + /foo1/ { + ACTION1 + } + /foo2/ { + ACTION2 + } + otherwise { + ACTION3 + } +} +``` + +支持命名与非命名提取 + +```golang +/(?P<operation>\S+) (\S+) \[\S+\] (\S+) \(\S*\) \S+ (?P<bytes>\d+)/ { + bytes_total[$operation][$3] += $bytes +} +``` +增加常量label +```python +# test.mtail +# 定义常量label env +hidden text env +# 给label 赋值 这样定义是global范围; +# 局部添加,则在对应的condition中添加 +env="production" +counter line_total by logfile,env +/^(?P<date>\w+\s+\d+\s+\d+:\d+:\d+)/ { + line_total[getfilename()][env]++ +} +``` +获取到的metrics中会添加上`env=production`的label 如下: +```python +# metrics +line_total{env="production",logfile="/path/to/xxxx.log",prog="test.mtail"} 4 
1661165941788 +``` + +如果要给metrics增加变量label,必须要使用命名提取。例如 +```python +# 日志内容 +192.168.0.1 GET /foo +192.168.0.2 GET /bar +192.168.0.1 POST /bar +``` + +``` python +# test.mtail +counter my_http_requests_total by log_file, verb +/^/ + +/(?P<host>[0-9A-Za-z\.:-]+) / + +/(?P<verb>[A-Z]+) / + +/(?P<url>\S+).*/ + +/$/ { + my_http_requests_total[getfilename()][$verb]++ +} +``` + +```python +# metrics +my_http_requests_total{logfile="xxx.log",verb="GET",prog="test.mtail"} 4242 +my_http_requests_total{logfile="xxx.log",verb="POST",prog="test.mtail"} 42 +``` + +命名提取的变量可以在条件中使用 +```golang +/(?P<x>\d+)/ && $x > 1 { +nonzero_positives++ +} +``` + +#### 时间处理 +不显式处理,则默认使用系统时间 + +默认emit_metric_timestamp="false" (注意是字符串) +``` +http_latency_bucket{prog="histo.mtail",le="1"} 0 +http_latency_bucket{prog="histo.mtail",le="2"} 0 +http_latency_bucket{prog="histo.mtail",le="4"} 0 +http_latency_bucket{prog="histo.mtail",le="8"} 0 +http_latency_bucket{prog="histo.mtail",le="+Inf"} 0 +http_latency_sum{prog="histo.mtail"} 0 +http_latency_count{prog="histo.mtail"} 0 +``` + +参数 emit_metric_timestamp="true" (注意是字符串) +``` +http_latency_bucket{prog="histo.mtail",le="1"} 1 1661152917471 +http_latency_bucket{prog="histo.mtail",le="2"} 2 1661152917471 +http_latency_bucket{prog="histo.mtail",le="4"} 2 1661152917471 +http_latency_bucket{prog="histo.mtail",le="8"} 2 1661152917471 +http_latency_bucket{prog="histo.mtail",le="+Inf"} 2 1661152917471 +http_latency_sum{prog="histo.mtail"} 3 1661152917471 +http_latency_count{prog="histo.mtail"} 4 1661152917471 +``` + +使用日志的时间 +``` +Aug 22 15:28:32 GET /api/v1/pods latency=2s code=200 +Aug 22 15:28:32 GET /api/v1/pods latency=1s code=200 +Aug 22 15:28:32 GET /api/v1/pods latency=0s code=200 +``` + +``` +histogram http_latency buckets 1, 2, 4, 8 +/^(?P<date>\w+\s+\d+\s+\d+:\d+:\d+)/ { + strptime($date, "Jan 02 15:04:05") + /latency=(?P<latency>\d+)/ { + http_latency=$latency + } +} +``` + +日志提取的时间,一定要注意时区问题,有一个参数 `override_timezone` 可以控制时区选择,否则默认使用UTC转换。 +比如我启动时指定 
`override_timezone=Asia/Shanghai`, 这个时候日志提取的时间会当做东八区时间 转换为timestamp, 然后再从timestamp转换为各时区时间时 就没有问题了,如图。 +![timestamp](https://cdn.jsdelivr.net/gh/flashcatcloud/categraf@main/inputs/mtail/timestamp.png) +如果不带 `override_timezone=Asia/Shanghai`, 则默认将`Aug 22 15:34:32` 当做UTC时间,转换为timestamp。 这样再转换为本地时间时,会多了8个小时, 如图。 +![timestamp](https://cdn.jsdelivr.net/gh/flashcatcloud/categraf@main/inputs/mtail/timezone.png) diff --git a/inputs/mtail/Readme.md b/inputs/mtail/Readme.md index 4dea73c5a..bc088fc70 100644 --- a/inputs/mtail/Readme.md +++ b/inputs/mtail/Readme.md @@ -1,61 +1,62 @@ -# mtail插件 +# mtail Input Plugin -## 简介 -功能:提取日志内容,转换为监控metrics +## Introduction +Function: Extracts content from log files and converts them into monitoring metrics. -+ 输入: 日志 -+ 输出: metrics 按照mtail语法输出, 仅支持counter、gauge、histogram -+ 处理: 本质是golang的正则提取+表达式计算 ++ **Input**: Log files ++ **Output**: Metrics generated according to `mtail` syntax (only `counter`, `gauge`, and `histogram` are supported). ++ **Processing**: Essentially regex extraction and expression calculation in Golang. -## 启动 -编辑mtail.toml文件, 一般每个instance需要指定不同的progs参数(不同的progs文件或者目录),否则指标会相互干扰。 -**注意**: 如果不同instance使用相同progs, 可以通过给每个instance增加labels做区分, +## Startup +Edit the `mtail.toml` file. Generally, each instance needs to specify different `progs` (different `mtail` program files or directories) to prevent metrics from interfering with each other. + +**Note**: If different instances share the same `progs`, you can differentiate them by adding `labels` to each instance: ```toml -labels = { k1=v1 } +labels = { k1="v1" } ``` -或 +Or: ```toml [instances.labels] -k1=v1 +k1="v1" ``` -1. conf/inputs.mtail/mtail.toml中指定instance +1. 
Specify the instance in `conf/input.mtail/mtail.toml`: ```toml - [[instances]] -## 指定mtail prog的目录 +## Directory containing mtail progs progs = "/path/to/prog1" -## 指定mtail要读取的日志 +## Log files for mtail to read logs = ["/path/to/a.log", "path/to/b.log"] -## 指定时区 +## Specify timezone overrides if necessary # override_timezone = "Asia/Shanghai" -## metrics是否带时间戳,注意,这里是"true" +## Whether metrics include a timestamp (note: this is a string "true") # emit_metric_timestamp = "true" - +## Log encoding (gbk, gb18030, gb2312, big5, utf-8), default is utf-8 +# encoding = "gbk" ... ``` -2. 在/path/to/prog1 目录下编写规则文件 -``` + +2. Write a rule file in the `/path/to/prog1` directory (e.g. `test.mtail`): +```text gauge xxx_errors /ERROR.*/ { - xxx_errros++ + xxx_errors++ } ``` -3. 一个tab中执行 `categraf --test --inputs mtail`,用于测试 -4. 另一个tab中,"/path/to/a.log" 或者 "path/to/b.log" 追加一行 ERROR,看看categraf的输出 -5. 测试通过后,启动categraf +3. Open a terminal tab and run `categraf --test --inputs mtail` to test. +4. In another tab, append an `ERROR` line to `/path/to/a.log` and observe Categraf's output. +5. Once testing passes, start Categraf normally. -### 输入 -logs参数指定要处理的日志源, 支持模糊匹配, 支持多个log文件。 +### Input +The `logs` parameter specifies the log sources. It supports glob matching and multiple log files. -### 处理规则 -`progs`指定具体的规则文件目录(或文件) +### Processing Rules +`progs` specifies the specific rule file directory (or file). +## Processing Rules and Syntax -## 处理规则与语法 - -### 处理流程 +### Processing Workflow ```python for line in lines: for regex in regexes: @@ -63,9 +64,9 @@ for line in lines: do something ``` -### 语法 +### Syntax Overview -``` golang +```text exported variable pattern { @@ -77,46 +78,33 @@ def decorator { } ``` -#### 定义指标名称 -前面也提过,指标仅支持 counter gauge histogram 三种类型。 -一个🌰 -```golang +#### Defining Metric Names +Only `counter`, `gauge`, and `histogram` types are supported. 
+ +Example: +```text counter lines /INFO.*/ { lines++ } ``` -注意,定义的名称只支持 C类型的命名方式(字母/数字/下划线),**如果想使用"-" 要使用"as"导出别名**。例如, -```golang +Note: Defined names only support C-style naming conventions (letters/numbers/underscores). **If you want to use hyphens "-", use `as` to export an alias**. For example: +```text counter lines_total as "line-count" ``` -这样获取到的就是line-count这个指标名称了 +The exported metric name will be `line-count`. -#### 匹配与计算(pattern/action) +#### Matching and Calculation (pattern/action) -```golang +```text PATTERN { -ACTION + ACTION } ``` -例子 -```golang -/foo/ { - ACTION1 -} - -variable > 0 { - ACTION2 -} - -/foo/ && variable > 0 { - ACTION3 -} -``` -支持RE2正则匹配 -```golang +Support for RE2 regular expressions: +```text const PREFIX /^\w+\W+\d+ / PREFIX { @@ -128,86 +116,65 @@ PREFIX + /foo/ { } ``` -这样,ACTION1 是匹配以小写字符+大写字符+数字+空格的行,ACTION2 是匹配小写字符+大写字符+数字+空格+foo开头的行。 - -#### 关系运算符 -+ `<` 小于 `<=` 小于等于 -+ `>` 大于 `>=` 大于等于 -+ `==` 相等 `!=` 不等 -+ `=~` 匹配(模糊) `!~` 不匹配(模糊) -+ `||` 逻辑或 `&&` 逻辑与 `!` 逻辑非 - -#### 数学运算符 -+ `|` 按位或 -+ `&` 按位与 -+ `^` 按位异或 -+ `+ - * /` 四则运算 -+ `<<` 按位左移 -+ `>>` 按位右移 -+ `**` 指数运算 -+ `=` 赋值 -+ `++` 自增运算 -+ `--` 自减运算 -+ `+=` 加且赋值 - -#### 支持else与otherwise -```golang +#### Relational Operators +- `<` less than, `<=` less than or equal to +- `>` greater than, `>=` greater than or equal to +- `==` equal to, `!=` not equal to +- `=~` match (regex), `!~` does not match (regex) +- `||` logical OR, `&&` logical AND, `!` logical NOT + +#### Mathematical Operators +- `|` bitwise OR, `&` bitwise AND, `^` bitwise XOR +- `+ - * /` basic arithmetic +- `<<` bitwise left shift, `>>` bitwise right shift +- `**` exponentiation +- `=` assignment +- `++` increment, `--` decrement +- `+=` add and assign + +#### Supporting `else` and `otherwise` +```text /foo/ { -ACTION1 + ACTION1 } else { -ACTION2 + ACTION2 } ``` -支持嵌套 -```golang + +Nested blocks and `otherwise` are supported: +```text /foo/ { /foo1/ { ACTION1 } - /foo2/ { - ACTION2 - } otherwise { ACTION3 } } 
``` -支持命名与非命名提取 - -```golang +#### Named and Unnamed Extraction +```text /(?P\S+) (\S+) \[\S+\] (\S+) \(\S*\) \S+ (?P\d+)/ { bytes_total[$operation][$3] += $bytes } ``` -增加常量label -```python -# test.mtail -# 定义常量label env + +Adding constant labels: +```text hidden text env -# 给label 赋值 这样定义是global范围; -# 局部添加,则在对应的condition中添加 env="production" counter line_total by logfile,env /^(?P\w+\s+\d+\s+\d+:\d+:\d+)/ { line_total[getfilename()][env]++ } ``` -获取到的metrics中会添加上`env=production`的label 如下: -```python -# metrics -line_total{env="production",logfile="/path/to/xxxx.log",prog="test.mtail"} 4 1661165941788 -``` -如果要给metrics增加变量label,必须要使用命名提取。例如 -```python -# 日志内容 +To add variable labels to metrics, you **must** use named extraction: +```text +# Log content 192.168.0.1 GET /foo -192.168.0.2 GET /bar -192.168.0.1 POST /bar -``` -``` python # test.mtail counter my_http_requests_total by log_file, verb /^/ + @@ -219,63 +186,26 @@ counter my_http_requests_total by log_file, verb } ``` -```python -# metrics -my_http_requests_total{logfile="xxx.log",verb="GET",prog="test.mtail"} 4242 -my_http_requests_total{logfile="xxx.log",verb="POST",prog="test.mtail"} 42 -``` - -命名提取的变量可以在条件中使用 -```golang +Named extraction variables can be used in conditions: +```text /(?P\d+)/ && $x > 1 { -nonzero_positives++ + nonzero_positives++ } ``` -#### 时间处理 -不显示处理,则默认使用系统时间 +#### Time Processing +By default, the system time is used for metrics (`emit_metric_timestamp="false"`). +If you set `emit_metric_timestamp="true"`, Categraf will attach timestamps. 
-默认emit_metric_timestamp="false" (注意是字符串) -``` -http_latency_bucket{prog="histo.mtail",le="1"} 0 -http_latency_bucket{prog="histo.mtail",le="2"} 0 -http_latency_bucket{prog="histo.mtail",le="4"} 0 -http_latency_bucket{prog="histo.mtail",le="8"} 0 -http_latency_bucket{prog="histo.mtail",le="+Inf"} 0 -http_latency_sum{prog="histo.mtail"} 0 -http_latency_count{prog="histo.mtail"} 0 -``` - -参数 emit_metric_timestamp="true" (注意是字符串) -``` -http_latency_bucket{prog="histo.mtail",le="1"} 1 1661152917471 -http_latency_bucket{prog="histo.mtail",le="2"} 2 1661152917471 -http_latency_bucket{prog="histo.mtail",le="4"} 2 1661152917471 -http_latency_bucket{prog="histo.mtail",le="8"} 2 1661152917471 -http_latency_bucket{prog="histo.mtail",le="+Inf"} 2 1661152917471 -http_latency_sum{prog="histo.mtail"} 3 1661152917471 -http_latency_count{prog="histo.mtail"} 4 1661152917471 -``` - -使用日志的时间 -``` -Aug 22 15:28:32 GET /api/v1/pods latency=2s code=200 -Aug 22 15:28:32 GET /api/v1/pods latency=1s code=200 -Aug 22 15:28:32 GET /api/v1/pods latency=0s code=200 -``` - -``` +You can also parse timestamps from log lines: +```text histogram http_latency buckets 1, 2, 4, 8 /^(?P\w+\s+\d+\s+\d+:\d+:\d+)/ { - strptime($date, "Jan 02 15:04:05") + strptime($date, "Jan 02 15:04:05") /latency=(?P\d+)/ { http_latency=$latency } } ``` -日志提取的时间,一定要注意时区问题,有一个参数 `override_timezone` 可以控制时区选择,否则默认使用UTC转换。 -比如我启动时指定 `override_timezone=Asia/Shanghai`, 这个时候日志提取的时间会当做东八区时间 转换为timestamp, 然后再从timestamp转换为各时区时间时 就没有问题了,如图。 -![timestamp](https://cdn.jsdelivr.net/gh/flashcatcloud/categraf@main/inputs/mtail/timestamp.png) -如果不带 `override_timezone=Asia/Shanghai`, 则默认将`Aug 22 15:34:32` 当做UTC时间,转换为timestamp。 这样再转换为本地时间时,会多了8个小时, 如图。 -![timestamp](https://cdn.jsdelivr.net/gh/flashcatcloud/categraf@main/inputs/mtail/timezone.png) +Pay attention to timezones when extracting time from logs. Use `override_timezone` to control timezone parsing. 
For example, setting `override_timezone="Asia/Shanghai"` ensures that the extracted time is treated as East 8 timezone and properly converted to timestamp. diff --git a/inputs/mtail/dashboard.json b/inputs/mtail/dashboard.json new file mode 100644 index 000000000..a11ffbb47 --- /dev/null +++ b/inputs/mtail/dashboard.json @@ -0,0 +1,34 @@ +{ + "title": "MTail Custom Logs Analytics", + "uid": "7b70b815", + "tags": [ + "mtail custom logs analytics" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Mtail Monitoring Usage Instruction", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 24, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "mtail_prog_load_errors_total", + "legendFormat": "{{prog}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/mysql/README.md b/inputs/mysql/README.md index 965401945..a622df7cb 100644 --- a/inputs/mysql/README.md +++ b/inputs/mysql/README.md @@ -1,103 +1,669 @@ -# mysql +# MySQL 插件 -mysql 监控采集插件,核心原理就是连到 mysql 实例,执行一些 sql,解析输出内容,整理为监控数据上报。 +## 简介 -## Configuration +`mysql` 插件通过连接 MySQL 实例并执行内置 SQL,采集 MySQL 运行状态、全局变量、InnoDB 关键指标、连接分布、库表空间占用、复制状态、Binlog 体积,以及用户自定义 SQL 结果。 -```toml -# # collect interval -# interval = 15 +它适用于以下场景: + +- 监控单实例或多实例 MySQL / Percona Server / MariaDB 的基础健康状态 +- 观测连接数、慢查询、缓存、锁等待、InnoDB Buffer Pool 等数据库核心指标 +- 采集库级、表级磁盘占用,以及主从 / 副本延迟、Binlog 体积 +- 将业务自定义 SQL 结果与内置指标统一纳入 Categraf + +## 限制与兼容性 + +- 插件的前提是“能连上数据库并成功 `Ping`”;`mysql_up = 1` 只表示连接与认证成功,不代表所有可选采集项都成功 +- 不同指标族依赖不同 SQL 能力和数据库权限;权限不足时,通常表现为部分指标缺失并伴随日志报错,而不是整实例 `up = 0` +- 只有当 `address` 以 `.sock` 结尾时,插件才会使用 Unix socket;`localhost` 不会自动切换为 socket 连接 +- 主从 / 副本相关采集同时兼容 `SHOW SLAVE STATUS`、`SHOW ALL SLAVES STATUS`、`SHOW REPLICA STATUS`、`SHOW ALL REPLICAS STATUS`,但最终能输出哪些字段,取决于数据库版本与返回列 +- `gather_replica_status` 虽然走的是 `SHOW REPLICA STATUS` 路径,但当前输出的指标名前缀仍然是 `mysql_slave_...`,这是现有实现的历史兼容行为 +- 代码中存在两套 Binlog 
采集路径:一套默认开启的 `mysql_binlog_*`,一套可选开启的 `mysql_binary_*`;它们不会互相替代 + +## 权限建议 + +如果希望采集大多数内置指标,建议为监控账号授予至少如下权限: + +```sql +GRANT PROCESS, REPLICATION CLIENT ON *.* TO 'categraf'@'%'; +GRANT SELECT ON *.* TO 'categraf'@'%'; +``` + +不同模块的典型权限依赖如下: + +| 模块 | 主要 SQL | 常见权限要求 | 说明 | +| --- | --- | --- | --- | +| 基础存活探测 | `Ping()` | 可登录即可 | 对应 `mysql_up` | +| 全局状态 / 全局变量 | `SHOW GLOBAL STATUS` / `SHOW GLOBAL VARIABLES` | 依 MySQL 版本而异 | 若权限不足,会导致核心状态类指标缺失 | +| InnoDB 状态 | `SHOW ENGINE INNODB STATUS` | `PROCESS` | 无此权限时,InnoDB 状态解析类指标无法获取 | +| Processlist 分布 | `information_schema.processlist` | `PROCESS` | 否则通常只能看到当前账号自己的连接,统计会失真 | +| 库 / 表大小 | `information_schema.tables` | 对目标库有 `SELECT` | 没有权限的库 / 表不会出现在结果中 | +| 主从 / 副本状态 | `SHOW SLAVE STATUS` / `SHOW REPLICA STATUS` | `REPLICATION CLIENT` | 对应复制延迟、线程状态等指标 | +| Binlog 大小 | `SHOW BINARY LOGS` | `REPLICATION CLIENT` | 同时影响 `mysql_binlog_*` 与 `mysql_binary_*` | -# 要监控 MySQL,首先要给出要监控的MySQL的连接地址、用户名、密码 +## 快速开始 + +最小可用配置: + +```toml [[instances]] address = "127.0.0.1:3306" -username = "root" -password = "1234" - -# # set tls=custom to enable tls -# parameters = "tls=false" - -# 通过 show global status监控mysql,默认抓取一些基础指标, -# 如果想抓取更多global status的指标,把下面的配置设置为true -extra_status_metrics = true - -# 通过show global variables监控mysql的全局变量,默认抓取一些常规的 -# 常规的基本够用了,扩展的部分,默认不采集,下面的配置设置为false -extra_innodb_metrics = false - -# 监控processlist,关注较少,默认不采集 -gather_processlist_processes_by_state = false -gather_processlist_processes_by_user = false - -# 监控各个数据库的磁盘占用大小 -gather_schema_size = false - -# 监控所有的table的磁盘占用大小 -gather_table_size = false - -# 是否采集系统表的大小,通过不用,所以默认设置为false -gather_system_table_size = false - -# 通过 show slave status监控slave的情况,比较关键,所以默认采集 -gather_slave_status = true - -# # timeout -# timeout_seconds = 3 - -# # interval = global.interval * interval_times -# interval_times = 1 - -# 为mysql实例附一个instance的标签,因为通过address=127.0.0.1:3306不好区分 -# important! 
use global unique string to specify instance -# labels = { instance="n9e-10.2.3.4:3306" } - -## Optional TLS Config -# use_tls = false -# tls_min_version = "1.2" -# tls_ca = "/etc/categraf/ca.pem" -# tls_cert = "/etc/categraf/cert.pem" -# tls_key = "/etc/categraf/key.pem" -## Use TLS but skip chain & host verification -# insecure_skip_verify = true - -# 自定义SQL,指定SQL、返回的各个列那些是作为metric,哪些是作为label -# [[instances.queries]] -# mesurement = "users" -# metric_fields = [ "total" ] -# label_fields = [ "service" ] -# # field_to_append = "" -# timeout = "3s" -# request = ''' -# select 'n9e' as service, count(*) as total from n9e_v5.users -# ''' +username = "categraf" +password = "" +timeout_seconds = 3 + +# 强烈建议补一个便于区分实例的标签 +labels = { instance = "prod-mysql-01:3306" } ``` -## 监控多个实例 +说明: + +- `address` 以 `.sock` 结尾时走 Unix socket,例如 `/var/run/mysqld/mysqld.sock` +- `username` / `password` 是否允许为空,取决于 MySQL 账户本身的认证方式;插件代码并不强制要求非空 +- `labels` 不是插件专属字段,但在多实例场景下非常建议设置 `instance` 之类的稳定标签 -当主机填写为localhost时mysql会采用 unix domain socket连接 -当主机填写为127.0.0.1时mysql会采用tcp方式连接 -大家最常问的问题是如何监控多个mysql实例,实际大家对toml配置学习一下就了解了,`[[instances]]` 部分表示数组,是可以出现多个的,address参数支持通过unix路径连接 所以,举例: +### TLS 连接示例 + +如果需要由插件注册自定义 TLS 配置,请同时满足两件事: + +1. 设置 `use_tls = true` +2. 
在 `parameters` 中显式传入 `tls=custom` + +示例: + +```toml +[[instances]] +address = "mysql.example.com:3306" +username = "categraf" +password = "" +use_tls = true +parameters = "tls=custom" +tls_ca = "/etc/categraf/ca.pem" +tls_cert = "/etc/categraf/client.pem" +tls_key = "/etc/categraf/client-key.pem" +``` + +如果只设置 `use_tls = true`,但没有在 `parameters` 中加上 `tls=custom`,当前实现不会使用这套自定义 TLS 配置。 + +### 监控多个实例 + +`[[instances]]` 是数组,可以配置多个 MySQL 实例: ```toml [[instances]] address = "10.2.3.6:3306" -username = "root" -password = "1234" -labels = { instance="n9e-10.2.3.6:3306" } +username = "categraf" +password = "" +labels = { instance = "prod-mysql-a:3306" } [[instances]] address = "10.2.6.9:3306" -username = "root" -password = "1234" -labels = { instance="zbx-10.2.6.9:3306" } +username = "categraf" +password = "" +labels = { instance = "prod-mysql-b:3306" } + +[[instances]] +address = "/var/run/mysqld/mysqld.sock" +username = "categraf" +password = "" +labels = { instance = "local-mysql.sock" } +``` + +## 配置项 + +本文重点描述 MySQL 插件相关字段。`interval`、`interval_times`、`labels` 等实例通用字段沿用 Categraf 通用语义。 + +### 常用通用实例字段 + +| 配置项 | 类型 | 默认值 | 说明 | +| --- | --- | --- | --- | +| `interval` | int | 继承全局配置 | 采集周期,单位秒 | +| `interval_times` | int | `1` | 实际采集周期 = 全局 `interval * interval_times` | +| `labels` | map[string]string | 空 | 给当前实例的所有指标追加固定标签;多实例场景强烈建议补一个稳定的 `instance` 标签 | + +### 连接与基础采集 + +| 配置项 | 类型 | 默认值 | 说明 | +| --- | --- | --- | --- | +| `address` | string | 必填 | MySQL 地址。以 `.sock` 结尾时使用 Unix socket,否则使用 TCP | +| `username` | string | 空 | MySQL 用户名 | +| `password` | string | 空 | MySQL 密码 | +| `parameters` | string | 空 | 直接拼到 DSN `?` 后面的参数串,例如 `parseTime=true&loc=Local`;使用自定义 TLS 时需要写 `tls=custom` | +| `timeout_seconds` | int | `3` | 连接 / Ping 超时时间。若 DSN 自身已携带 timeout,则以 DSN 为准 | + +### 内置采集开关 + +| 配置项 | 类型 | 默认值 | 说明 | +| --- | --- | --- | --- | +| `extra_status_metrics` | bool | false | 扩展 `SHOW GLOBAL STATUS` 指标白名单 | +| `extra_innodb_metrics` | bool | false | 扩展 InnoDB 
相关指标白名单;同时额外输出 `mysql_global_status_buffer_pool_pages_used` | +| `gather_processlist_processes_by_state` | bool | false | 采集 `information_schema.processlist` 中按状态归类的连接数 | +| `gather_processlist_processes_by_user` | bool | false | 采集 `information_schema.processlist` 中按用户归类的连接数 | +| `gather_schema_size` | bool | false | 采集库级磁盘占用 | +| `gather_table_size` | bool | false | 采集业务库表级磁盘占用 | +| `gather_system_table_size` | bool | false | 采集系统库(`mysql`、`sys`、`information_schema`、`performance_schema`)表级磁盘占用 | +| `gather_slave_status` | bool | false | 采集 `SHOW SLAVE STATUS` / `SHOW ALL SLAVES STATUS` 的一组精简指标 | +| `gather_binary_logs` | bool | false | 额外采集一组 `mysql_binary_*` Binlog 指标 | +| `gather_replica_status` | bool | false | 采集 `SHOW REPLICA STATUS` / `SHOW ALL REPLICAS STATUS` 的可解析字段 | +| `gather_all_slave_channels` | bool | false | 仅影响 `gather_replica_status`:为 `true` 时导出所有 channel;否则只取第一行 | + +### 禁用开关 + +| 配置项 | 类型 | 默认值 | 说明 | +| --- | --- | --- | --- | +| `disable_global_status` | bool | false | 禁用 `SHOW GLOBAL STATUS` 采集 | +| `disable_global_variables` | bool | false | 禁用 `SHOW GLOBAL VARIABLES` 采集 | +| `disable_innodb_status` | bool | false | 禁用 `SHOW ENGINE INNODB STATUS` 文本解析 | +| `disable_extra_innodb_status` | bool | false | 禁用基于缓存计算的 Buffer Pool 衍生指标 | +| `disable_binlogs` | bool | false | 禁用默认开启的旧版 Binlog 采集,关闭后不再输出 `mysql_binlog_*` | + +### TLS 配置 +| 配置项 | 类型 | 默认值 | 说明 | +| --- | --- | --- | --- | +| `use_tls` | bool | false | 是否启用 TLS 配置注册 | +| `tls_ca` | string | 空 | CA 文件路径 | +| `tls_cert` | string | 空 | 客户端证书路径 | +| `tls_key` | string | 空 | 客户端私钥路径 | +| `tls_key_pwd` | string | 空 | 私钥口令 | +| `insecure_skip_verify` | bool | false | 是否跳过服务端证书校验 | +| `tls_server_name` | string | 空 | 自定义 TLS `ServerName` | +| `tls_min_version` | string | 空 | 最低 TLS 版本,可选 `1.0` / `1.1` / `1.2` / `1.3` | +| `tls_max_version` | string | 空 | 最高 TLS 版本,可选 `1.0` / `1.1` / `1.2` / `1.3` | +| `tls_cipher_suites` | []string | 空 | 显式指定 Cipher Suites | + +### 自定义 SQL + +支持两种作用域: + +- 
顶层 `[[queries]]`:对当前插件内的所有实例生效 +- 实例级 `[[instances.queries]]`:只对当前实例生效 + +每个 query 支持如下字段: + +| 配置项 | 类型 | 默认值 | 说明 | +| --- | --- | --- | --- | +| `mesurement` | string | 空 | 自定义指标名前缀。注意当前实现要求使用这个历史拼写 | +| `metric_fields` | []string | 空 | 作为数值指标导出的列名列表 | +| `label_fields` | []string | 空 | 作为标签导出的列名列表 | +| `field_to_append` | string | 空 | 将某一列的值追加到指标名中,适合动态分组 | +| `timeout` | duration | 继承 `timeout_seconds`,再退化到 `3s` | 单条自定义 SQL 的超时 | +| `request` | string | 空 | 实际执行的 SQL | + +自定义 SQL 的使用规则: + +- `metric_fields`、`label_fields`、`field_to_append` 应与结果集里的小写列名或小写别名一致,因为实现会先把数据库返回列名统一转成小写后再匹配 +- `metric_fields` 对应列必须能转换为数值,否则该行会报错并跳过 +- 如果设置了 `field_to_append`,该列的值会被清洗后拼入指标名:空格转下划线,`%` 变为 `percent`,并统一转成小写 + +示例: + +```toml +[[instances.queries]] +mesurement = "users" +metric_fields = ["total"] +label_fields = ["service"] +timeout = "3s" +request = ''' +SELECT 'billing' AS service, COUNT(*) AS total FROM users; +''' +``` + +上述查询会生成 `mysql_users_total{service="billing", ...}`。 + +## 快速验证 + +MySQL 插件没有内置调试 HTTP API,最直接的验证方式是“启动插件后查基础指标,再对照日志看是否有模块级错误”。 + +### 1. 使用最小配置启动采集 + +至少保证以下配置存在: + +```toml [[instances]] -address = "/tmp/mysql.sock" -username = "root" -password = "1234" -labels = { instance="zbx-localhost:3306" } +address = "127.0.0.1:3306" +username = "categraf" +password = "" +labels = { instance = "prod-mysql-01:3306" } ``` -## 监控大盘和告警规则 +### 2. 在指标存储中检查基础指标 + +启动 Categraf 后,先查询以下指标: + +```promql +mysql_up{address="127.0.0.1:3306"} +mysql_scrape_use_seconds{address="127.0.0.1:3306"} +mysql_global_status_threads_connected{address="127.0.0.1:3306"} +mysql_version_info{address="127.0.0.1:3306"} +``` + +预期现象: + +- `mysql_up = 1` +- `mysql_scrape_use_seconds` 有值 +- `mysql_global_status_threads_connected` 有值 +- `mysql_version_info` 出现,并带有 `version`、`innodb_version`、`version_comment` 标签 + +### 3. 
如果启用了可选模块,再检查对应指标 + +- 启用 `gather_schema_size` 后,检查 `mysql_schema_size_bytes` +- 启用 `gather_table_size` 后,检查 `mysql_table_size_data_bytes` +- 启用 `gather_slave_status` 后,检查 `mysql_slave_status_seconds_behind_master` +- 保持默认 `disable_binlogs = false` 时,检查 `mysql_binlog_size_bytes` + +### 4. 对照 Categraf 日志 + +如果 `mysql_up = 1`,但某些指标不存在,再看日志中是否出现类似报错: + +- `failed to query global status` +- `failed to query engine innodb status` +- `failed to query slave status` +- `failed to get table size` + +这通常意味着: + +- 当前账号缺权限 +- 当前实例不是对应角色,例如并不是副本,却开启了复制状态采集 +- 当前数据库版本不支持某条 SQL + +## Metrics + +所有指标都以 `mysql_` 为前缀。 + +### 1. 基础存活与采集耗时 + +| 指标 | 类型 | 说明 | +| --- | --- | --- | +| `mysql_up` | Gauge | 数据库连接与 `Ping` 是否成功。`1` 表示成功,`0` 表示失败 | +| `mysql_scrape_use_seconds` | Gauge | 单次采集耗时 | + +### 2. `SHOW GLOBAL STATUS` 核心指标 + +默认会导出以下直接指标族: + +```text +mysql_global_status_uptime +mysql_global_status_prepared_stmt_count +mysql_global_status_slow_queries +mysql_global_status_questions +mysql_global_status_queries +mysql_global_status_connections +mysql_global_status_max_used_connections +mysql_global_status_aborted_clients +mysql_global_status_aborted_connects +mysql_global_status_open_files +mysql_global_status_open_tables +mysql_global_status_bytes_sent +mysql_global_status_bytes_received +mysql_global_status_qcache_hits +mysql_global_status_qcache_inserts +mysql_global_status_qcache_lowmem_prunes +mysql_global_status_table_locks_waited +mysql_global_status_table_locks_waited_rate +mysql_global_status_created_tmp_tables +mysql_global_status_created_tmp_disk_tables +mysql_global_status_created_tmp_files +mysql_global_status_threads_connected +mysql_global_status_threads_running +mysql_global_status_key_blocks_used +mysql_global_status_key_blocks_unused +mysql_global_status_key_blocks_not_flushed +mysql_global_status_key_read_requests +mysql_global_status_key_reads +mysql_global_status_key_write_requests +mysql_global_status_key_writes +mysql_global_status_innodb_log_waits 
+mysql_global_status_innodb_data_reads +mysql_global_status_innodb_data_writes +mysql_global_status_innodb_os_log_fsyncs +mysql_global_status_innodb_mutex_spin_waits +mysql_global_status_innodb_mutex_spin_rounds +mysql_global_status_innodb_mutex_os_waits +mysql_global_status_innodb_row_lock_waits +mysql_global_status_innodb_row_lock_time +mysql_global_status_innodb_row_lock_current_waits +mysql_global_status_innodb_current_row_locks +mysql_global_status_innodb_buffer_pool_read_requests +mysql_global_status_innodb_buffer_pool_reads +``` + +另外还会导出以下按标签分组的指标族: + +| 指标 | 标签 | 说明 | +| --- | --- | --- | +| `mysql_global_status_commands_total` | `command` | `com_*` 类命令计数,例如 `select`、`insert`、`update`、`delete` | +| `mysql_global_status_handlers_total` | `handler` | `handler_*` 计数 | +| `mysql_global_status_connection_errors_total` | `error` | `connection_errors_*` 计数 | +| `mysql_global_status_buffer_pool_pages_data` / `free` / `misc` / `old` / `total` / `dirty` | 无 | Buffer Pool 页数分布 | +| `mysql_global_status_buffer_pool_page_changes_total` | `operation` | Buffer Pool 页状态变化计数 | +| `mysql_global_status_innodb_row_ops_total` | `operation` | `innodb_rows_*` 行操作计数 | +| `mysql_global_status_performance_schema_lost_total` | `instrumentation` | `performance_schema_*` 丢失计数 | + +当 `extra_status_metrics = true` 时,还会额外输出以下后缀的 `mysql_global_status_` 指标: + +```text +binlog_cache_disk_use +binlog_cache_use +opened_tables +qcache_total_blocks +qcache_free_blocks +qcache_free_memory +qcache_not_cached +qcache_queries_in_cache +select_full_join +select_full_range_join +select_range +select_range_check +select_scan +sort_merge_passes +sort_range +sort_rows +sort_scan +table_locks_immediate +table_locks_immediate_rate +threads_cached +threads_created +table_open_cache_hits +table_open_cache_misses +``` + +### 3. 
`SHOW GLOBAL VARIABLES` 与信息类指标 + +默认会导出以下 `mysql_global_variables_` 指标: + +```text +key_buffer_size +key_cache_block_size +max_connections +max_prepared_stmt_count +query_cache_size +table_open_cache +thread_cache_size +long_query_time +max_user_connections +read_only +``` + +另外还会导出: + +| 指标 | 类型 | 说明 | +| --- | --- | --- | +| `mysql_version_info` | Info 型(值恒为 `1`) | 版本信息,标签包含 `version`、`innodb_version`、`version_comment` | +| `mysql_transaction_isolation` | Info 型(值恒为 `1`) | 当前事务隔离级别,标签 `level` | +| `mysql_galera_variables_info` | Info 型(值恒为 `1`) | Galera / PXC 集群名称,标签 `wsrep_cluster_name` | +| `mysql_galera_gcache_size_bytes` | Gauge | 从 `wsrep_provider_options` 解析出的 `gcache.size` | + +### 4. InnoDB 状态与衍生指标 + +`SHOW ENGINE INNODB STATUS` 文本解析会导出: + +| 指标 | 类型 | 说明 | +| --- | --- | --- | +| `mysql_engine_innodb_queries_inside_innodb` | Gauge | InnoDB 内部正在执行的查询数 | +| `mysql_engine_innodb_queries_in_queue` | Gauge | InnoDB 队列中的查询数 | +| `mysql_engine_innodb_read_views_open_inside_innodb` | Gauge | 当前打开的 read view 数 | + +基于缓存计算的 Buffer Pool 衍生指标(默认开启,除非 `disable_extra_innodb_status = true`): + +| 指标 | 类型 | 说明 | +| --- | --- | --- | +| `mysql_global_status_buffer_pool_bytes_used` | Gauge | Buffer Pool 已使用字节数 | +| `mysql_global_status_buffer_pool_bytes_data` | Gauge | Buffer Pool 数据页字节数 | +| `mysql_global_status_buffer_pool_bytes_free` | Gauge | Buffer Pool 空闲字节数 | +| `mysql_global_status_buffer_pool_bytes_total` | Gauge | Buffer Pool 总字节数 | +| `mysql_global_status_buffer_pool_bytes_dirty` | Gauge | Buffer Pool 脏页字节数 | +| `mysql_global_status_buffer_pool_pages_utilization` | Gauge | Buffer Pool 页利用率,百分比 | + +当 `extra_innodb_metrics = true` 时,还会额外输出: + +- `mysql_global_status_buffer_pool_pages_used` +- 以及以下更多 `mysql_global_status_` 指标: + +```text +innodb_active_transactions +innodb_buffer_pool_bytes_data +innodb_buffer_pool_pages_data +innodb_buffer_pool_pages_dirty +innodb_buffer_pool_pages_flushed +innodb_buffer_pool_pages_free +innodb_buffer_pool_pages_total 
+innodb_buffer_pool_read_ahead +innodb_buffer_pool_read_ahead_evicted +innodb_buffer_pool_read_ahead_rnd +innodb_buffer_pool_wait_free +innodb_buffer_pool_write_requests +innodb_checkpoint_age +innodb_current_transactions +innodb_data_fsyncs +innodb_data_pending_fsyncs +innodb_data_pending_reads +innodb_data_pending_writes +innodb_data_read +innodb_data_written +innodb_dblwr_pages_written +innodb_dblwr_writes +innodb_hash_index_cells_total +innodb_hash_index_cells_used +innodb_history_list_length +innodb_ibuf_free_list +innodb_ibuf_merged +innodb_ibuf_merged_delete_marks +innodb_ibuf_merged_deletes +innodb_ibuf_merged_inserts +innodb_ibuf_merges +innodb_ibuf_segment_size +innodb_ibuf_size +innodb_lock_structs +innodb_locked_tables +innodb_locked_transactions +innodb_log_write_requests +innodb_log_writes +innodb_lsn_current +innodb_lsn_flushed +innodb_lsn_last_checkpoint +innodb_mem_adaptive_hash +innodb_mem_additional_pool +innodb_mem_dictionary +innodb_mem_file_system +innodb_mem_lock_system +innodb_mem_page_hash +innodb_mem_recovery_system +innodb_mem_thread_hash +innodb_mem_total +innodb_os_file_fsyncs +innodb_os_file_reads +innodb_os_file_writes +innodb_os_log_pending_fsyncs +innodb_os_log_pending_writes +innodb_os_log_written +innodb_pages_created +innodb_pages_read +innodb_pages_written +innodb_pending_aio_log_ios +innodb_pending_aio_sync_ios +innodb_pending_buffer_pool_flushes +innodb_pending_checkpoint_writes +innodb_pending_ibuf_aio_reads +innodb_pending_log_flushes +innodb_pending_log_writes +innodb_pending_normal_aio_reads +innodb_pending_normal_aio_writes +innodb_queries_inside +innodb_queries_queued +innodb_read_views +innodb_rows_deleted +innodb_rows_inserted +innodb_rows_read +innodb_rows_updated +innodb_s_lock_os_waits +innodb_s_lock_spin_rounds +innodb_s_lock_spin_waits +innodb_semaphore_wait_time +innodb_semaphore_waits +innodb_tables_in_use +innodb_x_lock_os_waits +innodb_x_lock_spin_rounds +innodb_x_lock_spin_waits +``` + +### 5. 
Processlist 指标 + +| 指标 | 类型 | 标签 | 说明 | +| --- | --- | --- | --- | +| `mysql_processlist_processes_by_state` | Gauge | `state` | 连接按状态归类后的数量 | +| `mysql_processlist_processes_by_user` | Gauge | `user` | 连接按用户归类后的数量 | + +### 6. 库与表空间指标 + +| 指标 | 类型 | 标签 | 说明 | +| --- | --- | --- | --- | +| `mysql_schema_size_bytes` | Gauge | `schema` | 库级总空间大小 | +| `mysql_table_size_index_bytes` | Gauge | `schema`, `table` | 表索引空间 | +| `mysql_table_size_data_bytes` | Gauge | `schema`, `table` | 表数据空间 | +| `mysql_table_size_free_data_bytes` | Gauge | `schema`, `table` | 表空闲空间 | + +### 7. 复制状态指标 + +`gather_slave_status = true` 时,会从 `SHOW SLAVE STATUS` / `SHOW ALL SLAVES STATUS` 路径导出一组经过筛选的指标,格式为 `mysql_slave_status_`。常见指标包括: + +```text +mysql_slave_status_seconds_behind_source +mysql_slave_status_seconds_behind_master +mysql_slave_status_slave_io_running +mysql_slave_status_slave_sql_running +mysql_slave_status_master_server_id +mysql_slave_status_source_server_id +mysql_slave_status_sql_delay +mysql_slave_status_exec_master_log_pos +mysql_slave_status_read_master_log_pos +``` + +这些指标会附带以下标签: + +- `master_host` +- `master_uuid` +- `channel_name` + +`gather_replica_status = true` 时,会从 `SHOW REPLICA STATUS` / `SHOW ALL REPLICAS STATUS` 路径导出“可解析为数值或布尔值”的列,指标名格式为: + +```text +mysql_slave_ +``` + +例如在较新的 MySQL 版本上,常见会看到: + +```text +mysql_slave_seconds_behind_source +mysql_slave_source_server_id +mysql_slave_sql_delay +``` + +注意: + +- 这一组指标名虽然来自 `SHOW REPLICA STATUS`,但前缀仍是 `mysql_slave_` +- 当前实现更适合导出数值型列;字符串列不会形成最终时序,某些 `YES` / `NO` 状态列也可能因为列类型是字符串而被丢弃 +- `gather_all_slave_channels = true` 时,会额外打上 `channel` 标签并导出所有 channel;否则只导出第一行 + +### 8. 
Binlog 指标 + +默认情况下(`disable_binlogs = false`),会导出一组旧版 Binlog 指标: + +| 指标 | 类型 | 说明 | +| --- | --- | --- | +| `mysql_binlog_size_bytes` | Gauge | 所有 Binlog 文件大小总和 | +| `mysql_binlog_file_count` | Gauge | Binlog 文件数量 | +| `mysql_binlog_file_number` | Gauge | 最后一个 Binlog 文件名中的序号 | + +当 `gather_binary_logs = true` 时,还会额外导出一组新版 Binlog 指标: + +| 指标 | 类型 | 说明 | +| --- | --- | --- | +| `mysql_binary_size_bytes` | Gauge | 所有 Binlog 文件大小总和 | +| `mysql_binary_files_count` | Gauge | Binlog 文件数量 | + +两套指标都依赖 `SHOW BINARY LOGS`。如果实例未开启 Binlog 或当前账号没有 `REPLICATION CLIENT` 权限,这些指标会缺失。 + +### 9. 自定义 SQL 指标 + +自定义 SQL 会根据配置动态生成指标名: + +- 不带 `field_to_append`:`mysql__` +- 带 `field_to_append`:`mysql___` + +示例: + +- `mesurement = "users"` +- `metric_fields = ["total"]` + +则输出 `mysql_users_total` + +如果再设置 `field_to_append = "state"`,并且某行 `state = 'Lock Wait'`,则会生成类似: + +`mysql_users_lock_wait_total` + +### 10. Galera / PXC 相关指标 + +如果实例暴露了 `wsrep_*` 相关状态 / 变量,还会额外输出: + +| 指标 | 类型 | 说明 | +| --- | --- | --- | +| `mysql_galera_status_info` | Info 型(值恒为 `1`) | `wsrep_local_state_uuid`、`wsrep_cluster_state_uuid`、`wsrep_provider_version` | +| `mysql_galera_variables_info` | Info 型(值恒为 `1`) | `wsrep_cluster_name` | +| `mysql_galera_gcache_size_bytes` | Gauge | Galera gcache 大小 | +| `mysql_galera_evs_repl_latency_min_seconds` | Gauge | 组通信延迟最小值 | +| `mysql_galera_evs_repl_latency_avg_seconds` | Gauge | 组通信延迟平均值 | +| `mysql_galera_evs_repl_latency_max_seconds` | Gauge | 组通信延迟最大值 | +| `mysql_galera_evs_repl_latency_stdev` | Gauge | 组通信延迟标准差 | +| `mysql_galera_evs_repl_latency_sample_size` | Gauge | 样本数 | + +## FAQ + +### 1. 为什么 `mysql_up = 1`,但有些指标还是没有? + +`mysql_up` 只代表“连接与 `Ping` 成功”。库表大小、复制状态、Binlog、Processlist 等都依赖额外权限和 SQL 能力。先看 Categraf 日志里的 `failed to query ...` 报错,再确认权限、角色和数据库版本。 + +### 2. 为什么把 `address` 写成 `localhost`,却没有走 Unix socket? + +当前实现只有在 `address` 以 `.sock` 结尾时才走 Unix socket。`localhost:3306` 仍然按 TCP 处理。若要使用 socket,请直接写 socket 文件路径。 + +### 3. 为什么 TLS 配好了证书,还是连不上? 
+ +除了 `use_tls = true` 外,还需要在 `parameters` 中显式加上 `tls=custom`。例如: + +```toml +use_tls = true +parameters = "tls=custom" +``` + +### 4. `gather_slave_status` 和 `gather_replica_status` 应该开哪个? + +- 如果你更关心一组稳定、精简的复制指标,优先用 `gather_slave_status` +- 如果你希望尽量暴露 `SHOW REPLICA STATUS` 返回的数值列,可启用 `gather_replica_status` +- 两者可以同时开启,但会得到两套不同命名风格的复制指标 + +### 5. 为什么开了 `gather_binary_logs`,却还会看到 `mysql_binlog_*`? + +因为这是两套独立的 Binlog 采集逻辑: + +- `disable_binlogs = false` 时,默认输出 `mysql_binlog_*` +- `gather_binary_logs = true` 时,额外输出 `mysql_binary_*` + +开启后者不会自动关闭前者。 + +### 6. 自定义 SQL 为什么没有数据? + +常见原因有三类: + +- `metric_fields` / `label_fields` / `field_to_append` 没与 SQL 结果里的小写列名或小写别名保持一致 +- `metric_fields` 对应列不是数值 +- 自定义 SQL 超时;当前默认会继承实例的 `timeout_seconds`,默认值是 3 秒 + +## 其他说明 + +- 同目录下的 `alerts.json`、`dashboard-by-instance.json`、`dashboard-by-ident.json` 可作为告警与看板参考 +- 如果你需要极简权限模式,可以只启用基础探测与自定义 SQL,但要接受大量内置指标不可用的事实 + +## 许可证 -本 README 的同级目录,大家可以看到alerts.json 是告警规则,导入夜莺就可以使用, dashboard-by-instance.json 就是监控大盘(注意!监控大盘使用instance大盘变量,所以,上面的配置文件中要配置一个instance的标签,就是 `labels = { instance="n9e-10.2.3.4:3306" }` 部分),也是导入夜莺就可以使用。dashboard-by-ident是使用ident作为大盘变量,适用于先找到宿主机器,再找机器上面的mysql实例的场景 \ No newline at end of file +Apache License 2.0 diff --git a/inputs/nats/README.md b/inputs/nats/README.md new file mode 100644 index 000000000..92e01a4f7 --- /dev/null +++ b/inputs/nats/README.md @@ -0,0 +1,37 @@ +# NATS Input Plugin + +This plugin collects operational metrics from NATS message servers. It gathers real-time statistics by accessing the monitoring HTTP API (`/varz` endpoint) provided by the NATS Server. + +## Configuration + +For this plugin to work, your NATS server must have its HTTP monitoring port enabled (by setting `http_port` or `https_port` in the NATS configuration file). 
+ +```toml +# Collect NATS monitoring metrics +# interval = 60 + +[[instances]] +# NATS monitoring endpoint (must include schema and port) +server = "http://localhost:8222" +``` + +## Metrics + +All collected metrics will be tagged with the `server` label corresponding to the scraped endpoint. +Key metrics include: + +- `nats_in_msgs` / `nats_out_msgs`: Total number of messages received/sent +- `nats_in_bytes` / `nats_out_bytes`: Total number of bytes received/sent +- `nats_uptime`: NATS server uptime +- `nats_cores`: Number of CPU cores allocated to NATS +- `nats_mem`: Memory footprint of NATS +- `nats_connections`: Number of currently connected clients +- `nats_total_connections`: Total number of connections accepted historically +- `nats_subscriptions`: Number of active subscriptions +- `nats_slow_consumers`: Number of slow consumers +- `nats_routes`: Number of cluster routes +- `nats_remotes`: Number of remote connections + +## Dashboards + +A matching Dashboard (`dashboard.json`) is provided in this directory. You can import this dashboard into Grafana or Nightingale to monitor the operational status of your NATS servers (including connection counts, throughput, subscription counts, and other core metrics). 
diff --git a/inputs/nats/README_CN.md b/inputs/nats/README_CN.md new file mode 100644 index 000000000..ce8a4fe22 --- /dev/null +++ b/inputs/nats/README_CN.md @@ -0,0 +1,37 @@ +# NATS 采集插件 + +该插件用于采集 NATS 消息服务器的运行指标。它通过访问 NATS Server 提供的监控 HTTP API(`/varz` 接口)来获取实时的统计数据。 + +## 配置说明 + +要使此插件正常工作,您的 NATS 服务器必须开启 HTTP 监控端口(在 NATS 配置文件中设置 `http_port` 或 `https_port`)。 + +```toml +# 采集 NATS 监控指标 +# interval = 60 + +[[instances]] +# NATS 监控接口地址 (需包含 schema 和端口) +server = "http://localhost:8222" +``` + +## 采集指标 + +所有收集到的指标都会打上 `server` 标签,对应所抓取的接口地址。 +主要包含以下指标: + +- `nats_in_msgs` / `nats_out_msgs`: 收发消息总数 +- `nats_in_bytes` / `nats_out_bytes`: 收发字节总数 +- `nats_uptime`: NATS 服务运行时间 +- `nats_cores`: 分配给 NATS 的 CPU 核心数 +- `nats_mem`: NATS 占用的内存大小 +- `nats_connections`: 当前连接的客户端数量 +- `nats_total_connections`: 历史建立过的连接总数 +- `nats_subscriptions`: 当前活跃的订阅数量 +- `nats_slow_consumers`: 消费较慢的消费者数量 +- `nats_routes`: 集群路由数 +- `nats_remotes`: 远程连接数 + +## 监控大盘 + +本目录下提供了一个配套的 Dashboard (`dashboard.json`),您可以在 Grafana 或夜莺中导入该看板来观测您的 NATS 服务器运行状态(包括连接数、吞吐率、订阅数量等核心指标)。 diff --git a/inputs/nats/dashboard.json b/inputs/nats/dashboard.json new file mode 100644 index 000000000..69ca721e4 --- /dev/null +++ b/inputs/nats/dashboard.json @@ -0,0 +1,116 @@ +{ + "title": "NATS Message Server", + "uid": "fba3da61", + "tags": [ + "nats message server" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "NATS Connections", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "nats_connections", + "legendFormat": "{{server}}", + "refId": "A" + } + ] + }, + { + "title": "NATS Subscriptions", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "nats_subscriptions", + "legendFormat": "{{server}}", + "refId": "A" + } + ] + }, + { + "title": "NATS Message Throughput (Msgs/s)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + 
"w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "rate(nats_in_msgs[5m])", + "legendFormat": "In", + "refId": "A" + }, + { + "expr": "rate(nats_out_msgs[5m])", + "legendFormat": "Out", + "refId": "B" + } + ] + }, + { + "title": "NATS Byte Throughput (Bytes/s)", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "rate(nats_in_bytes[5m])", + "legendFormat": "In", + "refId": "A" + }, + { + "expr": "rate(nats_out_bytes[5m])", + "legendFormat": "Out", + "refId": "B" + } + ] + }, + { + "title": "NATS Slow Consumers", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 16, + "w": 24, + "h": 8 + }, + "id": 5, + "targets": [ + { + "expr": "nats_slow_consumers", + "legendFormat": "{{server}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/net/README.md b/inputs/net/README.md index 867bce699..588f69703 100644 --- a/inputs/net/README.md +++ b/inputs/net/README.md @@ -1,11 +1,32 @@ -# net +# Net (Network Interfaces) Input Plugin -网络流量监控插件,比如各个网卡的流量、包量、错包情况等 +This plugin monitors network traffic. It primarily collects metrics for each network interface, including traffic (bytes in/out), packet counts, dropped packets, and transmission errors. + +**Supported Platforms:** Windows, Linux, macOS, BSD, etc. ## Configuration -通常可以维持默认配置,不过有的时候,我们有些网卡不想采集,只想采集指定的网卡,可以通过 interfaces 这个配置来指定。 +In most cases, you can leave the default configuration as is; the plugin will automatically discover and collect metrics for all active network interfaces. If you want to limit data collection to specific interfaces (e.g., for performance or noise reduction), you can use the `interfaces` option (which supports regex). 
+ +```toml +# Collect network interface metrics +[[instances]] +# interfaces = ["eth0", "enp*"] +# ignore_interfaces = ["lo", "docker*", "veth*"] +``` + +## Metrics + +All collected metrics are prefixed with `net_`. Key metrics include: + +- `net_bytes_recv` / `net_bytes_sent`: Bytes received and sent (used to calculate bandwidth/throughput) +- `net_packets_recv` / `net_packets_sent`: Packets received and sent (used to calculate PPS) +- `net_errin` / `net_errout`: Error packets during receive and transmit +- `net_dropin` / `net_dropout`: Dropped packets during receive and transmit + +All metrics are tagged with the `interface` label corresponding to the specific NIC name. -## 监控大盘 +## Dashboards -该插件没有单独的监控大盘,OS 的监控大盘统一放到 system 下面了 \ No newline at end of file +These metrics are essential for basic server monitoring. Typically, network monitoring is grouped under a global **System** dashboard alongside CPU and memory metrics. +For standalone viewing, a basic Dashboard focusing solely on network dimensions is also provided in this directory. 
diff --git a/inputs/net/README_CN.md b/inputs/net/README_CN.md new file mode 100644 index 000000000..59d2d3d26 --- /dev/null +++ b/inputs/net/README_CN.md @@ -0,0 +1,32 @@ +# Net (网络接口) 采集插件 + +网络流量监控插件,主要用于收集操作系统各个网卡(网络接口)的流量(进出字节)、包量、丢包和错包情况等。 + +**支持平台:** Windows, Linux, macOS, BSD 等 + +## 配置说明 + +通常情况下,您可以维持默认配置,该插件会自动发现并采集所有网卡的指标数据。如果您出于性能或者安全考虑,只想采集特定的网卡,可以通过 `interfaces` 配置项(支持正则表达式)进行过滤。 + +```toml +# 采集网络接口指标 +[[instances]] +# interfaces = ["eth0", "enp*"] +# ignore_interfaces = ["lo", "docker*", "veth*"] +``` + +## 采集指标 + +所有收集到的指标名称前缀为 `net_`。主要指标如下: + +- `net_bytes_recv` / `net_bytes_sent`: 接收和发送的字节数(用来计算带宽使用率/吞吐量) +- `net_packets_recv` / `net_packets_sent`: 接收和发送的数据包数量(用来计算 PPS) +- `net_errin` / `net_errout`: 接收和发送时的错误包数 +- `net_dropin` / `net_dropout`: 接收和发送时因缓冲区满等原因丢弃的包数 + +这些指标均会打上 `interface` 标签,对应具体的网卡名称。 + +## 监控大盘 + +这些指标是主机最核心的基础监控数据之一。通常,OS 的网络流量监控会与其他硬件指标统一放置在 **System (主机系统)** 大盘中。 +为方便查看单独维度的网络状态,本目录下也提供了一个仅包含网卡维度的基础 Dashboard。 \ No newline at end of file diff --git a/inputs/net/dashboard.json b/inputs/net/dashboard.json new file mode 100644 index 000000000..ead2c97ea --- /dev/null +++ b/inputs/net/dashboard.json @@ -0,0 +1,108 @@ +{ + "title": "System Network Metrics", + "uid": "13940004", + "tags": [ + "system network metrics" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Network Bandwidth (Bytes/s)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "rate(net_bytes_recv[5m])", + "legendFormat": "In: {{interface}}", + "refId": "A" + }, + { + "expr": "rate(net_bytes_sent[5m])", + "legendFormat": "Out: {{interface}}", + "refId": "B" + } + ] + }, + { + "title": "Network Packets (PPS)", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "rate(net_packets_recv[5m])", + "legendFormat": "In: {{interface}}", + "refId": "A" + }, + { + "expr": 
"rate(net_packets_sent[5m])", + "legendFormat": "Out: {{interface}}", + "refId": "B" + } + ] + }, + { + "title": "Network Dropped Packets/s", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "rate(net_dropin[5m])", + "legendFormat": "In Drop: {{interface}}", + "refId": "A" + }, + { + "expr": "rate(net_dropout[5m])", + "legendFormat": "Out Drop: {{interface}}", + "refId": "B" + } + ] + }, + { + "title": "Network Errors/s", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "rate(net_errin[5m])", + "legendFormat": "In Err: {{interface}}", + "refId": "A" + }, + { + "expr": "rate(net_errout[5m])", + "legendFormat": "Out Err: {{interface}}", + "refId": "B" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/netstat/README.md b/inputs/netstat/README.md index ca1e475f2..33693a0bb 100644 --- a/inputs/netstat/README.md +++ b/inputs/netstat/README.md @@ -1,7 +1,36 @@ -# netstat +# Netstat Input Plugin -该插件采集网络连接情况,比如有多少 time_wait 连接,多少 established 连接 +This plugin monitors network connection states. It primarily collects statistics on the distribution of various TCP/UDP connection states within the operating system, such as the number of connections in `TIME_WAIT`, `ESTABLISHED`, or `CLOSE_WAIT` states. -# 监控大盘 +**Supported Platforms:** Windows, Linux, macOS, BSD, etc. -该插件没有单独的监控大盘,OS 的监控大盘统一放到 system 下面了 \ No newline at end of file +## Configuration + +```toml +# Collect network TCP connection state statistics +[[instances]] +# Usually requires no specific configuration. Just leave it enabled. +``` + +## Metrics + +All collected metrics are prefixed with `netstat_`. 
Key metrics include: + +- `netstat_tcp_established`: Number of TCP connections in the ESTABLISHED state +- `netstat_tcp_syn_sent`: Number of connections in the SYN_SENT state +- `netstat_tcp_syn_recv`: Number of connections in the SYN_RECV state +- `netstat_tcp_fin_wait1`: Number of connections in the FIN_WAIT1 state +- `netstat_tcp_fin_wait2`: Number of connections in the FIN_WAIT2 state +- `netstat_tcp_time_wait`: Number of connections in the TIME_WAIT state (high values may indicate port exhaustion) +- `netstat_tcp_close`: Number of connections in the CLOSE state +- `netstat_tcp_close_wait`: Number of connections in the CLOSE_WAIT state (high values may indicate an unresponsive application failing to release connections) +- `netstat_tcp_last_ack`: Number of connections in the LAST_ACK state +- `netstat_tcp_listen`: Number of sockets in the LISTEN state +- `netstat_tcp_closing`: Number of connections in the CLOSING state +- `netstat_tcp_none`: Number of TCP connections with an unknown state +- `netstat_udp_socket`: Number of active UDP sockets + +## Dashboards + +These metrics are essential for basic server monitoring. Typically, OS network connection monitoring is unified under a global **System** dashboard alongside CPU and disk metrics. +For standalone viewing, a basic Dashboard focusing solely on TCP/UDP connection states is also provided in this directory. 
diff --git a/inputs/netstat/README_CN.md b/inputs/netstat/README_CN.md new file mode 100644 index 000000000..edcd2e2f5 --- /dev/null +++ b/inputs/netstat/README_CN.md @@ -0,0 +1,36 @@ +# Netstat 采集插件 + +网络连接状态监控插件。该插件主要用于采集操作系统中各类 TCP/UDP 连接的状态分布情况,例如有多少个处于 `TIME_WAIT`、`ESTABLISHED`、`CLOSE_WAIT` 状态的连接。 + +**支持平台:** Windows, Linux, macOS, BSD 等 + +## 配置说明 + +```toml +# 采集网络 TCP 连接状态统计 +[[instances]] +# 通常无需任何特殊配置,保持默认启用即可。 +``` + +## 采集指标 + +所有收集到的指标名称前缀为 `netstat_`。主要指标如下: + +- `netstat_tcp_established`: 已建立连接的 TCP 数量 +- `netstat_tcp_syn_sent`: 处于 SYN_SENT 状态的连接数 +- `netstat_tcp_syn_recv`: 处于 SYN_RECV 状态的连接数 +- `netstat_tcp_fin_wait1`: 处于 FIN_WAIT1 状态的连接数 +- `netstat_tcp_fin_wait2`: 处于 FIN_WAIT2 状态的连接数 +- `netstat_tcp_time_wait`: 处于 TIME_WAIT 状态的连接数(如果过高可能预示端口耗尽) +- `netstat_tcp_close`: 处于 CLOSE 状态的连接数 +- `netstat_tcp_close_wait`: 处于 CLOSE_WAIT 状态的连接数(如果过高可能预示应用程序卡死或未正确释放连接) +- `netstat_tcp_last_ack`: 处于 LAST_ACK 状态的连接数 +- `netstat_tcp_listen`: 处于 LISTEN 状态的连接数 +- `netstat_tcp_closing`: 处于 CLOSING 状态的连接数 +- `netstat_tcp_none`: 无法获取状态的 TCP 连接数 +- `netstat_udp_socket`: 活跃的 UDP Socket 数量 + +## 监控大盘 + +这些指标是主机最核心的基础监控数据之一。通常,OS 的网络连接监控大盘会与 CPU、磁盘等指标统一放置在 **System (主机系统)** 大盘下面。 +为方便单独查看,本目录也提供了一个仅包含 TCP/UDP 连接状态维度的基础 Dashboard。 \ No newline at end of file diff --git a/inputs/netstat/dashboard.json b/inputs/netstat/dashboard.json new file mode 100644 index 000000000..0d66596c5 --- /dev/null +++ b/inputs/netstat/dashboard.json @@ -0,0 +1,88 @@ +{ + "title": "System Netstat Metrics", + "uid": "b1e40208", + "tags": [ + "system netstat metrics" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "TCP Established Connections", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "netstat_tcp_established", + "legendFormat": "{{agent_hostname}}", + "refId": "A" + } + ] + }, + { + "title": "TCP Time-Wait Connections", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + 
"w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "netstat_tcp_time_wait", + "legendFormat": "{{agent_hostname}}", + "refId": "A" + } + ] + }, + { + "title": "TCP Close-Wait Connections", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "netstat_tcp_close_wait", + "legendFormat": "{{agent_hostname}}", + "refId": "A" + } + ] + }, + { + "title": "UDP Sockets", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "netstat_udp_socket", + "legendFormat": "{{agent_hostname}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/netstat_filter/README.md b/inputs/netstat_filter/README.md index e852d0adb..bd69a87e4 100644 --- a/inputs/netstat_filter/README.md +++ b/inputs/netstat_filter/README.md @@ -1,31 +1,50 @@ -# netstat_filter - -该插件采集网络连接情况,并根据用户条件进行过滤统计,以达到监控用户关心链接情况 -## 指标列表 -tcp_established -tcp_syn_sent -tcp_syn_recv -tcp_fin_wait1 -tcp_fin_wait2 -tcp_time_wait -tcp_close -tcp_close_wait -tcp_last_ack -tcp_listen -tcp_closing -tcp_none -tcp_send_queue -tcp_recv_queue - -## 功能说明 -对源IP、源端口、目标IP和目标端口过滤后进行网卡recv-Q、send-Q进行采集,该指标可以很好反应出指定连接的质量,例如rtt时间过长,导致收到服务端ack确认很慢就会使send-Q长期大于0,可以及时通过监控发现,从而提前优化网络或程序 - -当过滤结果为多个连接时会将send和recv值进行加和 -例如: -配置文件``raddr_port = 11883`` -当本地和不同IP的11883都有连接建立的情况下,会将多条连接的结果进行加和。或在并发多连接的情况下,会合并加合,总之过滤的越粗略被加合数就会越多。 - -多条规则请复制``[[instances]]``进行配置 - -## 注意事项 -netstat_filter_tcp_send_queue和netstat_filter_tcp_recv_queue指标目前只支持linux。windows用户默认为0。 +# Netstat Filter Input Plugin + +This plugin monitors network connections and aggregates statistics based on user-defined filtering criteria (such as source/destination IPs or ports). It is highly useful for precisely monitoring specific critical network connections (like database connection pools). 
+ +In addition to standard connection states, this plugin can collect the `recv-Q` (receive queue) and `send-Q` (send queue) of network sockets. This is valuable for reflecting the quality of network connections (e.g., a high RTT or slow client processing will cause `send-Q` to consistently stay above 0). + +**Supported Platforms:** Windows, Linux (Note: `recv-Q` and `send-Q` metrics are currently fully supported only on Linux; on Windows, they default to 0). + +## Configuration + +You can filter by source IP (`laddr_ip`), source port (`laddr_port`), destination IP (`raddr_ip`), and destination port (`raddr_port`). If left empty or 0, they match anything. + +```toml +# Collect TCP connection statistics based on specific filters +# interval = 15 + +# You can configure multiple instances if you have multiple independent rules +[[instances]] +# Use labels to distinguish which rule this data belongs to +# labels = { "filter"="mysql_backend" } + +# Example rule: Only collect connections related to port 3306 (local or remote) +# laddr_ip = "" +# laddr_port = 0 +# raddr_ip = "" +# raddr_port = 3306 + +[[instances]] +# labels = { "filter"="redis_backend" } +# raddr_port = 6379 +``` + +When a filter matches multiple connections, the plugin will **sum up** the values for these connections (e.g., the number of connections in a given state, or the total `send_queue` and `recv_queue` bytes). + +## Metrics + +All metrics are prefixed with `netstat_filter_`. 
The metrics list includes: + +- `netstat_filter_tcp_established`: Number of ESTABLISHED connections matching the filter +- `netstat_filter_tcp_syn_sent`: Number of SYN_SENT connections matching the filter +- `netstat_filter_tcp_syn_recv`: Number of SYN_RECV connections matching the filter +- `netstat_filter_tcp_time_wait`: Number of TIME_WAIT connections matching the filter +- `netstat_filter_tcp_close_wait`: Number of CLOSE_WAIT connections matching the filter +- (Other standard TCP states like `fin_wait1`, `fin_wait2`, `last_ack`, `listen`, `closing`, `none` are also supported) +- `netstat_filter_tcp_send_queue`: Total bytes queued in the send queues of matching connections (Linux only) +- `netstat_filter_tcp_recv_queue`: Total bytes queued in the receive queues of matching connections (Linux only) + +## Dashboards + +A basic Dashboard (`dashboard.json`) is provided in this directory. It supports displaying connection pool health and queue backlogs across different filtering rules (via the `filter` label), which is extremely helpful for application-layer network tuning. 
diff --git a/inputs/netstat_filter/README_CN.md b/inputs/netstat_filter/README_CN.md new file mode 100644 index 000000000..b59e171e3 --- /dev/null +++ b/inputs/netstat_filter/README_CN.md @@ -0,0 +1,50 @@ +# Netstat Filter 采集插件 + +该插件采集网络连接情况,并允许根据用户配置的条件(例如特定源/目标 IP、端口)进行过滤统计。这使得用户可以精确监控所关心的关键网络连接(如数据库连接池状态)。 + +除了常规的网络状态,此插件还能采集网络 Socket 的 `recv-Q`(接收队列)和 `send-Q`(发送队列)。这对于反映网络连接的质量非常有用(例如 RTT 时间过长、客户端处理慢,会导致 `send-Q` 持续大于 0)。 + +**支持平台:** Windows, Linux (其中 `recv-Q` 和 `send-Q` 当前仅完整支持 Linux,Windows 下默认为 0) + +## 配置说明 + +支持对源 IP (`laddr_ip`)、源端口 (`laddr_port`)、目标 IP (`raddr_ip`) 和目标端口 (`raddr_port`) 进行过滤。如果不填写,则匹配所有。 + +```toml +# 采集指定过滤条件的 TCP 连接状态统计 +# interval = 15 + +# 如果您有多种规则需要独立统计,可以配置多个 instance +[[instances]] +# filter 标签用于区分这是哪一个规则采集的数据 +# labels = { "filter"="mysql_backend" } + +# 过滤规则示例:只采集连接到本地或远端 3306 端口的连接 +# laddr_ip = "" +# laddr_port = 0 +# raddr_ip = "" +# raddr_port = 3306 + +[[instances]] +# labels = { "filter"="redis_backend" } +# raddr_port = 6379 +``` + +当过滤结果匹配多条连接时,插件会将这些连接的各项统计(如处于某种状态的连接数,以及 `send_queue` 和 `recv_queue`)进行**加和**。 + +## 采集指标 + +所有指标前缀为 `netstat_filter_`。指标列表如下: + +- `netstat_filter_tcp_established`: 满足过滤条件的 ESTABLISHED 连接数 +- `netstat_filter_tcp_syn_sent`: 满足条件的 SYN_SENT 连接数 +- `netstat_filter_tcp_syn_recv`: 满足条件的 SYN_RECV 连接数 +- `netstat_filter_tcp_time_wait`: 满足条件的 TIME_WAIT 连接数 +- `netstat_filter_tcp_close_wait`: 满足条件的 CLOSE_WAIT 连接数 +- (其他 TCP 状态如 `fin_wait1`, `fin_wait2`, `last_ack`, `listen`, `closing`, `none` 等类似) +- `netstat_filter_tcp_send_queue`: 匹配连接的发送队列排队字节数总和 (Linux only) +- `netstat_filter_tcp_recv_queue`: 匹配连接的接收队列排队字节数总和 (Linux only) + +## 监控大盘 + +本目录提供了一个基础的 Dashboard (`dashboard.json`),支持通过不同的过滤规则(`filter` label)展示目标服务连接池及队列的积压情况,对应用层网络调优很有帮助。 diff --git a/inputs/netstat_filter/dashboard.json b/inputs/netstat_filter/dashboard.json new file mode 100644 index 000000000..28e1c0a4b --- /dev/null +++ b/inputs/netstat_filter/dashboard.json @@ -0,0 +1,88 @@ +{ + "title": "Netstat Filter Quality", + "uid": 
"5fa8290a", + "tags": [ + "netstat filter quality" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Filtered TCP Established", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "netstat_filter_tcp_established", + "legendFormat": "{{filter}}", + "refId": "A" + } + ] + }, + { + "title": "Filtered TCP Send Queue Bytes", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "netstat_filter_tcp_send_queue", + "legendFormat": "{{filter}}", + "refId": "A" + } + ] + }, + { + "title": "Filtered TCP Recv Queue Bytes", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "netstat_filter_tcp_recv_queue", + "legendFormat": "{{filter}}", + "refId": "A" + } + ] + }, + { + "title": "Filtered TCP Time Wait", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "netstat_filter_tcp_time_wait", + "legendFormat": "{{filter}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/nfsclient/README.md b/inputs/nfsclient/README.md new file mode 100644 index 000000000..4671bf11e --- /dev/null +++ b/inputs/nfsclient/README.md @@ -0,0 +1,48 @@ +# NFS Client Input Plugin + +This plugin collects performance and operational statistics for Network File Systems (NFS) mounted on the host as a client. +It gathers metrics such as read/write bytes, request counts, and latency for various NFS operations (e.g., `GETATTR`, `READ`, `WRITE`) by parsing the `/proc/self/mountstats` file. 
+ +**Supported Platforms:** Linux + +## Configuration + +```toml +# Collect NFS client metrics +# interval = 60 + +[[instances]] +# Whether to collect full statistics for all NFS operations (defaults to collecting only key operations) +fullstat = false + +# Include/exclude specific mount points +# include_mounts = ["/mnt/nfs_share1"] +# exclude_mounts = ["/mnt/backup"] + +# Include/exclude specific NFS operation types (uppercase, e.g., "READ", "WRITE") +# include_operations = [] +# exclude_operations = [] +``` + +## Metrics + +The plugin supports NFSv3 and NFSv4. All metrics are tagged with `mountpoint`, `server` (NFS server address), and `export` (exported path). + +Key metric categories include: +- **Bytes Statistics (`nfsclient_bytes_*`)**: `read`, `write`, `direct_read`, `direct_write` +- **Event Statistics (`nfsclient_events_*`)**: `inoderevalidates`, `dentryrevalidates`, `datainvalidates`, etc. +- **Operation Statistics (`nfsclient_ops_*`)**: + - `ops`: Total number of requests for the operation + - `trans`: Number of RPC requests transmitted + - `timeouts`: Number of timeouts + - `bytes_sent` / `bytes_recv`: Bytes sent and received for the operation + - `queue_time_ms`: Time spent waiting in the queue (in milliseconds) + - `response_time_ms`: Time spent waiting for the server to respond (in milliseconds) + - `total_time_ms`: Total execution time (in milliseconds) + - `errors`: Number of operational errors + +*Note: Each NFS operation (such as READ, WRITE, GETATTR) generates a corresponding set of `nfsclient_ops_*` metrics, distinguished by the `operation` label.* + +## Dashboards + +A companion Dashboard (`dashboard.json`) is provided in this directory. It can be used to monitor the read/write throughput, latency (Response Time / Queue Time), and timeout errors for each mount point. 
diff --git a/inputs/nfsclient/README_CN.md b/inputs/nfsclient/README_CN.md new file mode 100644 index 000000000..8d7cf66c3 --- /dev/null +++ b/inputs/nfsclient/README_CN.md @@ -0,0 +1,48 @@ +# NFS Client 采集插件 + +该插件用于采集主机上作为 NFS 客户端挂载的网络文件系统(NFS)的性能与操作统计数据。 +它通过读取系统的 `/proc/self/mountstats` 文件来收集诸如读写字节数、各项 NFS 操作(如 `GETATTR`, `READ`, `WRITE` 等)的请求次数及延迟指标。 + +**支持平台:** Linux + +## 配置说明 + +```toml +# 采集 NFS 客户端指标 +# interval = 60 + +[[instances]] +# 是否采集全量的 NFS 操作指标(默认只采集常用的关键操作) +fullstat = false + +# 包含/排除特定的挂载点 +# include_mounts = ["/mnt/nfs_share1"] +# exclude_mounts = ["/mnt/backup"] + +# 包含/排除特定的 NFS 操作类型(大写,例如 "READ", "WRITE") +# include_operations = [] +# exclude_operations = [] +``` + +## 采集指标 + +该插件支持 NFSv3 和 NFSv4,所有输出指标都会附带 `mountpoint`、`server` (NFS 服务端地址) 和 `export` (挂载的路径) 标签。 + +主要指标分类如下: +- **字节统计 (`nfsclient_bytes_*`)**: `read`, `write`, `direct_read`, `direct_write` +- **事件统计 (`nfsclient_events_*`)**: `inoderevalidates`, `dentryrevalidates`, `datainvalidates` 等 +- **操作统计 (`nfsclient_ops_*`)**: + - `ops`: 操作的总请求次数 + - `trans`: 发送的 RPC 请求次数 + - `timeouts`: 超时次数 + - `bytes_sent` / `bytes_recv`: 该操作发送和接收的字节数 + - `queue_time_ms`: 在队列中等待的时间 (单位:毫秒) + - `response_time_ms`: 服务端响应时间 (单位:毫秒) + - `total_time_ms`: 总耗时 (单位:毫秒) + - `errors`: 操作错误数 + +*注意:每种 NFS 操作(如 READ, WRITE, GETATTR)都会生成对应的一组 `nfsclient_ops_*` 指标,并通过 `operation` 标签进行区分。* + +## 监控大盘 + +本目录下提供了一个配套的 Dashboard (`dashboard.json`),可用于监控各挂载点的读写吞吐量、读写延迟(Response Time / Queue Time)以及超时错误等情况。 diff --git a/inputs/nfsclient/dashboard.json b/inputs/nfsclient/dashboard.json new file mode 100644 index 000000000..d956b39e0 --- /dev/null +++ b/inputs/nfsclient/dashboard.json @@ -0,0 +1,93 @@ +{ + "title": "NFS Client Metrics", + "uid": "bd54ba2a", + "tags": [ + "nfs client metrics" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "NFS Client Read/Write Bytes/s", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + 
"expr": "rate(nfsclient_bytes_read[5m])", + "legendFormat": "Read: {{mountpoint}}", + "refId": "A" + }, + { + "expr": "rate(nfsclient_bytes_write[5m])", + "legendFormat": "Write: {{mountpoint}}", + "refId": "B" + } + ] + }, + { + "title": "NFS Client Response Time (ms)", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "rate(nfsclient_ops_response_time_ms[5m]) / rate(nfsclient_ops_ops[5m])", + "legendFormat": "{{mountpoint}} ({{operation}})", + "refId": "A" + } + ] + }, + { + "title": "NFS Client Operations/s", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "rate(nfsclient_ops_ops[5m])", + "legendFormat": "{{mountpoint}} ({{operation}})", + "refId": "A" + } + ] + }, + { + "title": "NFS Client Timeouts/s", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "rate(nfsclient_ops_timeouts[5m])", + "legendFormat": "{{mountpoint}} ({{operation}})", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/nginx/README.md b/inputs/nginx/README.md index 1fda948b3..734409be9 100644 --- a/inputs/nginx/README.md +++ b/inputs/nginx/README.md @@ -1,4 +1,4 @@ -- 该插件依赖 **nginx** 的 **http_stub_status_module** +- 该插件依赖 **nginx** 的 **http_stub_status_module** # 应用场景 一般用于业务系统做对外或对外路由映射时使用代理服务,是运维最常见且最重要的代理工具。 @@ -93,7 +93,7 @@ server { } 浏览器访问https://nginx.domains.com出现: -Active connections: 5 +Active connections: 5 server accepts handled requests 90837 90837 79582 Reading: 0 Writing: 1 Waiting: 4 @@ -187,12 +187,12 @@ journalctl -f -n 500 -u categraf | grep "E\!" | grep "W\!" 
# 监控告警规则配置 - +``` +``` 个人经验仅供参考: - -- 超过2000毫秒,为P2级别,启用企业微信应用推送告警,3分钟内恢复发出恢复告警。 -- 超过5000毫秒,为P1级别,启用电话语音告警&企业微信应用告警,3分钟内恢复发出恢复告警。 - +超过2000毫秒,为P2级别,启用企业微信应用推送告警,3分钟内恢复发出恢复告警。 +超过5000毫秒,为P1级别,启用电话语音告警&企业微信应用告警,3分钟内恢复发出恢复告警。 +``` # 监控图表配置 diff --git a/inputs/node_exporter/README.md b/inputs/node_exporter/README.md new file mode 100644 index 000000000..111703644 --- /dev/null +++ b/inputs/node_exporter/README.md @@ -0,0 +1,36 @@ +# Node Exporter Input Plugin + +This plugin directly integrates the core logic of the official Prometheus [node_exporter](https://github.com/prometheus/node_exporter) to collect comprehensive hardware and OS metrics for *nix systems. +Compared to Categraf's native plugins (like `cpu`, `mem`, `disk`), this plugin provides a 100% compatible metric set with the official `node_exporter`, making it easy for users to reuse the rich ecosystem of community-provided Grafana dashboards and alert rules based on node_exporter. + +**Supported Platforms:** Linux, macOS, BSD, etc. + +## Configuration + +```toml +# Collect Node Exporter compatible metrics +# interval = 15 + +[[instances]] +# Typically, you just need to enable this plugin. +# If you need to toggle specific collectors, you can pass arguments to categraf's startup command line. +# Example: --collector.textfile.directory=/var/lib/node_exporter/textfile_collector +``` + +*Note: When enabling the `node_exporter` plugin in Categraf, its metrics may semantically overlap with Categraf's native basic plugins (`cpu`, `mem`, `disk`, etc.). It is generally recommended to either use Categraf's native basic plugin suite or solely enable this `node_exporter` plugin on a single machine.* + +## Metrics + +All metrics strictly follow the official Prometheus `node_exporter` naming conventions and are generally prefixed with `node_`. 
For example: +- `node_cpu_seconds_total` +- `node_memory_MemAvailable_bytes` +- `node_network_receive_bytes_total` +- `node_filesystem_free_bytes` +- `node_disk_read_bytes_total` + +For detailed descriptions of the collectors, please refer directly to the [official node_exporter documentation](https://github.com/prometheus/node_exporter). + +## Dashboards + +Because this plugin is 100% compatible with the open-source `node_exporter`, you can directly import popular community dashboards in Grafana (e.g., Dashboard ID: 1860 "Node Exporter Full"). +A minimalistic basic monitoring Dashboard (`dashboard.json`) is also provided in this directory for quick validation of data collection. diff --git a/inputs/node_exporter/README_CN.md b/inputs/node_exporter/README_CN.md new file mode 100644 index 000000000..457b4df9e --- /dev/null +++ b/inputs/node_exporter/README_CN.md @@ -0,0 +1,36 @@ +# Node Exporter 采集插件 + +该插件直接集成了 Prometheus 官方的 [node_exporter](https://github.com/prometheus/node_exporter) 核心逻辑,用于采集 *nix 类系统的全面硬件和操作系统指标。 +相比于原生的 Categraf 插件 (如 `cpu`, `mem`, `disk` 等),该插件能够提供和官方 `node_exporter` 100% 一致的指标集,方便用户直接复用社区中丰富的基于 node_exporter 的 Grafana 看板和告警规则。 + +**支持平台:** Linux, macOS, BSD 等 + +## 配置说明 + +```toml +# 采集 Node Exporter 兼容指标 +# interval = 15 + +[[instances]] +# 通常只需启用该插件即可。 +# 如果有特别的 collector 开启/关闭需求,您可以在 categraf 的命令行启动参数中传入 +# 例如:--collector.textfile.directory=/var/lib/node_exporter/textfile_collector +``` + +*注意:在 Categraf 中启用 `node_exporter` 插件时,可能会与 Categraf 自带的 `cpu`, `mem`, `disk` 等基础插件在语义上存在一定重叠,通常建议在一个机器上:要么使用 Categraf 自身的基础插件套餐,要么只开启这一个 `node_exporter` 插件。* + +## 采集指标 + +所有的指标均遵循 Prometheus 官方 `node_exporter` 的命名规范,通常以 `node_` 开头。例如: +- `node_cpu_seconds_total` +- `node_memory_MemAvailable_bytes` +- `node_network_receive_bytes_total` +- `node_filesystem_free_bytes` +- `node_disk_read_bytes_total` + +更多关于采集器的具体说明,请直接参考 [node_exporter 官方文档](https://github.com/prometheus/node_exporter)。 + +## 监控大盘 + +由于此插件 100% 兼容开源 `node_exporter`,您可以直接在 Grafana 
导入社区流行的 `Node Exporter Full` 看板 (如 Dashboard ID: 1860)。 +本目录下也为您提供了一个极简的基础监控 Dashboard (`dashboard.json`),用于快速验证数据采集是否正常。 diff --git a/inputs/node_exporter/dashboard.json b/inputs/node_exporter/dashboard.json new file mode 100644 index 000000000..ed47a1b79 --- /dev/null +++ b/inputs/node_exporter/dashboard.json @@ -0,0 +1,98 @@ +{ + "title": "Node Exporter Basic Metrics", + "uid": "631db07a", + "tags": [ + "node exporter basic metrics" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Node CPU Usage %", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "100 - (avg by (agent_hostname) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)", + "legendFormat": "{{agent_hostname}}", + "refId": "A" + } + ] + }, + { + "title": "Node Memory Usage %", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "100 * (1 - ((node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes) / node_memory_MemTotal_bytes))", + "legendFormat": "{{agent_hostname}}", + "refId": "A" + } + ] + }, + { + "title": "Node Disk Read/Write Bytes", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "rate(node_disk_read_bytes_total[5m])", + "legendFormat": "Read: {{device}}", + "refId": "A" + }, + { + "expr": "rate(node_disk_written_bytes_total[5m])", + "legendFormat": "Write: {{device}}", + "refId": "B" + } + ] + }, + { + "title": "Node Network Traffic Bytes", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "rate(node_network_receive_bytes_total[5m])", + "legendFormat": "In: {{device}}", + "refId": "A" + }, + { + "expr": "rate(node_network_transmit_bytes_total[5m])", + "legendFormat": "Out: {{device}}", + "refId": "B" + } + ] + } + ], + "refresh": "1m", + 
"time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/nsq/README.md b/inputs/nsq/README.md index 3ada9ca1d..b02cfc934 100644 --- a/inputs/nsq/README.md +++ b/inputs/nsq/README.md @@ -1,54 +1,46 @@ -# nsq -forked from [telegraf/nsq](https://github.com/influxdata/telegraf/blob/master/plugins/inputs/nsq/nsq.go) +# NSQ Input Plugin + +This plugin collects metrics from [NSQ](https://nsq.io/), a realtime distributed messaging platform. +It is forked from [telegraf/nsq](https://github.com/influxdata/telegraf/blob/master/plugins/inputs/nsq/nsq.go). + ## Configuration -- 配置文件,[参考示例](../../conf/input.nsq/nsq.toml) - -## 指标列表 -### nsq_client类 -ready_count 可消费消息数 -inflight_count 正在处理消息数 -message_count 消息总数 -finish_count 完成统计 -requeue_count 重新排队消息数 - -### nsq_channel类 -depth 当前的积压量 -backend_depth 消息缓冲队列积压量 -inflight_count 正在处理消息数 -deferred_count 延迟消息数 -message_count 消息总数 -requeue_count 重新排队消息数 -timeout_count 超时消息数 -client_count 客户端数量 - -### nsq_topic类 -depth 消息队列积压量 -backend_depth 消息缓冲队列积压量 -message_count 消息总数 -channel_count 消费者总数 - -## metrics -此配置可 克隆到nightingale的metrics.yaml文件中作为中文指标解释 -# [nsq] - nsq_server_server_count: "nsq 服务端总计" - nsq_server_topic_count: "nsq topic总数" - - nsq_topic_depth: 消息队列积压量 - nsq_topic_backend_depth: 消息缓冲队列积压量 - nsq_topic_message_count: 消息总数 - nsq_topic_channel_count: 消费者总数 - - nsq_channel_depth: "当前消息数,内存和硬盘转存的消息数,即当前的积压量" - nsq_channel_backend_depth: 消息缓冲队列积压量 - nsq_channel_inflight_count: "当前未完成的消息数,包括发送但未返回FIN/重新入队列REQ/超时TIMEOUT 三种消息数之和,代表已经投递还未消费掉的消息" - nsq_channel_deferred_count: "重新入队的延迟消息数,指还未发布的重入队消息数量,即未消费的定时(延时)消息数" - nsq_channel_message_count: 节点启动后的所有新消息总数,真正的消息次数 - nsq_channel_requeue_count: 重新入队的消息数,即返回REQ的消息数量 - nsq_channel_timeout_count: 已重入队列但按配置的超时时间内还收到响应的消息数 - nsq_channel_client_count: 客户端连接数 - - nsq_client_ready_count: 客户端可消费消息数 - nsq_client_inflight_count: 客户端正在处理消息数 - nsq_client_message_count: 客户端消息总数 - nsq_client_finish_count: 客户端完成的消息数,即返回FIN的消息数 - nsq_client_requeue_count: 
客户端重新入队的消息数,即返回REQ的消息数量 + +For configuration options, please refer to the [example configuration](../../conf/input.nsq/nsq.toml). + +```toml +# Collect NSQ metrics +# interval = 15 + +[[instances]] +# endpoints: an array of NSQd or NSQlookupd HTTP API URLs +endpoints = ["http://localhost:4151"] +``` + +## Metrics + +### nsq_client Metrics +- `ready_count`: Number of messages the client is ready to receive +- `inflight_count`: Number of messages currently in-flight +- `message_count`: Total number of messages received +- `finish_count`: Total number of finished (FIN) messages +- `requeue_count`: Total number of requeued (REQ) messages + +### nsq_channel Metrics +- `depth`: Total number of messages in the channel (memory + disk backlog) +- `backend_depth`: Number of messages in the disk queue +- `inflight_count`: Number of in-flight messages (delivered but not yet FIN/REQ/TIMEOUT) +- `deferred_count`: Number of deferred (delayed) messages +- `message_count`: Total number of messages processed since startup +- `requeue_count`: Total number of requeued messages +- `timeout_count`: Total number of timed-out messages +- `client_count`: Number of clients connected to this channel + +### nsq_topic Metrics +- `depth`: Total number of messages in the topic queue +- `backend_depth`: Number of messages in the topic disk queue +- `message_count`: Total number of messages received +- `channel_count`: Total number of channels connected to the topic + +## Dashboards + +A basic Dashboard (`dashboard.json`) is provided in this directory to monitor topic depth, channel depth, topic message throughput, and in-flight/deferred messages per channel. 
diff --git a/inputs/nsq/README_CN.md b/inputs/nsq/README_CN.md new file mode 100644 index 000000000..3ada9ca1d --- /dev/null +++ b/inputs/nsq/README_CN.md @@ -0,0 +1,54 @@ +# nsq +forked from [telegraf/nsq](https://github.com/influxdata/telegraf/blob/master/plugins/inputs/nsq/nsq.go) +## Configuration +- 配置文件,[参考示例](../../conf/input.nsq/nsq.toml) + +## 指标列表 +### nsq_client类 +ready_count 可消费消息数 +inflight_count 正在处理消息数 +message_count 消息总数 +finish_count 完成统计 +requeue_count 重新排队消息数 + +### nsq_channel类 +depth 当前的积压量 +backend_depth 消息缓冲队列积压量 +inflight_count 正在处理消息数 +deferred_count 延迟消息数 +message_count 消息总数 +requeue_count 重新排队消息数 +timeout_count 超时消息数 +client_count 客户端数量 + +### nsq_topic类 +depth 消息队列积压量 +backend_depth 消息缓冲队列积压量 +message_count 消息总数 +channel_count 消费者总数 + +## metrics +此配置可 克隆到nightingale的metrics.yaml文件中作为中文指标解释 +# [nsq] + nsq_server_server_count: "nsq 服务端总计" + nsq_server_topic_count: "nsq topic总数" + + nsq_topic_depth: 消息队列积压量 + nsq_topic_backend_depth: 消息缓冲队列积压量 + nsq_topic_message_count: 消息总数 + nsq_topic_channel_count: 消费者总数 + + nsq_channel_depth: "当前消息数,内存和硬盘转存的消息数,即当前的积压量" + nsq_channel_backend_depth: 消息缓冲队列积压量 + nsq_channel_inflight_count: "当前未完成的消息数,包括发送但未返回FIN/重新入队列REQ/超时TIMEOUT 三种消息数之和,代表已经投递还未消费掉的消息" + nsq_channel_deferred_count: "重新入队的延迟消息数,指还未发布的重入队消息数量,即未消费的定时(延时)消息数" + nsq_channel_message_count: 节点启动后的所有新消息总数,真正的消息次数 + nsq_channel_requeue_count: 重新入队的消息数,即返回REQ的消息数量 + nsq_channel_timeout_count: 已重入队列但按配置的超时时间内还收到响应的消息数 + nsq_channel_client_count: 客户端连接数 + + nsq_client_ready_count: 客户端可消费消息数 + nsq_client_inflight_count: 客户端正在处理消息数 + nsq_client_message_count: 客户端消息总数 + nsq_client_finish_count: 客户端完成的消息数,即返回FIN的消息数 + nsq_client_requeue_count: 客户端重新入队的消息数,即返回REQ的消息数量 diff --git a/inputs/nsq/dashboard.json b/inputs/nsq/dashboard.json new file mode 100644 index 000000000..fb1019103 --- /dev/null +++ b/inputs/nsq/dashboard.json @@ -0,0 +1,93 @@ +{ + "title": "NSQ Metrics", + "uid": "528538cb", + "tags": [ + "nsq metrics" + ], + "timezone": "browser", + 
"schemaVersion": 30, + "panels": [ + { + "title": "NSQ Topic Message Depth", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "nsq_topic_depth", + "legendFormat": "Topic: {{topic}}", + "refId": "A" + } + ] + }, + { + "title": "NSQ Channel Message Depth", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "nsq_channel_depth", + "legendFormat": "Channel: {{channel}} ({{topic}})", + "refId": "A" + } + ] + }, + { + "title": "NSQ Topic Messages/s", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "rate(nsq_topic_message_count[5m])", + "legendFormat": "Topic: {{topic}}", + "refId": "A" + } + ] + }, + { + "title": "NSQ Channel Inflight/Deferred", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "nsq_channel_inflight_count", + "legendFormat": "Inflight: {{channel}}", + "refId": "A" + }, + { + "expr": "nsq_channel_deferred_count", + "legendFormat": "Deferred: {{channel}}", + "refId": "B" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/nvidia_smi/README.md b/inputs/nvidia_smi/README.md index 5b039a876..400d7c24f 100644 --- a/inputs/nvidia_smi/README.md +++ b/inputs/nvidia_smi/README.md @@ -1,33 +1,43 @@ -# nvidia_smi +# NVIDIA SMI Input Plugin -该采集插件的原理,就是读取 nvidia-smi 的内容输出,转换为监控数据上报。是把 [nvidia_gpu_exporter](https://github.com/utkuozdemir/nvidia_gpu_exporter) 的代码给集成过来了。 +This plugin collects metrics by reading the output of the `nvidia-smi` command-line tool. It integrates the core code of [nvidia_gpu_exporter](https://github.com/utkuozdemir/nvidia_gpu_exporter). 
+ +**Supported Platforms:** Linux, Windows (Requires NVIDIA GPU drivers and the `nvidia-smi` utility installed) ## Configuration -配置文件在 `conf/input.nvidia_smi/nvidia_smi.toml` +The configuration file is located at `conf/input.nvidia_smi/nvidia_smi.toml` ```toml -# # collect interval +# Collect NVIDIA GPU status # interval = 15 -# 下面这个配置是最重要的配置,如果要采集 nvidia-smi 的信息,就打开下面的配置, -# 给出 nvidia-smi 命令的路径,最好是给绝对路径 -# 相当于让 Categraf 执行本机的 nvidia-smi 命令,获取本机 GPU 的状态信息 -# exec local command -# nvidia_smi_command = "nvidia-smi" +[[instances]] +# The following option is critical. To collect nvidia-smi information, uncomment it and provide the absolute path to the nvidia-smi command. +# This instructs Categraf to execute the local nvidia-smi command to get the GPU status. +# nvidia_smi_command = "/usr/bin/nvidia-smi" -# 如果想远程方式采集远端机器的 GPU 状态信息,可以使用 ssh 命令,登录远端机器 -# 在远端机器执行 nvidia-smi 的命令输出,通常 Categraf 是部署在每个物理机上的 -# 所以,ssh 这种方式,理论上用不到 -# exec remote command +# If you want to remotely collect GPU status from another machine, you can use an ssh command. +# (Since Categraf is usually deployed on every physical machine, SSH is rarely needed in practice) # nvidia_smi_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null SSH_USER@SSH_HOST nvidia-smi" -# Comma-separated list of the query fields. -# You can find out possible fields by running `nvidia-smi --help-query-gpus`. -# The value `AUTO` will automatically detect the fields to query. +# Comma-separated list of query fields. You can find out possible fields by running `nvidia-smi --help-query-gpus`. +# Setting the value to `AUTO` will automatically detect and query all supported fields. query_field_names = "AUTO" ``` -## TODO +## Metrics + +This plugin supports hundreds of GPU metrics depending on the driver version and GPU model. All metrics are prefixed with `nvidia_smi_` and automatically tagged with identifiers like `uuid` and `name` (e.g., Tesla T4). 
+ +Key metrics to monitor include: +- `nvidia_smi_utilization_gpu_ratio`: GPU computation utilization (0~1) +- `nvidia_smi_utilization_memory_ratio`: Memory bandwidth utilization (0~1) +- `nvidia_smi_memory_used_bytes` / `nvidia_smi_memory_total_bytes`: GPU memory usage and capacity +- `nvidia_smi_temperature_gpu`: GPU core temperature (Celsius) +- `nvidia_smi_power_draw_watts`: Current GPU power consumption +- `nvidia_smi_fan_speed_ratio`: Fan speed percentage + +## Dashboards -GPU 卡已经关注哪些监控指标,缺少监控大盘JSON和告警规则JSON,欢迎大家 PR \ No newline at end of file +A companion basic Dashboard (`dashboard.json`) is provided in this directory to help you quickly set up visualization for GPU utilization, memory usage, temperature, and power consumption. diff --git a/inputs/nvidia_smi/README_CN.md b/inputs/nvidia_smi/README_CN.md new file mode 100644 index 000000000..2600d28f1 --- /dev/null +++ b/inputs/nvidia_smi/README_CN.md @@ -0,0 +1,43 @@ +# NVIDIA SMI 采集插件 + +该采集插件的原理是读取 `nvidia-smi` 命令行工具的输出,并转换为监控指标进行上报。它集成了 [nvidia_gpu_exporter](https://github.com/utkuozdemir/nvidia_gpu_exporter) 的核心代码。 + +**支持平台:** Linux, Windows (需安装 NVIDIA 显卡驱动并具备 `nvidia-smi` 命令) + +## 配置说明 + +配置文件位于 `conf/input.nvidia_smi/nvidia_smi.toml` + +```toml +# 采集 NVIDIA GPU 状态 +# interval = 15 + +[[instances]] +# 下面的配置是最核心的配置。如果要采集 nvidia-smi 的信息,请取消注释并给出 nvidia-smi 命令的绝对路径。 +# 相当于让 Categraf 执行本机的 nvidia-smi 命令,获取本机 GPU 的状态信息 +# nvidia_smi_command = "/usr/bin/nvidia-smi" + +# 如果想远程采集远端机器的 GPU 状态,可以使用 ssh 命令登录远端机器执行。 +# (由于 Categraf 通常是部署在每台物理机上的,因此绝大多数情况下不需要 SSH 方式) +# nvidia_smi_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null SSH_USER@SSH_HOST nvidia-smi" + +# 逗号分隔的查询字段列表。你可以运行 `nvidia-smi --help-query-gpus` 来查看所有支持的字段。 +# 填写 `AUTO` 将自动检测并采集支持的全部字段。 +query_field_names = "AUTO" +``` + +## 采集指标 + +该插件支持采集数百种 GPU 指标(具体取决于驱动版本和显卡型号),所有的指标均以 `nvidia_smi_` 作为前缀,并默认带有 `uuid`、`name` (如 Tesla T4) 等显卡标识的标签。 + +重点关注的指标有: +- `nvidia_smi_utilization_gpu_ratio`: GPU 算力利用率 (0~1) +- 
`nvidia_smi_utilization_memory_ratio`: 显存带宽利用率 (0~1) +- `nvidia_smi_memory_used_bytes` / `nvidia_smi_memory_total_bytes`: 显存使用量与总量 +- `nvidia_smi_temperature_gpu`: GPU 核心温度 (摄氏度) +- `nvidia_smi_power_draw_watts`: GPU 当前功耗 +- `nvidia_smi_fan_speed_ratio`: 风扇转速百分比 + +## 监控大盘 + +本目录下提供了一个配套的基础 Dashboard (`dashboard.json`),帮助您快速建立 GPU 的利用率、显存使用情况、温度与功耗的监控可视化体系。 \ No newline at end of file diff --git a/inputs/nvidia_smi/dashboard.json b/inputs/nvidia_smi/dashboard.json new file mode 100644 index 000000000..6235ed7bf --- /dev/null +++ b/inputs/nvidia_smi/dashboard.json @@ -0,0 +1,88 @@ +{ + "title": "NVIDIA GPU Metrics", + "uid": "ea32322d", + "tags": [ + "nvidia gpu metrics" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "GPU Utilization (%)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "nvidia_smi_utilization_gpu_ratio * 100", + "legendFormat": "{{name}} ({{uuid}})", + "refId": "A" + } + ] + }, + { + "title": "GPU Memory Utilization (%)", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "nvidia_smi_utilization_memory_ratio * 100", + "legendFormat": "{{name}} ({{uuid}})", + "refId": "A" + } + ] + }, + { + "title": "GPU Temperature (\u00b0C)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "nvidia_smi_temperature_gpu", + "legendFormat": "{{name}} ({{uuid}})", + "refId": "A" + } + ] + }, + { + "title": "GPU Power Draw (Watts)", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "nvidia_smi_power_draw_watts", + "legendFormat": "{{name}} ({{uuid}})", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/oracle/README.md 
b/inputs/oracle/README.md index 49d4689f0..938abb883 100644 --- a/inputs/oracle/README.md +++ b/inputs/oracle/README.md @@ -56,8 +56,6 @@ cd /opt/categraf nohup ./categraf &> stdout.log & ``` -如果遇到报错 `ORA-01804: Error while trying to retrieve text for error ORA-01804` 请添加`ORACLE_HOME`环境变量 - ## 监控大盘 本 README 文件的同级目录下,提供了 dashboard.json 就是 Oracle 的监控大盘,可以导入夜莺使用。 diff --git a/inputs/phpfpm/README.md b/inputs/phpfpm/README.md index 64ccbd45d..3063f752d 100644 --- a/inputs/phpfpm/README.md +++ b/inputs/phpfpm/README.md @@ -1,24 +1,57 @@ -# nginx +# PHP-FPM Input Plugin -*PHP-FPM* 监控采集插件,由telegraf的phpfpm改造而来。 +This plugin monitors and collects process status metrics from PHP-FPM. It is adapted from the telegraf phpfpm plugin and supports connecting to the PHP-FPM status page via HTTP URLs or Unix Sockets. -该插件需要更改phpfpm的配置文件,开启 *pm.status_path*配置项 -``` +## Prerequisites + +Before using this plugin, you must modify the PHP-FPM configuration file (typically `www.conf`) to enable the `pm.status_path` directive: + +```ini pm.status_path = /status ``` - +After making this change, restart the PHP-FPM process for it to take effect. +If you use Nginx as a reverse proxy, ensure Nginx is also configured to forward the `/status` route to PHP-FPM. ## Configuration -请参考配置[示例](../../conf/input.phpfpm/phpfpm.toml)文件 +You can collect metrics from multiple PHP-FPM instances. Check `conf/input.phpfpm/phpfpm.toml` for details: + +```toml +# Collect PHP-FPM status +# interval = 15 + +[[instances]] +# URLs can be HTTP endpoints or Unix Sockets +# For example: +# urls = ["http://localhost/status", "unix:///var/run/php5-fpm.sock"] +urls = ["http://127.0.0.1/status"] + +# Notes: +# 1. The following timeout and authentication settings ONLY apply to HTTP URLs: +# response_timeout = "5s" +# username = "" +# password = "" +# headers = ["X-Custom-Header: value"] +# TLS/SSL configurations also only apply to HTTP. + +# 2. 
If you are using a Unix socket, you must ensure that the user running Categraf has read permissions for the socket file. +``` + +## Metrics + +All metrics are prefixed with `phpfpm_` and include `url` and `pool` tags by default. Key metrics include: + +- `phpfpm_accepted_conn`: Total number of requests accepted +- `phpfpm_listen_queue`: Number of requests in the queue of pending connections +- `phpfpm_max_listen_queue`: Maximum number of requests in the queue of pending connections since FPM has started +- `phpfpm_listen_queue_len`: The size of the socket queue of pending connections +- `phpfpm_idle_processes`: Number of idle processes +- `phpfpm_active_processes`: Number of active processes +- `phpfpm_total_processes`: Total number of idle + active processes +- `phpfpm_max_active_processes`: Maximum number of active processes since FPM has started +- `phpfpm_max_children_reached`: Number of times the process limit `pm.max_children` has been reached +- `phpfpm_slow_requests`: Number of slow requests (requires PHP-FPM slowlog to be enabled) - ### 注意事项: - 1. 如下配置 仅生效于HTTP的url - - response_timeout - - username & password - - headers - - TLS config - 2. 如果使用 Unix socket,需要保证 categraf 和 socket path 在同一个主机上,且 categraf 运行用户拥有读取该 path 的权限。 -## 监控大盘和告警规则 +## Dashboards -待更新... \ No newline at end of file +A basic companion Dashboard (`dashboard.json`) is provided in this directory to monitor the PHP-FPM listen queue backlog, the process distribution (idle vs. active), the accepted request rate, and how often the `pm.max_children` limit has been reached. 
diff --git a/inputs/phpfpm/README_CN.md b/inputs/phpfpm/README_CN.md new file mode 100644 index 000000000..135760852 --- /dev/null +++ b/inputs/phpfpm/README_CN.md @@ -0,0 +1,57 @@ +# PHP-FPM 采集插件 + +该插件用于监控采集 PHP-FPM 的进程状态指标。它由 telegraf 的 phpfpm 插件改造而来,支持通过 HTTP URL 或 Unix Socket 连接到 PHP-FPM 的状态页获取数据。 + +## 前置要求 + +使用该插件前,必须修改 PHP-FPM 的配置文件(通常是 `www.conf`),开启 `pm.status_path` 配置项: + +```ini +pm.status_path = /status +``` +修改完成后,请重启 PHP-FPM 进程使配置生效。 +如果您使用 Nginx 反向代理,请确保 Nginx 也配置了对应的 `/status` 路由转发到 PHP-FPM。 + +## 配置说明 + +支持采集多个 PHP-FPM 实例,详细配置见 `conf/input.phpfpm/phpfpm.toml`: + +```toml +# 采集 PHP-FPM 状态 +# interval = 15 + +[[instances]] +# URLs 支持 HTTP 协议 或 Unix Socket +# 例如: +# urls = ["http://localhost/status", "unix:///var/run/php5-fpm.sock"] +urls = ["http://127.0.0.1/status"] + +# 注意事项: +# 1. 如下超时、认证等配置仅对 HTTP URL 生效: +# response_timeout = "5s" +# username = "" +# password = "" +# headers = ["X-Custom-Header: value"] +# TLS/SSL 配置同样仅对 HTTP 生效。 + +# 2. 如果使用 Unix socket,需要保证 Categraf 运行用户拥有读取该 socket 文件的权限。 +``` + +## 采集指标 + +所有指标前缀为 `phpfpm_`,默认携带 `url` 和 `pool` 标签。主要指标包括: + +- `phpfpm_accepted_conn`: 累计接收的请求总数 +- `phpfpm_listen_queue`: 请求等待队列中当前的请求数 +- `phpfpm_max_listen_queue`: 请求等待队列历史最高数 +- `phpfpm_listen_queue_len`: 配置的等待队列最大长度 +- `phpfpm_idle_processes`: 当前空闲的进程数 +- `phpfpm_active_processes`: 当前活跃(正在处理请求)的进程数 +- `phpfpm_total_processes`: 当前总进程数 +- `phpfpm_max_active_processes`: 历史最多同时活跃的进程数 +- `phpfpm_max_children_reached`: 进程数达到 `pm.max_children` 限制的次数 +- `phpfpm_slow_requests`: 处理慢的请求数 (需开启 PHP-FPM 的慢日志) + +## 监控大盘 + +本目录下提供了一个配套的基础 Dashboard (`dashboard.json`),可用于监控 PHP-FPM 连接池的拥挤情况、进程数分布(空闲 vs 活跃)、以及达到最大子进程数限制的告警指标。 \ No newline at end of file diff --git a/inputs/phpfpm/dashboard.json b/inputs/phpfpm/dashboard.json new file mode 100644 index 000000000..b533b6d31 --- /dev/null +++ b/inputs/phpfpm/dashboard.json @@ -0,0 +1,98 @@ +{ + "title": "PHP-FPM Status", + "uid": "84b323a9", + "tags": [ + "php-fpm status" + ], + "timezone": "browser", + 
"schemaVersion": 30, + "panels": [ + { + "title": "PHP-FPM Active vs Idle Processes", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "phpfpm_active_processes", + "legendFormat": "Active: {{pool}}", + "refId": "A" + }, + { + "expr": "phpfpm_idle_processes", + "legendFormat": "Idle: {{pool}}", + "refId": "B" + } + ] + }, + { + "title": "PHP-FPM Listen Queue", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "phpfpm_listen_queue", + "legendFormat": "Current Queue: {{pool}}", + "refId": "A" + }, + { + "expr": "phpfpm_max_listen_queue", + "legendFormat": "Max Queue: {{pool}}", + "refId": "B" + } + ] + }, + { + "title": "PHP-FPM Requests Accepted/s", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "rate(phpfpm_accepted_conn[5m])", + "legendFormat": "{{pool}}", + "refId": "A" + } + ] + }, + { + "title": "PHP-FPM Max Children Reached", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "increase(phpfpm_max_children_reached[5m])", + "legendFormat": "Limit Hit: {{pool}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/ping/README.md b/inputs/ping/README.md index 3fa794007..ec34ff861 100644 --- a/inputs/ping/README.md +++ b/inputs/ping/README.md @@ -3,9 +3,6 @@ ping 监控插件,探测远端目标地址能否 ping 通,如果机器没有禁 ping,这就是一个很好用的探测机器存活的手段 ## Configuration -这个插件有两种主要的操作方法:`exec` 和 `native`.推荐使用 `native` 方法,因为它具有更好的系统兼容性和性能. -为了向后兼容和更精准的response_ms,`native` 方法是默认的. -使用 `method = "exec"`,将会调用系统ping程序来发送ping packets. 
要探测的机器配置到 targets 中,targets 是个数组,可以配置多个,当然也可以拆成多个 `[[instances]]` 配置段,比如: @@ -85,25 +82,3 @@ When using `method = "native"`, you will need permissions similar to the executa 大盘地址 [dashboard-2.0.json](https://github.com/flashcatcloud/categraf/tree/main/inputs/ping/dashboard-2.0.json) -## Example Output - -```text -ping_maximum_response_ms agent_hostname=zy-fat product=n9e region=cloud target=10.0.24.136 0.036 -ping_packets_transmitted agent_hostname=zy-fat product=n9e region=cloud target=10.0.24.136 1 -ping_packets_received agent_hostname=zy-fat product=n9e region=cloud target=10.0.24.136 1 -ping_average_response_ms agent_hostname=zy-fat product=n9e region=cloud target=10.0.24.136 0.036 -ping_minimum_response_ms agent_hostname=zy-fat product=n9e region=cloud target=10.0.24.136 0.036 -ping_standard_deviation_ms agent_hostname=zy-fat product=n9e region=cloud target=10.0.24.136 0 -ping_result_code agent_hostname=zy-fat product=n9e region=cloud target=10.0.24.136 0 -ping_percent_packet_loss agent_hostname=zy-fat product=n9e region=cloud target=10.0.24.136 0 -ping_ttl agent_hostname=zy-fat product=n9e region=cloud target=10.0.24.136 64 -ping_minimum_response_ms agent_hostname=zy-fat product=n9e region=cloud target=www.baidu.com 20.935 -ping_average_response_ms agent_hostname=zy-fat product=n9e region=cloud target=www.baidu.com 20.935 -ping_standard_deviation_ms agent_hostname=zy-fat product=n9e region=cloud target=www.baidu.com 0 -ping_result_code agent_hostname=zy-fat product=n9e region=cloud target=www.baidu.com 0 -ping_packets_transmitted agent_hostname=zy-fat product=n9e region=cloud target=www.baidu.com 1 -ping_packets_received agent_hostname=zy-fat product=n9e region=cloud target=www.baidu.com 1 -ping_ttl agent_hostname=zy-fat product=n9e region=cloud target=www.baidu.com 50 -ping_percent_packet_loss agent_hostname=zy-fat product=n9e region=cloud target=www.baidu.com 0 -ping_maximum_response_ms agent_hostname=zy-fat product=n9e region=cloud target=www.baidu.com 
20.935 -``` \ No newline at end of file diff --git a/inputs/processes/README.md b/inputs/processes/README.md index 80b72347e..641dbb30b 100644 --- a/inputs/processes/README.md +++ b/inputs/processes/README.md @@ -1,7 +1,37 @@ -# processes +# Processes Input Plugin -统计进程数量,比如 running 的有多少,sleeping 的有多少,total 有多少 +This plugin counts the total number of processes in the operating system categorized by their current state. For example, it tracks how many processes are Running, Sleeping, or Zombie. -## 监控大盘 +**Supported Platforms:** Linux, FreeBSD, OpenBSD, macOS -该插件没有单独的监控大盘,OS 的监控大盘统一放到 system 下面了 \ No newline at end of file +*Note: This plugin is NOT supported on Windows.* + +## Configuration + +In most cases, no specific configuration is required; just leave it enabled. + +```toml +# Collect OS process state distributions +[[instances]] +# No specific configuration required +``` + +## Metrics + +All metrics are prefixed with `processes_`. Key metrics include but are not limited to: + +- `processes_total`: Total number of processes in the system +- `processes_running`: Number of running processes +- `processes_sleeping`: Number of sleeping processes +- `processes_zombies`: Number of zombie processes +- `processes_stopped`: Number of stopped processes +- `processes_paging`: Number of paging processes +- `processes_dead`: Number of dead processes +- `processes_idle`: Number of idle processes +- `processes_threads`: Total number of threads in the system +- `processes_total_threads`: Same as above, total number of threads + +## Dashboards + +These metrics are part of basic host monitoring. Typically, process counts are grouped under a global **System** dashboard alongside CPU and memory. +A basic companion Dashboard focusing strictly on OS process states is also provided in this directory. 
diff --git a/inputs/processes/README_CN.md b/inputs/processes/README_CN.md new file mode 100644 index 000000000..16d1950c5 --- /dev/null +++ b/inputs/processes/README_CN.md @@ -0,0 +1,37 @@ +# Processes 采集插件 + +该插件用于统计操作系统的总体进程数量分布。例如,系统中当前处于 Running、Sleeping、Zombie 等状态的进程各有多少个。 + +**支持平台:** Linux, FreeBSD, OpenBSD, macOS + +*注意:此插件在 Windows 上不受支持。Windows 系统下相关逻辑不生效。* + +## 配置说明 + +通常无需特殊配置,保持默认启用即可。 + +```toml +# 采集系统进程状态分布 +[[instances]] +# 无特别配置项 +``` + +## 采集指标 + +采集的指标统一使用 `processes_` 作为前缀。主要指标包括但不限于: + +- `processes_total`: 系统中总的进程数量 +- `processes_running`: 处于正在运行状态的进程数 +- `processes_sleeping`: 处于睡眠状态的进程数 +- `processes_zombies`: 处于僵尸状态的进程数 +- `processes_stopped`: 被暂停的进程数 +- `processes_paging`: 处于 paging 状态的进程数 +- `processes_dead`: 处于 dead 状态的进程数 +- `processes_idle`: 处于 idle 状态的进程数 +- `processes_threads`: 系统中总的线程数 +- `processes_total_threads`: 同上,系统中总的线程数 + +## 监控大盘 + +这些指标是主机基础监控的一部分。通常,OS 的进程监控会与其他硬件指标统一放置在 **System (主机系统)** 大盘中。 +本目录下也为您提供了一个仅包含进程状态分布的基础 Dashboard。 \ No newline at end of file diff --git a/inputs/processes/dashboard.json b/inputs/processes/dashboard.json new file mode 100644 index 000000000..edd76bc3e --- /dev/null +++ b/inputs/processes/dashboard.json @@ -0,0 +1,93 @@ +{ + "title": "OS Process Statistics", + "uid": "4165ad23", + "tags": [ + "os process statistics" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Total Processes", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "processes_total", + "legendFormat": "{{agent_hostname}}", + "refId": "A" + } + ] + }, + { + "title": "Total Threads", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "processes_threads", + "legendFormat": "{{agent_hostname}}", + "refId": "A" + } + ] + }, + { + "title": "Running vs Sleeping", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 
3, + "targets": [ + { + "expr": "processes_running", + "legendFormat": "Running", + "refId": "A" + }, + { + "expr": "processes_sleeping", + "legendFormat": "Sleeping", + "refId": "B" + } + ] + }, + { + "title": "Zombie Processes", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "processes_zombies", + "legendFormat": "Zombies", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/prometheus/README.md b/inputs/prometheus/README.md index e95fe6a59..cb9955b71 100644 --- a/inputs/prometheus/README.md +++ b/inputs/prometheus/README.md @@ -1,45 +1,44 @@ -# prometheus +# Prometheus Input Plugin -prometheus 插件的作用,就是抓取 `/metrics` 接口的数据,上报给服务端。通过,各类 exporter 会暴露 `/metrics` 接口数据,越来越多的开源组件也会内置 prometheus SDK,吐出 prometheus 格式的监控数据,比如 rabbitmq 插件,其 README 中就有介绍。 +The purpose of this plugin is to generically scrape data exposed via HTTP `/metrics` endpoints (i.e., Prometheus format) and report it to the backend. +As the cloud-native ecosystem expands, an increasing number of open-source components and business applications natively embed a Prometheus SDK to expose monitoring data. You can easily scrape them using this plugin. 
-这个插件 fork 自 [telegraf/prometheus](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/prometheus),做了一些删减改造,仍然支持通过 consul 做服务发现,管理所有的目标地址,删掉了 Kubernetes 部分,Kubernetes 部分准备放到其他插件里实现。 +*(Note: Due to naming conventions and feature evolution, some of this functionality may also be handled by the `openmetrics` plugin, and their configuration logic is largely similar).* -增加了两个配置:url_label_key 和 url_label_value。为了标识监控数据是从哪个 scrape url 拉取的,会为监控数据附一个标签来标识这个 url,默认的标签 KEY 是用 instance,当然,也可以改成别的,不过不建议。url_label_value 是标签值,支持 go template 语法,如果为空,就是整个 url 的内容,也可以通过模板变量只取一部分,比如 `http://localhost:9104/metrics`,只想取 IP 和端口部分,就可以写成: +## Configuration -```ini -url_label_value = "{{.Host}}" +It supports scraping from a static list of URLs as well as dynamic scraping via Consul service discovery. + +```toml +# Scrape generic Prometheus endpoints +# interval = 15 + +[[instances]] +# Example of static URL configuration +urls = ["http://localhost:9100/metrics", "http://localhost:9104/metrics"] + +# Metric Label Extraction Control +# By default, Categraf attaches the scraped URL as the `instance` label to every metric. +# You can customize the preserved portion using `url_label_value` (supports Go Template syntax). 
+# url_label_key = "instance" +# url_label_value = "{{.Host}}" ``` -如果 HTTP scheme 部分和 `/metrics` Path 部分都想取,可以这么写: +### Advanced URL Label Configuration + +`url_label_value` supports various Go Template variables: +`{{.Scheme}}`, `{{.Host}}`, `{{.Hostname}}`, `{{.Port}}`, `{{.Path}}`, `{{.Query}}`, `{{.Fragment}}` -```ini -url_label_value = "{{.Scheme}}://{{.Host}}{{.Path}}" +If you only want the hostname and port: +```toml +url_label_value = "{{.Host}}" ``` -相关变量是用这个方法生成的,供大家参考: - -```go -func (ul *UrlLabel) GenerateLabel(u *url.URL) (string, string, error) { - if ul.LabelValue == "" { - return ul.LabelKey, u.String(), nil - } - - dict := map[string]string{ - "Scheme": u.Scheme, - "Host": u.Host, - "Hostname": u.Hostname(), - "Port": u.Port(), - "Path": u.Path, - "Query": u.RawQuery, - "Fragment": u.Fragment, - } - - var buffer bytes.Buffer - err := ul.LabelValueTpl.Execute(&buffer, dict) - if err != nil { - return "", "", err - } - - return ul.LabelKey, buffer.String(), nil -} -``` \ No newline at end of file +## Metrics + +Metric names and labels are preserved exactly as returned by the target `/metrics` endpoint (with the URL label described above attached). + +## Dashboards + +Because this plugin is a generic data scraping tool, the collected metrics depend entirely on the target service being scraped. Therefore, there is no single fixed Dashboard that fits all use cases. +When configuring panels in Grafana or Nightingale, please customize your charts based on the specific metric names of your business service. The Dashboard provided in this directory is only a minimal placeholder containing a single panel that plots the `up` metric. 
diff --git a/inputs/prometheus/README_CN.md b/inputs/prometheus/README_CN.md new file mode 100644 index 000000000..780b133d5 --- /dev/null +++ b/inputs/prometheus/README_CN.md @@ -0,0 +1,44 @@ +# Prometheus 采集插件 + +该插件的作用是通用抓取 HTTP `/metrics` 接口暴露的数据(即 Prometheus 格式数据),并将其上报给服务端。 +随着云原生生态的发展,越来越多的开源组件和业务程序原生内置了 Prometheus SDK 并暴露监控数据,您可以通过这个插件非常方便地抓取它们。 + +*(注意:由于命名冲突和功能演进,此插件的部分功能也可能由名为 `openmetrics` 的插件承载,配置逻辑基本互通)* + +## 配置说明 + +支持通过直接提供静态 URL 列表抓取,也支持通过 Consul 服务发现动态抓取。 + +```toml +# 采集通用 Prometheus 接口 +# interval = 15 + +[[instances]] +# 静态地址抓取配置示例 +urls = ["http://localhost:9100/metrics", "http://localhost:9104/metrics"] + +# 指标标签提取控制 +# 默认情况下,Categraf 会将抓取来源的 URL 地址作为 `instance` 标签附加到每条监控数据中。 +# 您可以通过 url_label_value 自定义要保留的部分 (支持 Go Template 语法) +# url_label_key = "instance" +# url_label_value = "{{.Host}}" +``` + +### URL 标签的高级配置 + +`url_label_value` 支持多种 Go Template 变量: +`{{.Scheme}}`, `{{.Host}}`, `{{.Hostname}}`, `{{.Port}}`, `{{.Path}}`, `{{.Query}}`, `{{.Fragment}}` + +如果你只想要主机名加端口: +```toml +url_label_value = "{{.Host}}" +``` + +## 采集指标 + +采集到的指标名和 Label 会 **100% 忠实保留** 目标 `/metrics` 接口返回的原始数据。 + +## 监控大盘 + +由于此插件是通用的数据抓取工具,采集的指标完全取决于它所抓取的目标服务。因此,不存在一个适用于所有情况的固定 Dashboard。 +在 Grafana 或夜莺中配置面板时,请根据您具体抓取的业务服务的指标名进行自定义绘制。本目录中提供的 Dashboard 仅是一个最简占位示例,只包含一个展示 `up` 指标的面板。 \ No newline at end of file diff --git a/inputs/prometheus/dashboard.json b/inputs/prometheus/dashboard.json new file mode 100644 index 000000000..46920af23 --- /dev/null +++ b/inputs/prometheus/dashboard.json @@ -0,0 +1,34 @@ +{ + "title": "Prometheus Generic Scraper", + "uid": "db5db5b5", + "tags": [ + "prometheus generic scraper" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Prometheus Plugin Info", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 24, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "up", + "legendFormat": "Scrape Status", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No 
newline at end of file diff --git a/inputs/rabbitmq/README.md b/inputs/rabbitmq/README.md index c9488030d..67a1d61f0 100644 --- a/inputs/rabbitmq/README.md +++ b/inputs/rabbitmq/README.md @@ -1,6 +1,6 @@ # rabbitmq -rabbitmq 监控采集插件,fork 自:[telegraf/rabbitmq](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/rabbitmq) 。不过,这个插件用处不大了,因为从 rabbitmq 3.8 版本开始,就内置了 prometheus 的支持,即,如果 rabbitmq 启用了 prometheus,可以直接暴露 metrics 接口,Categraf 从这个 metrics 接口拉取数据即可 +rabbitmq 监控采集插件,fork 自:telegraf/rabbitmq 。不过,这个插件用处不大了,因为从 rabbitmq 3.8 版本开始,就内置了 prometheus 的支持,即,如果 rabbitmq 启用了 prometheus,可以直接暴露 metrics 接口,Categraf 从这个 metrics 接口拉取数据即可 rabbitmq 启用 prometheus 插件: diff --git a/inputs/redfish/README.md b/inputs/redfish/README.md new file mode 100644 index 000000000..f1b68c86d --- /dev/null +++ b/inputs/redfish/README.md @@ -0,0 +1,58 @@ +# Redfish Input Plugin + +This plugin collects hardware sensor and status metrics from Out-of-Band (OOB) management interfaces that support the Redfish protocol (such as Dell iDRAC, HPE iLO, Lenovo XClarity, etc.). +Compared to the legacy IPMI protocol, Redfish provides richer hardware metrics formatted in JSON via modern HTTP/RESTful APIs. 
+ +## Configuration + +```toml +# Collect Redfish hardware status metrics +# interval = 60 + +[[instances]] +# Configure connection addresses, accounts, and passwords for Redfish +# [[instances.addresses]] +# url = "https://10.0.0.1" +# username = "admin" +# password = "password" +# (Redfish often uses self-signed certificates, so you may want to skip TLS verification) +# insecure_skip_verify = true + +# ================================ +# Examples for defining metric collection paths (Sets/Metrics) +# The plugin parses specific numeric metrics based on the defined URN and JSON Paths +# ================================ + +[[instances.sets]] +urn = "/redfish/v1/Chassis/System.Embedded.1/Thermal" +prefix = "thermal_" +[[instances.sets.metrics]] +name = "temperature" +path = "Temperatures.#.ReadingCelsius" +[[instances.sets.metrics.tags]] +name = "name" +path = "Temperatures.#.Name" + +[[instances.sets]] +urn = "/redfish/v1/Chassis/System.Embedded.1/Power" +prefix = "power_" +[[instances.sets.metrics]] +name = "consumed_watts" +path = "PowerControl.#.PowerConsumedWatts" +``` + +## Metrics + +The metrics gathered by this plugin are completely dynamic, determined by the `sets` and `metrics` (parsed using JSON Path) in the configuration file. Typically, we monitor: + +- **Temperatures**: `redfish_thermal_temperature` (Celsius readings of various sensors) +- **Power**: `redfish_power_consumed_watts` (Current system power consumption) +- **Fans**: Fan speeds (RPM or percentage) +- **Disks**: Health statuses of physical disks and logical volumes +- **Power Supplies**: Operational statuses of redundant power supply modules + +By default, all metrics carry labels like the Redfish request URL, and you can also use `tags` to extract name fields from the JSON as Labels (e.g., extracting `Temperatures.#.Name` as the sensor name). 
+ +## Dashboards + +A basic companion Dashboard (`dashboard.json`) is provided in this directory to quickly visualize critical hardware health indicators like server ambient temperatures and total power consumption collected via Redfish. diff --git a/inputs/redfish/README_CN.md b/inputs/redfish/README_CN.md new file mode 100644 index 000000000..b65c8c63d --- /dev/null +++ b/inputs/redfish/README_CN.md @@ -0,0 +1,58 @@ +# Redfish 采集插件 + +该插件用于采集支持 Redfish 协议的物理机带外管理(OOB)接口(如 Dell iDRAC、HPE iLO、Lenovo XClarity 等)的硬件传感器与状态指标。 +相比于传统的 IPMI 协议,Redfish 基于现代的 HTTP/RESTful API,提供更丰富的 JSON 格式硬件指标。 + +## 配置说明 + +```toml +# 采集 Redfish 硬件状态指标 +# interval = 60 + +[[instances]] +# 配置 Redfish 的连接地址、账户和密码 +# [[instances.addresses]] +# url = "https://10.0.0.1" +# username = "admin" +# password = "password" +# (由于 Redfish 通常使用自签名证书,可以忽略 TLS 校验) +# insecure_skip_verify = true + +# ================================ +# 以下为指标采集路径 (Sets/Metrics) 的配置示例 +# 插件会根据设定的 URN 和 JSON Path 解析出具体的数值型指标 +# ================================ + +[[instances.sets]] +urn = "/redfish/v1/Chassis/System.Embedded.1/Thermal" +prefix = "thermal_" +[[instances.sets.metrics]] +name = "temperature" +path = "Temperatures.#.ReadingCelsius" +[[instances.sets.metrics.tags]] +name = "name" +path = "Temperatures.#.Name" + +[[instances.sets]] +urn = "/redfish/v1/Chassis/System.Embedded.1/Power" +prefix = "power_" +[[instances.sets.metrics]] +name = "consumed_watts" +path = "PowerControl.#.PowerConsumedWatts" +``` + +## 采集指标 + +该插件的指标完全由配置文件中的 `sets` 和 `metrics` (基于 JSON Path 解析) 动态决定。通常我们会采集: + +- **温度**: `redfish_thermal_temperature` (各传感器的摄氏度) +- **功耗**: `redfish_power_consumed_watts` (系统当前功耗) +- **风扇**: 风扇转速 (RPM 或百分比) +- **磁盘**: 物理磁盘与逻辑卷的健康状态 +- **电源**: 冗余电源模块的运行状态 + +所有指标默认都会附带 Redfish 的请求地址等标签,也可以通过 `tags` 提取 JSON 中的名称字段作为 Label(例如提取 `Temperatures.#.Name` 作为传感器名字)。 + +## 监控大盘 + +本目录下提供了一个配套的基础 Dashboard (`dashboard.json`),可用于快速监控通过 Redfish 抓取到的服务器环境温度、整体功耗等关键硬件健康指标。 diff --git a/inputs/redfish/dashboard.json 
b/inputs/redfish/dashboard.json new file mode 100644 index 000000000..d0e679eaa --- /dev/null +++ b/inputs/redfish/dashboard.json @@ -0,0 +1,52 @@ +{ + "title": "Redfish Hardware Metrics", + "uid": "39980085", + "tags": [ + "redfish hardware metrics" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Redfish Temperature (\u00b0C)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "redfish_thermal_temperature", + "legendFormat": "{{name}} ({{url}})", + "refId": "A" + } + ] + }, + { + "title": "Redfish Power Consumption (Watts)", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "redfish_power_consumed_watts", + "legendFormat": "{{url}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/redis/README.md b/inputs/redis/README.md index 47a7f811c..d84d6e3e2 100644 --- a/inputs/redis/README.md +++ b/inputs/redis/README.md @@ -36,3 +36,123 @@ labels = { instance="n9e-10.23.25.3:6379" } 该 README 的同级目录下,提供了 dashboard.json 就是监控大盘的配置,alerts.json 是告警规则,可以导入夜莺使用。 +## 监控指标 (Metrics) + +以下指标由 `inputs/redis` 插件生成。所有指标默认带有 `redis_` 前缀。 + +### 基础指标 (General Metrics) + +| 指标名称 (Metric Name) | 类型 (Type) | 含义 (Description) | Labels | +| :--- | :--- | :--- | :--- | +| `scrape_use_seconds` | Gauge | 从该 Redis 实例采集数据的耗时 (秒) | `address` (or `cluster_name`, `source_node`) | +| `ping_use_seconds` | Gauge | 执行 PING 命令的耗时 (秒) | 同上 | +| `up` | Gauge | 实例存活状态。`1`: 正常 (Ping 成功), `0`: 异常 | 同上 | +| `instance_role` | Gauge | 实例角色。`1`: Master, `2`: Slave, `3`: Sentinel, `4`: Other | 同上, `replica_role` | + +### INFO 命令指标 (INFO Command Metrics) + +插件会执行 `INFO ALL` (或 `INFO`) 命令,并将结果解析为指标。 + +#### Server Section +| 指标名称 | 含义 | +| :--- | :--- | +| `uptime_in_seconds` | 运行时间 (秒) | + +#### Memory Section 
+*收集该部分的**所有**数值型字段,常见包括:* +| 指标名称 | 含义 | +| :--- | :--- | +| `used_memory` | 已分配内存总量 (Bytes) | +| `used_memory_rss` | 操作系统角度的驻留集大小 (RSS) | +| `used_memory_lua` | Lua 引擎使用的内存 | +| `maxmemory` | 配置的最大内存限制 | +| `mem_fragmentation_ratio` | 内存碎片率 | + +#### Stats Section +*收集该部分的**所有**数值型字段,常见包括:* +| 指标名称 | 含义 | +| :--- | :--- | +| `total_connections_received` | 累计接受的连接总数 | +| `total_commands_processed` | 累计处理的命令总数 | +| `instantaneous_ops_per_sec` | 当前 QPS | +| `keyspace_hits` | 键空间查找命中次数 | +| `keyspace_misses` | 键空间查找未命中次数 | +| `keyspace_hitrate` | **[计算指标]** 键空间命中率 (`hits / (hits + misses)`) | +| `rejected_connections` | 因达到最大连接数限制而拒绝的连接数 | +| `expired_keys` | 已过期的 Key 数量 | +| `evicted_keys` | 因内存限制被驱逐的 Key 数量 | + +#### Persistence Section +*收集该部分的**所有**数值型字段,常见包括:* +| 指标名称 | 含义 | +| :--- | :--- | +| `rdb_last_save_time` | 最后一次 RDB 成功保存的时间戳 | +| `rdb_last_save_time_elapsed` | **[计算指标]** 距离最后一次 RDB 保存经过的秒数 | +| `rdb_changes_since_last_save` | 上次 RDB 保存以来改变的 Key 数量 | +| `aof_enabled` | AOF 是否开启 (0: 否, 1: 是) | + +#### Clients Section +*收集该部分的**所有**数值型字段,常见包括:* +| 指标名称 | 含义 | +| :--- | :--- | +| `connected_clients` | 当前连接的客户端数量 | +| `blocked_clients` | 正在等待阻塞命令 (BLPOP 等) 的客户端数量 | + +#### Replication Section +*收集该部分的**所有**数值型字段,常见包括:* +| 指标名称 | 含义 | Labels | +| :--- | :--- | :--- | +| `connected_slaves` | 连接的从节点数量 | - | +| `master_repl_offset` | 全局复制偏移量 | - | +| `replication_lag` | 复制延迟 | - | +| `replication_*` | 从节点详情 (如 `offset`, `lag`) | `replica_id`, `replica_ip`, `replica_port` | + +#### CPU Section +*收集该部分的**所有**数值型字段,常见包括:* +| 指标名称 | 含义 | +| :--- | :--- | +| `used_cpu_sys` | Redis 服务进程消耗的系统 CPU | +| `used_cpu_user` | Redis 服务进程消耗的用户 CPU | + +#### Cluster Section +| 指标名称 | 含义 | +| :--- | :--- | +| `cluster_enabled` | 集群模式是否开启 (0/1) | + +### Keyspace 指标 + +解析 `INFO keyspace` 部分,格式如 `db0:keys=...,expires=...,avg_ttl=...` + +| 指标名称 | 含义 | Labels | +| :--- | :--- | :--- | +| `keyspace_keys` | 数据库中的 Key 总数 | `db` (e.g., "db0") | +| `keyspace_expires` | 带有过期时间的 Key 数量 | 
`db` | +| `keyspace_avg_ttl` | Key 的平均生存时间 (毫秒) | `db` | + +### Command Stats 指标 + +解析 `INFO commandstats` 部分,格式如 `cmdstat_get:calls=...,usec=...,...` + +| 指标名称 | 含义 | Labels | +| :--- | :--- | :--- | +| `cmdstat_calls` | 命令调用次数 | `command` (e.g., "get", "set") | +| `cmdstat_usec` | 命令执行总耗时 (微秒) | `command` | +| `cmdstat_usec_per_call` | 命令平均耗时 (微秒) | `command` | +| `cmdstat_rejected_calls` | 拒绝执行次数 | `command` | +| `cmdstat_failed_calls` | 执行失败次数 | `command` | + +### 慢查询日志 (Slow Log) + +仅当配置 `gather_slowlog = true` 时采集。 + +| 指标名称 | 含义 | Labels | +| :--- | :--- | :--- | +| `slow_log` | 慢查询执行耗时 (微秒) | `client_addr`, `client_name`, `log_id`, `cmd` | + +### 自定义命令 (Custom Commands) + +根据配置文件中 `commands` 列表执行的命令结果。 + +| 指标名称 | 含义 | +| :--- | :--- | +| `exec_result_*` | 自定义命令的返回值 (需可转换为 float) | diff --git a/inputs/redis/alerts.json b/inputs/redis/alerts.json index 30d50a158..46da6674f 100644 --- a/inputs/redis/alerts.json +++ b/inputs/redis/alerts.json @@ -128,7 +128,7 @@ "severity": 3, "disabled": 0, "prom_for_duration": 0, - "prom_ql": "redis_uptime_in_seconds < 60", + "prom_ql": "redis_uptime_in_seconds < 600", "prom_eval_interval": 15, "enable_stime": "00:00", "enable_etime": "23:59", diff --git a/inputs/redis_sentinel/README.md b/inputs/redis_sentinel/README.md index 30e658758..64ae2c87d 100644 --- a/inputs/redis_sentinel/README.md +++ b/inputs/redis_sentinel/README.md @@ -1,3 +1,41 @@ -# redis_sentinel +# Redis Sentinel Input Plugin -forked from [telegraf/redis_sentinel](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/redis_sentinel) +This plugin is specifically designed to collect operational and cluster topology metrics from Redis Sentinel nodes. +It is forked from `telegraf/redis_sentinel` and has been adapted and optimized for Categraf. By using this plugin, you can monitor the health and topology of backend Master and Slave nodes as seen by the Sentinels in real-time. 
+ +## Configuration + +You can configure single or multiple Sentinel nodes within an `instance`. If you configure a list of `servers`, the instance will concurrently connect to each Sentinel, providing redundancy. + +```toml +# Collect Redis Sentinel status +# interval = 15 + +[[instances]] +# List of Sentinel node addresses, formatted as "tcp://host:port" or "host:port" +servers = ["tcp://localhost:26379"] + +# (Optional) Sentinel password +# password = "secret_password" + +# TLS/SSL Configuration (if TLS is enabled) +# insecure_skip_verify = true +``` + +## Metrics + +All metrics are prefixed with `redis_sentinel_`. Depending on the data collected, they are mainly divided into two categories: + +### Basic Sentinel Metrics (`redis_sentinel_*`) +E.g., `redis_sentinel_uptime_in_seconds`, `redis_sentinel_connected_clients`, `redis_sentinel_mem_used`, etc., which reflect the Sentinel process's liveness and basic resource usage. + +### Master / Slave Status Metrics +These metrics carry labels such as `master` (the master's name) to reflect the cluster topology as seen by Sentinel: +- `redis_sentinel_master_slaves`: Number of Slaves attached to the current Master +- `redis_sentinel_master_sentinels`: Number of Sentinel nodes monitoring this Master +- `redis_sentinel_master_status`: Master status (typically "ok" maps to 1, others map to 0 or specific error codes) +- `redis_sentinel_master_failover_state`: Current state value of the failover process + +## Dashboards + +A companion Dashboard (`dashboard.json`) is provided in this directory to centrally observe the Redis Master liveness, Slave counts, and the basic operational status of the Sentinels themselves. 
diff --git a/inputs/redis_sentinel/README_CN.md b/inputs/redis_sentinel/README_CN.md new file mode 100644 index 000000000..f10572a41 --- /dev/null +++ b/inputs/redis_sentinel/README_CN.md @@ -0,0 +1,41 @@ +# Redis Sentinel 采集插件 + +该插件专门用于采集 Redis Sentinel(哨兵节点)的运行状态与集群拓扑指标。 +它 fork 自 `telegraf/redis_sentinel`,并进行了适配和优化。使用此插件,您可以实时监控 Sentinel 对后端 Master 和 Slave 节点的监控状态。 + +## 配置说明 + +支持通过 `instances` 配置单个 Sentinel 节点或多个 Sentinel 节点,如果您配置了一个包含多个 Sentinel 的 `servers` 列表,该 Instance 内部将会并发连接每个 Sentinel,从而提供一定的冗余探测。 + +```toml +# 采集 Redis Sentinel 状态 +# interval = 15 + +[[instances]] +# Sentinel 节点地址列表,格式为 "tcp://host:port" 或 "host:port" +servers = ["tcp://localhost:26379"] + +# (可选) Sentinel 密码 +# password = "secret_password" + +# TLS/SSL 配置 (如果启用了 TLS) +# insecure_skip_verify = true +``` + +## 采集指标 + +所有的指标均以 `redis_sentinel_` 作为前缀。根据采集内容不同,主要包含两类数据: + +### Sentinel 自身基础指标 (`redis_sentinel_*`) +例如 `redis_sentinel_uptime_in_seconds`, `redis_sentinel_connected_clients`, `redis_sentinel_mem_used` 等,用于反映 Sentinel 进程的存活与基础资源开销。 + +### Master / Slave 状态指标 +这些指标携带 `master` (名字) 等标签,用于反映 Sentinel 眼中的集群拓扑: +- `redis_sentinel_master_slaves`: 当前 Master 下挂载的 Slave 数量 +- `redis_sentinel_master_sentinels`: 监控该 Master 的 Sentinel 节点数 +- `redis_sentinel_master_status`: Master 状态 (通常 "ok" 映射为 1,其他映射为 0 或具体错误码) +- `redis_sentinel_master_failover_state`: 故障转移(Failover)的当前状态值 + +## 监控大盘 + +本目录下提供了一个配套的 Dashboard (`dashboard.json`),用于集中观测 Sentinel 集群监控下的 Redis Master 存活状态、Slave 挂载数量,以及 Sentinel 自身的基础运行状态。 diff --git a/inputs/redis_sentinel/dashboard.json b/inputs/redis_sentinel/dashboard.json new file mode 100644 index 000000000..bb03b6395 --- /dev/null +++ b/inputs/redis_sentinel/dashboard.json @@ -0,0 +1,88 @@ +{ + "title": "Redis Sentinel Status", + "uid": "173f588c", + "tags": [ + "redis sentinel status" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Sentinel Uptime (days)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 
8 + }, + "id": 1, + "targets": [ + { + "expr": "redis_sentinel_uptime_in_seconds / 86400", + "legendFormat": "{{server}}", + "refId": "A" + } + ] + }, + { + "title": "Sentinel Connected Clients", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "redis_sentinel_connected_clients", + "legendFormat": "{{server}}", + "refId": "A" + } + ] + }, + { + "title": "Master Monitored Sentinels", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "redis_sentinel_master_sentinels", + "legendFormat": "Master: {{master}}", + "refId": "A" + } + ] + }, + { + "title": "Master Attached Slaves", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "redis_sentinel_master_slaves", + "legendFormat": "Master: {{master}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/redis_sentinel/redis_sentinel.go b/inputs/redis_sentinel/redis_sentinel.go index 8c2c09fbd..d2988d6fa 100644 --- a/inputs/redis_sentinel/redis_sentinel.go +++ b/inputs/redis_sentinel/redis_sentinel.go @@ -467,4 +467,4 @@ func prepareFieldValues(fields map[string]string, typeMap map[string]configField } return preparedFields, nil -} +} \ No newline at end of file diff --git a/inputs/rocketmq_offset/README.md b/inputs/rocketmq_offset/README.md new file mode 100644 index 000000000..d111191c0 --- /dev/null +++ b/inputs/rocketmq_offset/README.md @@ -0,0 +1,37 @@ +# RocketMQ Offset Input Plugin + +This plugin collects metrics for message offsets, broker limits, and consumer lags (diffs) across RocketMQ Topics and Consumer Groups by querying the HTTP API of the [RocketMQ Dashboard (formerly RocketMQ Console)](https://github.com/apache/rocketmq-dashboard). 
+ +This provides a non-intrusive way of gathering metrics, particularly useful for older RocketMQ clusters that do not expose native OTLP/Prometheus endpoints. + +## Prerequisites + +You must have the RocketMQ Dashboard component deployed, and your Categraf instance must have network access to its HTTP port. + +## Configuration + +```toml +# Collect RocketMQ consumer offset and lag +# interval = 60 + +[[instances]] +# The host and port of the RocketMQ Dashboard (without http:// prefix) +rocketmq_console_ip_port = "127.0.0.1:8080" + +# (Optional) List of Topics to ignore. Useful for reducing unnecessary requests and metric cardinality. +# ignored_topics = ["RETRY_GROUP_TOPIC", "DLQ_GROUP_TOPIC"] +``` + +## Metrics + +By default, the plugin fetches the list of all Topics and then queries the consumer details for each Topic. Key metrics collected include: + +- `rocketmq_offset_diff`: The message backlog (lag) for a consumer group on a specific Broker/Queue +- `rocketmq_offset_broker_offset`: The maximum message offset at the Broker +- `rocketmq_offset_consumer_offset`: The offset up to which the Consumer has consumed + +These metrics are automatically tagged with `topic`, `consumerGroup`, `brokerName`, and `queueId` labels. + +## Dashboards + +A basic companion Dashboard (`dashboard.json`) is provided in this directory to monitor critical RocketMQ consumer lags and observe overall offset states. 
diff --git a/inputs/rocketmq_offset/README_CN.md b/inputs/rocketmq_offset/README_CN.md new file mode 100644 index 000000000..ef50d7101 --- /dev/null +++ b/inputs/rocketmq_offset/README_CN.md @@ -0,0 +1,37 @@ +# RocketMQ Offset 采集插件 + +该插件通过调用 [RocketMQ Dashboard (原 RocketMQ Console)](https://github.com/apache/rocketmq-dashboard) 的 HTTP API 来采集 RocketMQ 各个 Topic 和 Consumer Group 的消费偏移量 (Offset)、消息积压数 (Lag/Diff) 等指标。 + +这是一种非侵入式采集,适用于没有直接暴露出 OTLP/Prometheus 接口的较老版本 RocketMQ 集群。 + +## 前置要求 + +您需要预先部署好 RocketMQ Dashboard 组件,并确保 Categraf 可以访问该组件的 HTTP 端口。 + +## 配置说明 + +```toml +# 采集 RocketMQ 消费组积压与偏移量 +# interval = 60 + +[[instances]] +# RocketMQ Dashboard 的访问地址和端口 (无需带 http://) +rocketmq_console_ip_port = "127.0.0.1:8080" + +# (可选) 需要忽略采集的 Topic 列表,减少不必要的请求和指标基数 +# ignored_topics = ["RETRY_GROUP_TOPIC", "DLQ_GROUP_TOPIC"] +``` + +## 采集指标 + +该插件默认会抓取所有的 Topic 列表,然后查询每个 Topic 关联的消费组详情,核心采集指标包括: + +- `rocketmq_offset_diff`: 消费组在某个 Broker/Queue 上的消息积压量 (Lag) +- `rocketmq_offset_broker_offset`: Broker 端的最大偏移量 +- `rocketmq_offset_consumer_offset`: Consumer 端的已消费偏移量 + +这些指标将自动携带 `topic`, `consumerGroup`, `brokerName`, `queueId` 等标签。 + +## 监控大盘 + +本目录下提供了一个配套的基础 Dashboard (`dashboard.json`),可用于核心的 RocketMQ 消费积压 (Lag) 监控以及整体的偏移量情况观测。 diff --git a/inputs/rocketmq_offset/dashboard.json b/inputs/rocketmq_offset/dashboard.json new file mode 100644 index 000000000..45d2440dc --- /dev/null +++ b/inputs/rocketmq_offset/dashboard.json @@ -0,0 +1,70 @@ +{ + "title": "RocketMQ Consumer Lag Dashboard", + "uid": "5fd1862e", + "tags": [ + "rocketmq consumer lag dashboard" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "RocketMQ Consumer Lag", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "sum by (topic, consumerGroup) (rocketmq_offset_diff)", + "legendFormat": "{{topic}} - {{consumerGroup}}", + "refId": "A" + } + ] + }, + { + "title": "RocketMQ Topic Total Broker Offset", + 
"type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "sum by (topic) (rocketmq_offset_broker_offset)", + "legendFormat": "{{topic}}", + "refId": "A" + } + ] + }, + { + "title": "Consumer Lag by Broker", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 24, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "sum by (brokerName, consumerGroup) (rocketmq_offset_diff)", + "legendFormat": "{{brokerName}} - {{consumerGroup}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/self_metrics/README.md b/inputs/self_metrics/README.md new file mode 100644 index 000000000..e9e1b3b28 --- /dev/null +++ b/inputs/self_metrics/README.md @@ -0,0 +1,36 @@ +# Self Metrics Input Plugin + +This plugin collects internal operational metrics of Categraf itself. It gathers Go runtime metrics (such as Goroutine counts, GC latencies, memory allocations) as well as Categraf-specific metrics regarding the metric pushing queues. + +This is critical for monitoring the health of the monitoring Agent itself, particularly for diagnosing queue backlogs or memory leaks. + +## Configuration + +Since it is a built-in plugin gathering its own state, the configuration is extremely simple and only needs to be enabled. + +```toml +# Collect Categraf's own metrics +# interval = 15 + +# [[instances]] +# No specific configuration required +``` + +## Metrics + +All relevant metrics are prefixed with `categraf_` or Go's default `go_` / `process_`. 
Core self-monitoring metrics include: + +- `categraf_info`: Categraf version information (value is 1, carrying a `version` tag) +- `categraf_metrics_enqueue_sum`: Total number of metrics enqueued to the sending queue +- `categraf_metrics_enqueue_failed_sum`: Total number of metrics that failed to enqueue +- `categraf_current_queue_size`: Current number of pending metrics in the memory queue (if this value keeps rising, it means the pushing rate to the backend is slower than the scraping rate, or the backend is failing) +- `go_goroutines`: Current number of Goroutines +- `go_memstats_alloc_bytes`: Memory allocated by the Go runtime +- `process_cpu_seconds_total`: Total CPU time consumed by the Categraf process +- `process_resident_memory_bytes`: Resident Set Size (RSS) physical memory used by the Categraf process + +These metrics are automatically tagged with `version` and other environmental tags. + +## Dashboards + +A companion basic Dashboard (`dashboard.json`) is provided in this directory to quickly monitor the Categraf process's resident memory (RSS), Goroutine counts, metric enqueue rate, and most importantly, the **metric sending queue backlog**. 
diff --git a/inputs/self_metrics/README_CN.md b/inputs/self_metrics/README_CN.md new file mode 100644 index 000000000..6a96ef5e3 --- /dev/null +++ b/inputs/self_metrics/README_CN.md @@ -0,0 +1,36 @@ +# Self Metrics 采集插件 + +该插件用于采集 Categraf 自身的运行状态指标,包含 Go 运行时的基础指标(如 Goroutine 数量、GC 耗时、内存分配)以及 Categraf 特有的指标推送队列状态。 + +这对于监控监控客户端 (Agent) 本身的健康度至关重要,特别是判断是否存在发送队列堆积或内存泄漏。 + +## 配置说明 + +由于是内置插件采集自身状态,配置通常极其简单,只需启用即可。 + +```toml +# 采集 Categraf 自身指标 +# interval = 15 + +# [[instances]] +# 无特殊配置项 +``` + +## 采集指标 + +所有相关指标均以 `categraf_` 或 Go 默认的 `go_` / `process_` 为前缀。核心自监控指标包括: + +- `categraf_info`: Categraf 版本信息,值为 1,带有 `version` 标签 +- `categraf_metrics_enqueue_sum`: 指标入队总数 (推送到发送队列) +- `categraf_metrics_enqueue_failed_sum`: 指标入队失败总数 +- `categraf_current_queue_size`: 当前待发送指标在内存队列中的堆积量 (如果此值持续上升,说明发送到服务端的速率跟不上采集速率,或服务端出现故障) +- `go_goroutines`: 当前 Goroutine 的数量 +- `go_memstats_alloc_bytes`: Go 运行时分配的内存大小 +- `process_cpu_seconds_total`: Categraf 进程累计消耗的 CPU 时间 +- `process_resident_memory_bytes`: Categraf 进程占用的常驻物理内存大小 (RSS) + +这些指标都会自动打上 `version` 等标签。 + +## 监控大盘 + +本目录下提供了一个配套的基础 Dashboard (`dashboard.json`),用于快速监控 Categraf 自身进程的常驻内存 (RSS)、Goroutine 数量、指标入队速率,以及最重要的 **指标发送队列堆积情况**。 diff --git a/inputs/self_metrics/dashboard.json b/inputs/self_metrics/dashboard.json new file mode 100644 index 000000000..e7605324c --- /dev/null +++ b/inputs/self_metrics/dashboard.json @@ -0,0 +1,88 @@ +{ + "title": "Categraf Self Monitoring Dashboard", + "uid": "e88865e7", + "tags": [ + "categraf self monitoring dashboard" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Categraf Queue Size", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "categraf_current_queue_size", + "legendFormat": "{{agent_hostname}}", + "refId": "A" + } + ] + }, + { + "title": "Categraf Enqueue Rate", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + 
"expr": "rate(categraf_metrics_enqueue_sum[5m])", + "legendFormat": "{{agent_hostname}}", + "refId": "A" + } + ] + }, + { + "title": "Categraf Memory (RSS Bytes)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "process_resident_memory_bytes{job=~'.*categraf.*'}", + "legendFormat": "{{agent_hostname}}", + "refId": "A" + } + ] + }, + { + "title": "Categraf Goroutines", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "go_goroutines{job=~'.*categraf.*'}", + "legendFormat": "{{agent_hostname}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/smart/README.md b/inputs/smart/README.md index 8bad5c4eb..a290b51d3 100644 --- a/inputs/smart/README.md +++ b/inputs/smart/README.md @@ -1,299 +1,60 @@ -# S.M.A.R.T. 插件 +# S.M.A.R.T. Input Plugin -从[telegraf](https://github.com/influxdata/telegraf/blob/master/plugins/inputs/smart/README.md) fork,略作改动 +This plugin uses the command-line utility `smartctl` to collect S.M.A.R.T. (Self-Monitoring, Analysis and Reporting Technology) storage device health and status metrics. SMART is a monitoring system included in computer hard disk drives (HDDs) and solid-state drives (SSDs) that detects and reports on various indicators of drive reliability, with the intent of enabling the anticipation of hardware failures. -Get metrics using the command line utility `smartctl` for -S.M.A.R.T. (Self-Monitoring, Analysis and Reporting Technology) storage -devices. SMART is a monitoring system included in computer hard disk drives -(HDDs) and solid-state drives (SSDs) that detects and reports on various -indicators of drive reliability, with the intent of enabling the anticipation of -hardware failures. See smartmontools (). +This plugin is forked from `telegraf/smart` and adapted for Categraf. 
-SMART information is separated between different measurements: `smart_device` is -used for general information, while `smart_attribute` stores the detailed -attribute information if `attributes = true` is enabled in the plugin -configuration. - -If no devices are specified, the plugin will scan for SMART devices via the -following command: - -```sh -smartctl --scan -``` - -Metrics will be reported from the following `smartctl` command: - -```sh -smartctl --info --attributes --health -n --format=brief -``` - -This plugin supports _smartmontools_ version 5.41 and above, but v. 5.41 and -v. 5.42 might require setting `nocheck`, see the comment in the sample -configuration. Also, NVMe capabilities were introduced in version 6.5. - -To enable SMART on a storage device run: - -```sh -smartctl -s on -``` - -## NVMe vendor specific attributes - -For NVMe disk type, plugin can use command line utility `nvme-cli`. It has a -feature to easy access a vendor specific attributes. This plugin supports -nmve-cli version 1.5 and above (). In -case of `nvme-cli` absence NVMe vendor specific metrics will not be obtained. - -Vendor specific SMART metrics for NVMe disks may be reported from the following -`nvme` command: - -```sh -nvme smart-log-add -``` - -Note that vendor plugins for `nvme-cli` could require different naming -convention and report format. - -To see installed plugin extensions, depended on the nvme-cli version, look at -the bottom of: - -```sh -nvme help -``` - -To gather disk vendor id (vid) `id-ctrl` could be used: - -```sh -nvme id-ctrl -``` - -Association between a vid and company can be found there: -. - -Devices affiliation to being NVMe or non NVMe will be determined thanks to: - -```sh -smartctl --scan -``` - -and: - -```sh -smartctl --scan -d nvme -``` +## Prerequisites +- `smartmontools` (which includes the `smartctl` utility) must be installed on your system. 
+ - Ubuntu/Debian: `sudo apt-get install smartmontools` + - CentOS/RHEL: `sudo yum install smartmontools` +- The user running Categraf generally requires `root` privileges to read disk SMART information. If you prefer to run Categraf as a non-root user, you can configure `sudo` for passwordless execution of `smartctl` and set `use_sudo = true` in the configuration. ## Configuration -```toml @示例 -# Read metrics from storage devices supporting S.M.A.R.T. -[[instances]] -## Optionally specify the path to the smartctl executable -# path_smartctl = "/usr/bin/smartctl" +```toml +# Collect S.M.A.R.T. hardware status +# interval = 60 -## Optionally specify the path to the nvme-cli executable -# path_nvme = "/usr/bin/nvme" +[[instances]] +# Optionally use sudo to execute smartctl +# use_sudo = false -## Optionally specify if vendor specific attributes should be propagated for NVMe disk case -## ["auto-on"] - automatically find and enable additional vendor specific disk info -## ["vendor1", "vendor2", ...] - e.g. "Intel" enable additional Intel specific disk info -# enable_extensions = ["auto-on"] +# (Optional) Specify the path to smartctl if it's not in the environment PATH +# path_smartctl = "/usr/sbin/smartctl" -## On most platforms used cli utilities requires root access. -## Setting 'use_sudo' to true will make use of sudo to run smartctl or nvme-cli. -## Sudo must be configured to allow the categraf user to run smartctl or nvme-cli -## Sudo must be configured to allow the categraf user to run smartctl or nvme-cli -## without a password. -use_sudo = true +# (Optional) List of specific devices to monitor. +# If omitted (left empty), the plugin will automatically discover all drives using `smartctl --scan`. +# devices = [ "/dev/sda", "/dev/nvme0n1" ] -## Skip checking disks in this power mode. Defaults to -## "standby" to not wake up disks that have stopped rotating. -## See --nocheck in the man pages for smartctl. 
-## smartctl version 5.41 and 5.42 have faulty detection of -## power mode and might require changing this value to -## "never" depending on your disks. -# nocheck = "standby" +# Command timeout +# timeout = "5s" -## Gather all returned S.M.A.R.T. attribute metrics and the detailed -## information from each drive into the 'smart_attribute' measurement. +# Whether to collect detailed SMART attributes (generates more granular metrics) attributes = true - -## Optionally specify devices to exclude from reporting if disks auto-discovery is performed. -# excludes = [ "/dev/pass6" ] - -## Optionally specify devices and device type, if unset -## a scan (smartctl --scan and smartctl --scan -d nvme) for S.M.A.R.T. devices will be done -## and all found will be included except for the excluded in excludes. -# devices = [ "/dev/ada0 -d atacam", "/dev/nvme0"] -# devices = ["dev/nvme0 -d nvme", "/dev/nvme0"] - -## Timeout for the cli command to complete. -timeout = "30s" - -## Optionally call smartctl and nvme-cli with a specific concurrency policy. -## By default, smartctl and nvme-cli are called in separate threads (goroutines) to gather disk attributes. -## Some devices (e.g. disks in RAID arrays) may have access limitations that require sequential reading of -## SMART data - one individual array drive at the time. In such case please set this configuration option -## to "sequential" to get readings for all drives. 
-## valid options: concurrent, sequential -# read_method = "concurrent" ``` -## Permissions -采集需要sudo权限 - ## Metrics -- smart_device: - - tags: - - capacity - - device - - enabled - - model - - serial_no - - wwn - - fields: - - exit_status - - health_ok - - media_wearout_indicator - - percent_lifetime_remain - - read_error_rate - - seek_error - - temp_c - - udma_crc_errors - - wear_leveling_count - -- smart_attribute: - - tags: - - capacity - - device - - enabled - - fail - - flags - - id - - model - - name - - serial_no - - wwn - - fields: - - exit_status - - threshold - - value - - worst - - critical_warning - - temperature_celsius - - available_spare - - available_spare_threshold - - percentage_used - - data_units_read - - data_units_written - - host_read_commands - - host_write_commands - - controller_busy_time - - power_cycle_count - - power_on_hours - - unsafe_shutdowns - - media_and_data_integrity_errors - - error_information_log_entries - - warning_temperature_time - - critical_temperature_time - - program_fail_count - - erase_fail_count - - wear_leveling_count - - end_to_end_error_detection_count - - crc_error_count - - media_wear_percentage - - host_reads - - timed_workload_timer - - thermal_throttle_status - - retry_buffer_overflow_count - - pll_lock_loss_count - -### Flags +The collected metrics are separated into two main prefixes (depending on whether `attributes` is enabled): -The interpretation of the tag `flags` is: +### smart_device (General Device Metrics) +- `smart_device_health_ok`: Disk health status, 1 for healthy (PASSED), 0 for failure +- `smart_device_temp_c`: Current disk temperature (in Celsius) +- `smart_device_power_on_hours`: Total power-on hours +- `smart_device_power_cycle_count`: Power cycle count +- ... 
-- `K` auto-keep -- `C` event count -- `R` error rate -- `S` speed/performance -- `O` updated online -- `P` prefailure warning +### smart_attribute (Detailed Attribute Metrics) +If `attributes = true` is enabled, the plugin generates the following metrics for every specific SMART Attribute (e.g., Raw_Read_Error_Rate, Reallocated_Sector_Ct, etc.): +- `smart_attribute_value`: Current normalized value +- `smart_attribute_worst`: Worst recorded value +- `smart_attribute_threshold`: The failure threshold +- `smart_attribute_raw_value`: The raw sensor value (usually the most diagnostic) -### Exit Status +All metrics are tagged with `device` (e.g., `/dev/sda`) and the specific `serial_no` of the drive. -The `exit_status` field captures the exit status of the used cli utilities -command which is defined by a bitmask. For the interpretation of the bitmask see -the man page for smartctl or nvme-cli. - -## Device Names - -Device names, e.g., `/dev/sda`, are _not persistent_, and may be -subject to change across reboots or system changes. Instead, you can use the -_World Wide Name_ (WWN) or serial number to identify devices. On Linux block -devices can be referenced by the WWN in the following location: -`/dev/disk/by-id/`. - -## Troubleshooting - -If you expect to see more SMART metrics than this plugin shows, be sure to use a -proper version of smartctl or nvme-cli utility which has the functionality to -gather desired data. Also, check your device capability because not every SMART -metrics are mandatory. For example the number of temperature sensors depends on -the device specification. 
- -If this plugin is not working as expected for your SMART enabled device, -please run these commands and include the output in a bug report: - -For non NVMe devices (from smartctl version >= 7.0 this will also return NVMe -devices by default): - -```sh -smartctl --scan -``` +## Dashboards -For NVMe devices: - -```sh -smartctl --scan -d nvme -``` - -Run the following command replacing your configuration setting for NOCHECK and -the DEVICE (name of the device could be taken from the previous command): - -```sh -smartctl --info --health --attributes --tolerance=verypermissive --nocheck NOCHECK --format=brief -d DEVICE -``` - -If you try to gather vendor specific metrics, please provide this command -and replace vendor and device to match your case: - -```sh -nvme VENDOR smart-log-add DEVICE -``` - -If you have specified devices array in configuration file, and categraf only -shows data from one device, you should change the plugin configuration to -sequentially gather disk attributes instead of collecting it in separate threads -(goroutines). To do this find in plugin configuration read_method and change it -to sequential: - -```toml - ## Optionally call smartctl and nvme-cli with a specific concurrency policy. - ## By default, smartctl and nvme-cli are called in separate threads (goroutines) to gather disk attributes. - ## Some devices (e.g. disks in RAID arrays) may have access limitations that require sequential reading of - ## SMART data - one individual array drive at the time. In such case please set this configuration option - ## to "sequential" to get readings for all drives. 
- ## valid options: concurrent, sequential - read_method = "sequential" -``` - -## Example Output - -```text -smart_device_health_ok agent_hostname=1.2.3.4 device=nvme0 model=INTEL_SSDPE2KX040T8 serial_no=PHLJ830200CH4P0DGN 1 -smart_device_temp_c agent_hostname=1.2.3.4 device=nvme0 model=INTEL_SSDPE2KX040T8 serial_no=PHLJ830200CH4P0DGN 53 -smart_attribute_program_fail_count agent_hostname=1.2.3.4 device=nvme0 model= name=Program_Fail_Count serial_no=PHLJ830200CH4P0DGN 0 -smart_attribute_erase_fail_count agent_hostname=1.2.3.4 device=nvme0 model= name=Erase_Fail_Count serial_no=PHLJ830200CH4P0DGN 0 -smart_attribute_wear_leveling_count agent_hostname=1.2.3.4 device=nvme0 model= name=Wear_Leveling_Count serial_no=PHLJ830200CH4P0DGN 34360328200 -``` +A basic companion Dashboard (`dashboard.json`) is provided in this directory to monitor the overall health status (Health PASSED/FAILED), temperature distribution, and power-on hours of your server disks. diff --git a/inputs/smart/README_CN.md b/inputs/smart/README_CN.md new file mode 100644 index 000000000..b467d682b --- /dev/null +++ b/inputs/smart/README_CN.md @@ -0,0 +1,60 @@ +# S.M.A.R.T. 采集插件 + +该插件通过命令行工具 `smartctl` 来采集 S.M.A.R.T. (Self-Monitoring, Analysis and Reporting Technology) 存储设备的硬件状态和健康指标。S.M.A.R.T. 是集成在硬盘(HDD)和固态硬盘(SSD)中的监控系统,用于检测并报告各种驱动器可靠性指标,旨在预测硬件故障。 + +该插件从 `telegraf/smart` fork 而来,并针对 Categraf 进行了适配和优化。 + +## 前置要求 + +- 操作系统中必须安装有 `smartmontools` (包含 `smartctl` 命令行工具)。 + - Ubuntu/Debian: `sudo apt-get install smartmontools` + - CentOS/RHEL: `sudo yum install smartmontools` +- 运行 Categraf 的用户通常需要 `root` 权限才能读取磁盘 S.M.A.R.T. 信息。如果你希望以非 root 用户运行,可以通过配置 `sudo` 免密执行 `smartctl`,并在配置中开启 `use_sudo = true`。 + +## 配置说明 + +```toml +# 采集 S.M.A.R.T. 
硬件状态指标 +# interval = 60 + +[[instances]] +# 是否使用 sudo 执行 smartctl +# use_sudo = false + +# (可选) 指定 smartctl 可执行文件的路径 +# path_smartctl = "/usr/sbin/smartctl" + +# (可选) 要采集监控的特定设备列表。 +# 如果不指定(留空),插件会默认执行 `smartctl --scan` 自动发现系统上的所有磁盘。 +# devices = [ "/dev/sda", "/dev/nvme0n1" ] + +# 采集超时时间 +# timeout = "5s" + +# 是否采集 SMART 的底层具体 Attribute 详情(会生成更多详细指标) +attributes = true +``` + +## 采集指标 + +SMART 采集的指标被拆分为两个主要前缀(这取决于你是否开启了 `attributes`): + +### smart_device (设备通用概览指标) +- `smart_device_health_ok`: 磁盘健康度,1 为健康 (PASSED),0 为异常 +- `smart_device_temp_c`: 磁盘当前温度(摄氏度) +- `smart_device_power_on_hours`: 通电小时数 +- `smart_device_power_cycle_count`: 电源循环(开关机)次数 +- ... + +### smart_attribute (详细 Attribute 指标) +如果开启了 `attributes = true`,则会为每一项特定的 SMART Attribute(如 Raw_Read_Error_Rate, Reallocated_Sector_Ct 等)生成以下指标: +- `smart_attribute_value`: 当前标准化数值 +- `smart_attribute_worst`: 历史最差数值 +- `smart_attribute_threshold`: 故障阈值 +- `smart_attribute_raw_value`: 传感器原始记录数值 (通常是最具诊断意义的) + +所有指标都会打上 `device` (如 `/dev/sda`) 以及具体的 `serial_no` (硬盘序列号) 标签。 + +## 监控大盘 + +本目录下提供了一个配套的 Dashboard (`dashboard.json`),用于监控服务器磁盘的整体健康度 (Health PASSED/FAILED)、温度分布以及累计通电时间。 diff --git a/inputs/smart/dashboard.json b/inputs/smart/dashboard.json new file mode 100644 index 000000000..7ea9a4b85 --- /dev/null +++ b/inputs/smart/dashboard.json @@ -0,0 +1,88 @@ +{ + "title": "S.M.A.R.T. Disk Health", + "uid": "87f72283", + "tags": [ + "s.m.a.r.t. 
disk health" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Disk Health Status (1=OK)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "smart_device_health_ok", + "legendFormat": "{{device}} ({{serial_no}})", + "refId": "A" + } + ] + }, + { + "title": "Disk Temperature (\u00b0C)", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "smart_device_temp_c", + "legendFormat": "{{device}} ({{serial_no}})", + "refId": "A" + } + ] + }, + { + "title": "Power-On Hours", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "smart_device_power_on_hours", + "legendFormat": "{{device}} ({{serial_no}})", + "refId": "A" + } + ] + }, + { + "title": "Power Cycle Count", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "smart_device_power_cycle_count", + "legendFormat": "{{device}} ({{serial_no}})", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/snmp/README.md b/inputs/snmp/README.md index 1cc4f297b..a0e3adb62 100644 --- a/inputs/snmp/README.md +++ b/inputs/snmp/README.md @@ -1,20 +1,43 @@ -forked from [telegraf/snmp](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/snmp) +# SNMP Input Plugin -目前只修改了netsnmp的部分 ,配置中为了兼容,保留了path参数。 +This plugin actively polls monitoring metrics from network devices (e.g., switches, routers, firewalls) that support the SNMP protocol. +It is forked from [telegraf/snmp](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/snmp) and has been adapted and optimized for Categraf's underlying logic (like the netsnmp integration). 
-snmp_up代表设备是否存活,1 存活 0不存活,依赖ICMP +## Configuration + +You can flexibly collect scalar fields (`field`) or tabular data (`table`) by configuring the respective OIDs. + +```toml +# Collect SNMP monitoring data +# interval = 60 -配置示例 -``` [[instances]] +# SNMP Agent addresses agents = ["udp://172.30.15.189:161"] +# SNMP Timeout and Retries timeout = "5s" +retries = 1 + +# SNMP Version, supports 1, 2, 3 version = 2 community = "public" + +# (SNMP v3 Configurations, required if version=3) +# sec_name = "" +# sec_level = "authPriv" +# context_name = "" +# auth_protocol = "MD5" +# auth_password = "" +# priv_protocol = "DES" +# priv_password = "" + +# Automatically inject the target agent's IP into a specific tag agent_host_tag = "ident" -retries = 1 +# ================================ +# Scalar Fields Configuration +# ================================ [[instances.field]] oid = "RFC1213-MIB::sysUpTime.0" name = "uptime" @@ -22,16 +45,32 @@ name = "uptime" [[instances.field]] oid = "RFC1213-MIB::sysName.0" name = "source" -is_tag = true +is_tag = true # Extract this field as a Tag instead of a numeric metric +# ================================ +# Tables Configuration +# ================================ [[instances.table]] oid = "IF-MIB::ifTable" name = "interface" +# Inherit specified Tags from outer fields into all rows of the table inherit_tags = ["source"] [[instances.table.field]] oid = "IF-MIB::ifDescr" name = "ifDescr" is_tag = true +``` + +## Metrics + +The names of the collected metrics and tags are entirely determined by the `name` parameters you define in the `field` and `table` sections of the configuration file. 
+Common network metrics collected typically include: +- `uptime`: Device uptime +- `interface_ifInOctets` / `interface_ifOutOctets`: Port inbound/outbound traffic +- `interface_ifInErrors` / `interface_ifOutErrors`: Port inbound/outbound errors + +## Dashboards -``` \ No newline at end of file +Because the SNMP metrics are entirely driven by your custom OID configurations, there is no one-size-fits-all Dashboard. +A basic universal Dashboard is provided in this directory targeted at the classic network interfaces (IF-MIB) shown in the configuration example, mainly used for monitoring port traffic and error packets. diff --git a/inputs/snmp/README_CN.md b/inputs/snmp/README_CN.md new file mode 100644 index 000000000..030149180 --- /dev/null +++ b/inputs/snmp/README_CN.md @@ -0,0 +1,76 @@ +# SNMP 采集插件 + +该插件用于主动拉取支持 SNMP 协议的网络设备(如交换机、路由器、防火墙等)的监控指标。 +它从 [telegraf/snmp](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/snmp) fork 而来,并针对 Categraf 的底层逻辑(如 netsnmp 的集成)做出了适配与优化。 + +## 配置说明 + +通过配置 OID,可以灵活采集标量字段(`field`)或表格类(`table`)数据。 + +```toml +# 采集 SNMP 监控数据 +# interval = 60 + +[[instances]] +# SNMP Agent 地址 +agents = ["udp://172.30.15.189:161"] + +# SNMP 超时与重试 +timeout = "5s" +retries = 1 + +# SNMP 版本,支持 1, 2, 3 +version = 2 +community = "public" + +# (SNMP v3 相关配置,若 version=3 时填写) +# sec_name = "" +# sec_level = "authPriv" +# context_name = "" +# auth_protocol = "MD5" +# auth_password = "" +# priv_protocol = "DES" +# priv_password = "" + +# 自动将目标 agent 的 IP 注入到指定标签中 +agent_host_tag = "ident" + +# ================================ +# 标量字段 (Scalar Fields) 配置 +# ================================ +[[instances.field]] +oid = "RFC1213-MIB::sysUpTime.0" +name = "uptime" + +[[instances.field]] +oid = "RFC1213-MIB::sysName.0" +name = "source" +is_tag = true # 将该字段作为 Tag 提取,而不是数值指标 + +# ================================ +# 表格 (Tables) 配置 +# ================================ +[[instances.table]] +oid = "IF-MIB::ifTable" +name = "interface" +# 从外层字段中继承指定的 Tag 
到表内的所有行中 +inherit_tags = ["source"] + +[[instances.table.field]] +oid = "IF-MIB::ifDescr" +name = "ifDescr" +is_tag = true +``` + +## 采集指标 + +所有采集的指标名和标签完全由你在配置文件中的 `field` 和 `table` 中的 `name` 参数决定。 +通常采集的通用网络指标包括: +- `uptime`: 设备运行时间 +- `interface_ifInOctets` / `interface_ifOutOctets`: 端口进出流量 +- `interface_ifInErrors` / `interface_ifOutErrors`: 端口进出方向错误包数 + +## 监控大盘 + +由于 SNMP 采集内容由您的自定义 OID 配置完全决定,因此不存在固定普适的 Dashboard。 +本目录下为您提供了一个针对上述配置中经典网络接口 (IF-MIB) 指标的通用基础面板,主要用于监控端口流量和错误包。 \ No newline at end of file diff --git a/inputs/snmp/dashboard.json b/inputs/snmp/dashboard.json new file mode 100644 index 000000000..77eb2e181 --- /dev/null +++ b/inputs/snmp/dashboard.json @@ -0,0 +1,88 @@ +{ + "title": "SNMP Base Network Interfaces", + "uid": "59840654", + "tags": [ + "snmp base network interfaces" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "SNMP Device Uptime (days)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "uptime / 8640000", + "legendFormat": "{{ident}}", + "refId": "A" + } + ] + }, + { + "title": "Interface Inbound Traffic (bps)", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "rate(interface_ifInOctets[5m]) * 8", + "legendFormat": "{{ifDescr}} @ {{ident}}", + "refId": "A" + } + ] + }, + { + "title": "Interface Outbound Traffic (bps)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "rate(interface_ifOutOctets[5m]) * 8", + "legendFormat": "{{ifDescr}} @ {{ident}}", + "refId": "A" + } + ] + }, + { + "title": "Interface Errors", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "rate(interface_ifInErrors[5m]) + rate(interface_ifOutErrors[5m])", + "legendFormat": "{{ifDescr}} @ {{ident}}", + "refId": "A" + } + ] + } + ], + "refresh": 
"1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/snmp_trap/README.md b/inputs/snmp_trap/README.md index 4ad821519..36b07d5bf 100644 --- a/inputs/snmp_trap/README.md +++ b/inputs/snmp_trap/README.md @@ -89,17 +89,17 @@ details. ## Metric Aggregation and Mapping ## ----------------------------------------------------------------------- + ## Global Fields To Labels (Optional) + ## Promotes translated varbinds matching these names into labels for the core metric + ## and all dispersed metrics. + # fields_to_labels = ["ifIndex", "ifAdminStatus", "ifOperStatus"] + ## Global Varbind Mapping (Optional) ## Replaces or renames varbinds matching the numeric OID prefix. # [instances.varbind_mapping] # ".1.3.6.1.2.1.2.2.1.1" = "ifIndex" # ".1.3.6.1.2.1.2.2.1.7" = "ifAdminStatus" - ## Global Fields To Labels (Optional) - ## Promotes translated varbinds matching these names into labels for the core metric - ## and all dispersed metrics. - # fields_to_labels = ["ifIndex", "ifAdminStatus", "ifOperStatus"] - ## Trap Specific Mappings (Optional) ## Defines rules for specific traps. 
Takes precedence over global ## configurations for matched varbinds; unmatched varbinds still diff --git a/inputs/snmp_trap/README_CN.md b/inputs/snmp_trap/README_CN.md new file mode 100644 index 000000000..ddbe6c3b0 --- /dev/null +++ b/inputs/snmp_trap/README_CN.md @@ -0,0 +1,72 @@ +# SNMP Trap 采集插件 + +该插件用于接收网络设备主动发出的 SNMP Notifications (包括 Traps 和 Inform 请求)。 +通过监听 UDP 端口(默认 162),Categraf 可以接收设备的告警事件,并将其解析转换为时序监控指标。 + +本插件 fork 自 `telegraf/snmp_trap`,并在其基础上增强了 MIB 解析和指标名称的重映射能力。 + +## 前置要求 + +在 Linux 上监听 1024 以下的特权端口(如 162),通常需要 `root` 权限。为了遵循最小权限原则,建议不要直接使用 root 运行,而是通过 `setcap` 赋予 Categraf 二进制文件网络绑定特权: + +```shell +setcap cap_net_bind_service=+ep /usr/bin/categraf +``` + +## 配置说明 + +```toml +# 接收 SNMP Traps +[[instances]] +# 监听的传输协议、本地地址和端口。留空 IP 表示监听所有网卡。 +# service_address = "udp://:162" + +# 指定 MIB 文件的加载路径,供 gosmi 引擎进行 OID 翻译使用。 +# path = ["/usr/share/snmp/mibs"] + +# 使用的解析翻译引擎,推荐使用默认的 "gosmi" +# translator = "gosmi" + +# (SNMPv3 配置,如果在设备侧配置了 V3 trap 转发,则需在此配置对应的凭证) +# sec_name = "myuser" +# auth_protocol = "MD5" +# auth_password = "pass" +# sec_level = "authNoPriv" + +# ======================================================================= +# 指标聚合与映射配置 (Metric Aggregation and Mapping) +# ======================================================================= + +# 1. 全局字段转标签 (fields_to_labels) +# 如果 Trap 中带有的 Varbind 变量名匹配这些名称,它们将自动从指标值转换为 Labels。 +# fields_to_labels = ["ifIndex", "ifAdminStatus", "ifOperStatus"] + +# 2. 全局 Varbind OID 重命名映射 (varbind_mapping) +# [instances.varbind_mapping] +# ".1.3.6.1.2.1.2.2.1.1" = "ifIndex" + +# 3. 
针对特定 Trap 的映射规则 (trap_mapping) +# 可以针对特定的 Trap OID 设置核心指标名 (name) 及主值 (value) 的提取。 +# [[instances.trap_mapping]] +# oid = ".1.3.6.1.6.3.1.1.5.3" +# name = "link_down" +# value = ".1.3.6.1.2.1.1.3" # 将 sysUpTime 提取为主指标的值 +``` + +## 采集指标 + +SNMP Trap 默认输出的核心指标名为 `snmp_trap`(或通过 `trap_mapping` 映射的其他名字如 `snmp_trap_link_down`)。 + +该指标将携带以下核心标签: +- `source`: 发送 Trap 的源 IP +- `name`: 翻译后的 Trap 名称 (如 `linkDown`) +- `oid`: Trap 的 OID +- `mib`: 所在的 MIB 模块名 +- `version`: Trap 版本 ("1", "2c" 或 "3") + +**关于字段处理:** +Trap 报文内附带的各个 Varbind 将作为这个指标的 fields 或被解析提取为 Tags(如果配置了 `fields_to_labels`)。 + +## 监控大盘 + +本目录下提供了一个配套的基础 Dashboard (`dashboard.json`),用于监控 SNMP Trap 的接收总量、Trap 名称分布以及来源 IP 的事件分布情况。 diff --git a/inputs/snmp_trap/dashboard.json b/inputs/snmp_trap/dashboard.json new file mode 100644 index 000000000..e684be1e6 --- /dev/null +++ b/inputs/snmp_trap/dashboard.json @@ -0,0 +1,70 @@ +{ + "title": "SNMP Trap Event Dashboard", + "uid": "9571fd7d", + "tags": [ + "snmp trap event dashboard" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "SNMP Traps Received", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "sum(snmp_trap)", + "legendFormat": "Total Traps", + "refId": "A" + } + ] + }, + { + "title": "Traps by Name / Event", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "sum by (name) (snmp_trap)", + "legendFormat": "{{name}}", + "refId": "A" + } + ] + }, + { + "title": "Traps by Source IP", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 24, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "sum by (source) (snmp_trap)", + "legendFormat": "{{source}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/snmp_zabbix/README.md b/inputs/snmp_zabbix/README.md index 
b49e0b997..0a990d851 100644 --- a/inputs/snmp_zabbix/README.md +++ b/inputs/snmp_zabbix/README.md @@ -1,1514 +1,49 @@ -## 一、插件概述 -### 1.1 插件功能介绍 -snmp_zabbix 是一款兼容Zabbix采集模板的SNMP 数据采集插件,其最大特色是能够直接使用 Zabbix 的 YAML 格式模板文件。这意味着您可以利用 Zabbix 丰富的模板生态系统,无需重新编写监控配置。 -主要功能包括: +# SNMP Zabbix Input Plugin - - 完整的 SNMP 协议支持:支持 SNMPv1、v2c、v3 所有版本 - - Zabbix 模板兼容:直接使用 Zabbix 6.0+ 的 YAML 格式模板 - - 自动发现机制:自动发现网络接口、文件系统等资源并动态创建监控项 - - 强大的预处理:支持正则表达式、JavaScript、数值计算等多种数据预处理方式 - - 精细化的调度:支持item粒度调度采集任务 - - 健康检查与自动恢复:自动检测连接状态并重连 +The `snmp_zabbix` plugin is an advanced SNMP data collection plugin that is fully compatible with Zabbix monitoring templates. Its killer feature is the ability to directly parse and execute Zabbix YAML template files. This means you can leverage the rich ecosystem of Zabbix templates without rewriting your monitoring configurations from scratch. -### 1.2 与snmp 插件的区别 -|特性|SNMP_Zabbix 插件| SNMP 插件| -|--------|------|-------| -|配置方式|Zabbix 模板 + 简单配置| 手动配置每个 OID| -|自动发现|支持 LLD(低级别发现)|需手动配置| -|预处理 |支持 20+ 种预处理方式|基本数值转换| -|模板复用|可直接使用 Zabbix 模板库|需从零开始| -|配置复杂度|低(使用现成模板)|高(逐个配置) -|动态监控项|支持(通过发现规则)|不支持| +This is highly recommended for users migrating from Zabbix to Categraf, or for monitoring a massive array of diverse network devices (Switches, Routers, Firewalls) using existing community templates. -### 1.3 适用场景 -从 Zabbix 迁移到 Categraf,希望复用现有监控模板 -需要监控大量网络设备(交换机、路由器、防火墙等) -需要动态发现和监控变化的资源(如网络接口) -需要对采集数据进行复杂预处理 -希望利用 Zabbix 社区丰富的模板资源 +## Key Features -### 1.4 系统要求和依赖 -- Categraf 版本:开源版 >= v0.4.24 企业版 >= v0.4.40 -- 网络要求:能够访问目标 SNMP 设备的 UDP (默认161) 端口 -- 目标设备:启用 SNMP 服务的网络设备或服务器 -- 模板要求:Zabbix 6.0+ 及以上的YAML 格式模板(不支持旧版 XML 格式) +- **Zabbix Template Compatibility:** Directly uses Zabbix 6.0+ YAML format templates. +- **Low-Level Discovery (LLD):** Automatically discovers network interfaces, file systems, and other resources to dynamically create monitoring items. 
+- **Advanced Preprocessing:** Supports 20+ preprocessing steps including Regex, JavaScript, Custom multipliers, etc. +- **Granular Scheduling:** Supports item-level scheduling tasks. +- **Full SNMP Protocol Support:** Supports SNMPv1, v2c, and v3. -## 二、Zabbix 模板获取与管理 -### 2.1 什么是 Zabbix 模板 -Zabbix 模板是预定义的监控配置集合,包含了监控项、发现规则、触发器等配置。每个模板针对特定类型的设备或服务,如 "Cisco Switch"、"Linux SNMP" 等。 +## Configuration -### 2.2 获取模板的方式 -#### 2.2.1 从 Zabbix Web 界面导出 -- 步骤 1:登录 Zabbix Web 界面 -比如,https://your-zabbix-server/ - -- 步骤2: 导航到模板页面 -数据采集(Data Collection) -> 模板(Templates) - -- 步骤 3:选择要导出的模板 -勾选需要导出的模板(可多选),点击底部"导出(Export)"按钮 -- 步骤 4:选择导出格式 -重要:选择 "YAML" 格式(Zabbix 6.0+),如果只有 XML 选项,说明 Zabbix 版本过低 -- 步骤 5:保存文件 -文件将自动下载,默认名称如:zbx_export_templates.yaml - -#### 2.2.2 使用 Zabbix API 导出 -方法 1:使用 curl 命令 -``` -# 1. 获取认证 token -# 7.0 以上版本, 请参考https://flashcat.cloud/blog/zabbix-to-flashcat/ 如何申请api token -# 以下接口均在7.2版本上验证, 7.0 以下版本接口和参数可能有些差异,请自行查询zabbix官网文档 - -# 2. 获取模板 ID -curl -s -X POST \ - -H 'Content-Type: application/json' \ - -H "Authorization: Bearer $TOKEN" \ - -d "{ - \"jsonrpc\": \"2.0\", - \"method\": \"template.get\", - \"params\": { - \"output\": [\"templateid\", \"name\"], - \"filter\": { - \"name\": [\"Template Net Cisco IOS SNMPv2\"] - } - }, - \"id\": 2 - }" \ - http://your-zabbix-server/api_jsonrpc.php | jq . - -# 3. 
导出模板(获取到 templateid 后) -curl -s -X POST \ - -H 'Content-Type: application/json' \ - -H "Authorization: Bearer $TOKEN" \ - -d "{ - \"jsonrpc\": \"2.0\", - \"method\": \"configuration.export\", - \"params\": { - \"format\": \"yaml\", - \"options\": { - \"templates\": [\"10255\"] - } - }, - \"id\": 3 - }" \ - http://your-zabbix-server/api_jsonrpc.php | jq -r .result > template_cisco.yaml -``` - -#### 2.2.3 使用官方/社区模板 - 访问 Zabbix Git 仓库: -```bash -# 克隆整个模板仓库,从仓库中拷贝相关 yaml 文件 -git clone git@github.com:zabbix/zabbix.git - -# 或者直接下载特定模板 -wget https://github.com/zabbix/zabbix/blob/master/templates/net/cisco/cisco_snmp/template_net_cisco_snmp.yaml -``` -常用网络设备模板推荐: - -|设备类型|模板名称|文件路径| -|--------|------|-------| -|Cisco 交换机| Cisco IOS by SNMP|templates/net/cisco/cisco_snmp/template_net_cisco_snmp.yaml| -|Huawei 交换机|Huawei VRP by SNMP|templates/net/huawei_snmp/template_net_huawei_snmp.yaml| -|HP 交换机|HP Enterprise Switch by SNMP|templates/net/hp_hpn_snmp/template_net_hp_hpn_snmp.yaml| -|Linux 服务器|Linux by SNMP|templates/os/linux_snmp_snmp/template_os_linux_snmp_snmp.yaml| -|Windows 服务器|Windows by SNMP|templates/os/windows_snmp/template_os_windows_snmp.yaml| -|通用网络设备|Network Generic Device by SNMP|templates/net/generic_snmp/template_net_generic_snmp.yaml| - -这部分模板已经放到https://github.com/flashcatcloud/categraf/blob/master/conf/zbx_templates -### 2.3 模板文件格式转换(XML to YAML) - -如果只有 XML 格式的模板,需要进行转换: - -1. 把 XML 模板导入 Zabbix -2. 
重新导出为 YAML 格式的模板 - - -## 三、Zabbix 模板结构详解 -### 3.1 模板基本结构 -一个典型的 Zabbix YAML 模板结构如下: -``` -zabbix_export: - version: '7.0' - date: '2024-01-15T10:00:00Z' - templates: - - template: Template Net Example Device - name: Template Net Example Device - description: Template for monitoring example network device - groups: - - name: Templates/Network devices - items: # 监控项 - - name: Interface {#IFNAME} incoming traffic - type: SNMP_AGENT - snmp_oid: .1.3.6.1.2.1.2.2.1.10.{#SNMPINDEX} - key: net.if.in[{#IFNAME}] - delay: 60s - value_type: UNSIGNED - units: bps - preprocessing: - - type: CHANGE_PER_SECOND - discovery_rules: # 发现规则 - - name: Network interfaces discovery - type: SNMP_AGENT - snmp_oid: discovery[{#IFNAME},.1.3.6.1.2.1.2.2.1.2] - key: net.if.discovery - delay: 1h - filter: - conditions: - - macro: '{#IFNAME}' - value: '{$NET.IF.NAME.MATCHES}' - #value: '^(eth|bond|eno|ens)' # 直接这么写也行,上一行的写法只是为了用户宏的说明 - operator: MATCHES_REGEX - item_prototypes: # 项目原型 - - name: 'Interface {#IFNAME}: Bits received' - type: SNMP_AGENT - snmp_oid: .1.3.6.1.2.1.2.2.1.10.{#SNMPINDEX} - key: net.if.in[{#IFNAME}] - macros: # 用户宏 - - macro: '{$NET.IF.NAME.MATCHES}' - value: '^(eth|bond|eno|ens)' -``` -### 3.2 核心配置项说明 -#### 3.2.1 Items(监控项) -监控项定义了要采集的具体指标: -```yaml -items: - - name: CPU utilization # 监控项名称(用于显示) - type: SNMP_AGENT # 类型(插件只处理 SNMP_AGENT) - snmp_oid: .1.3.6.1.4.1.9.9.109.1.1.1.1.7.1 # SNMP OID - key: system.cpu.util # 唯一标识符 - value_type: FLOAT # 数据类型 - units: '%' # 单位 - delay: 30s # 采集间隔 - preprocessing: # 预处理步骤 - - type: MULTIPLIER - parameters: ['0.01'] # 乘以 0.01 转换为百分比 -``` -Type 类型说明: -- SNMP_AGENT:SNMPv2c(插件支持) -- SNMPV1_AGENT:SNMPv1(插件支持) -- SNMPV3_AGENT:SNMPv3(插件支持) -- 其他类型(如 ZABBIX_AGENT、HTTP_AGENT):插件忽略 - -Value_type 数据类型: -- FLOAT:浮点数 -- CHAR:字符(作为标签处理) -- LOG:日志 -- UNSIGNED:无符号整数 -- TEXT:文本(作为标签处理) - -Units 单位处理: -常见单位: -- B、KB、MB、GB:字节单位 -- bps、Kbps、Mbps:速率单位 -- %:百分比 -- ms、s:时间单位 - -#### 3.2.2 Discovery Rules(发现规则) -发现规则用于动态发现资源并创建监控项: -```yaml 
-discovery_rules: - - name: Network interfaces discovery - type: SNMP_AGENT - key: net.if.discovery - delay: 1h # 发现间隔 - snmp_oid: discovery[{#IFNAME},.1.3.6.1.2.1.2.2.1.2,{#IFTYPE},.1.3.6.1.2.1.2.2.1.3] - filter: - evaltype: AND # 过滤条件组合方式 - conditions: - - macro: '{#IFNAME}' - value: '^eth' - operator: MATCHES_REGEX - - macro: '{#IFTYPE}' - value: '6' # 以太网接口 - operator: EQUALS - item_prototypes: # 基于发现结果创建的监控项 - - name: 'Interface {#IFNAME}: Incoming traffic' - type: SNMP_AGENT - snmp_oid: .1.3.6.1.2.1.2.2.1.10.{#SNMPINDEX} - key: net.if.in[{#IFNAME}] -``` -Delay 采集间隔语法: -- 30s:30 秒 -- 5m:5 分钟 -- 1h:1 小时 -- 1d:1 天 -- 30:30 秒(纯数字默认为秒) -Filter 过滤器详解: - -过滤器用于筛选发现的资源,只为符合条件的资源创建监控项。 - -操作符(operator)类型: - -|操作符|说明|示例| -|--------|------|-------| -|EQUALS|完全匹配|value: `eth0`| -|NOT_EQUALS|不等于|value: `lo`| -|LIKE|包含|value: `eth`| -|NOT_LIKE|不包含|value: `docker`| -|MATCHES_REGEX|正则匹配 |value: `^(eth\|ens)`| -|NOT_MATCHES_REGEX|正则不匹配|value: `^lo`| - -条件组合方式(evaltype): - -`AND`:所有条件都满足 -`OR`:任一条件满足 -`FORMULA`:自定义表达式 - -Filter 经典案例: - -只监控物理网络接口: -```yaml -filter: - evaltype: AND - conditions: - - macro: '{#IFNAME}' - value: '^(eth|eno|ens|em)\d+$' - operator: MATCHES_REGEX - - macro: '{#IFADMINSTATUS}' - value: '1' # 管理状态为 UP - operator: EQUALS -``` - -排除虚拟接口和环回接口: -```yaml -filter: - evaltype: AND - conditions: - - macro: '{#IFNAME}' - value: '^(lo|docker|virbr|veth)' - operator: NOT_MATCHES_REGEX - - macro: '{#IFTYPE}' - value: '24' # 排除环回接口类型 - operator: NOT_EQUALS -``` -只监控特定 VLAN 接口: -``` -filter: - conditions: - - macro: '{#IFNAME}' - value: '\.(100|200|300)$' # VLAN 100, 200, 300 - operator: MATCHES_REGEX - -``` -使用复杂表达式: -``` -filter: - evaltype: FORMULA - formula: (A and B) or C - conditions: - - macro: '{#IFNAME}' - value: '^eth' - operator: MATCHES_REGEX - formulaid: A - - macro: '{#IFSPEED}' - value: '1000000000' # 1Gbps - operator: EQUALS - formulaid: B - - macro: '{#IFALIAS}' - value: 'IMPORTANT' - operator: LIKE - formulaid: C -``` - -#### 3.2.3 
Preprocessing(预处理) -预处理步骤在数据存储前对其进行转换: -``` -preprocessing: - - type: CHANGE_PER_SECOND # 计算每秒变化率 - - type: MULTIPLIER # 乘法运算 - parameters: ['8'] # 字节转比特 - - type: REGEX # 正则提取 - parameters: - - 'Temperature: ([\d.]+)' - - '\1' -``` - -支持的预处理类型: -|类型|说明|参数示例| -|--------|------|-------| -|MULTIPLIER|乘数|`['0.001']`| -|SIMPLE_CHANGE|简单变化|无参数| -|CHANGE_PER_SECOND|每秒变化率|无参数| -|REGEX|正则表达式|`['pattern', 'output']`| -|JSONPATH|JSON路径|`['$.value']`| -|SNMP_WALK_TO_JSON|SNMP Walk转JSON|`['{#MACRO}', 'oid', '0']`| -|HEX_TO_DECIMAL|十六进制转十进制|无参数| -|JAVASCRIPT|JavaScript脚本|`['return value * 100']`| - -#### 3.2.4 Macros(宏) -宏用于动态替换配置中的值 - -用户宏 `{$MACRO}`: -``` -macros: - - macro: '{$SNMP.TIMEOUT}' - value: '5' - - macro: '{$CPU.UTIL.CRIT}' - value: '90' -``` -LLD 宏 `{#MACRO}`: - -在发现过程中自动填充: - -- `{#SNMPINDEX}`:SNMP 索引 -- `{#IFNAME}`:接口名称 -- `{#IFTYPE}`:接口类型 -- 自定义宏:通过 discovery 配置定义 -宏替换机制: - -- 发现阶段:提取 LLD 宏值 -- 展开阶段:将宏替换为实际值 -- 优先级:LLD 宏 > 用户宏 > 默认值 -## 四、插件配置详解 -### 4.1 基础配置 -#### 4.1.1 SNMP 连接参数 -``` -[[instances]] -# 目标设备列表 -agents = [ - "192.168.1.1", # 简单 IP - "192.168.1.2:161", # 指定端口 - "udp://192.168.1.3:161", # 指定协议 - "tcp://switch.example.com:161", # TCP 传输 - "192.168.1.0/24", # CIDR 网段(自动扫描) -] - -# SNMP 版本(1, 2, 3) -version = 2 - -# SNMPv1/v2c 参数 -community = "public" - -# SNMPv3 参数 -username = "snmpuser" -security_level = "authPriv" # noAuthNoPriv, authNoPriv, authPriv -auth_protocol = "SHA" # MD5, SHA, SHA224, SHA256, SHA384, SHA512 -auth_password = "auth_pass_123" -priv_protocol = "AES" # DES, AES, AES192, AES256 -priv_password = "priv_pass_456" -context_name = "" - -# 连接参数 -port = 161 # 默认 SNMP 端口 -timeout = "5s" # 超时时间 -retries = 3 # 重试次数 -max_repetitions = 10 # BULK 请求最大重复数 - -# UDP 套接字模式 -unconnected_udp_socket = false # 使用非连接模式(处理大量设备时更高效) -``` -Agents 配置格式说明: -- 支持多种格式混合使用 -- CIDR 网段会自动展开为单个 IP(全量IP) -- 默认使用 UDP 协议和 161 端口 -#### 4.1.2 模板加载 -方式一:加载外部文件 ```toml -template_files = [ - "/etc/categraf/templates/cisco_switch.yaml", - 
"/etc/categraf/templates/interface_addon.yaml" -] -``` -方式二:内嵌模板内容 -```toml -[instances.template_file_contents] -# 直接在配置中嵌入模板内容 -basic_template = ''' -zabbix_export: - version: '6.0' - templates: - - template: Embedded Template - items: - - name: System uptime - type: SNMP_AGENT - snmp_oid: .1.3.6.1.2.1.1.3.0 - key: system.uptime -''' -``` -多模板合并规则: -- 后加载的模板覆盖前面的同名项 -- 宏定义按名称合并 -- 发现规则独立处理 -### 4.2 高级配置 -#### 4.2.1 发现功能配置 - -发现功能会自动从模板中读取 discovery_rules,发现调度机制: -- 每个发现规则按其 delay 独立调度 -- 首次启动时立即执行一次发现 -- 发现结果会缓存,避免重复执行 - -4.2.2 健康检查 - -健康检查自动进行,默认参数: -- 检查间隔:30秒 -- 检查超时:5秒 -- 最大重试:3次 -- 自动重连:启用 - -健康检查是通过计数标记方式进行的,默认参数下,设备标记位不健康会通过3次检查来完成. 如下: - -第一次检查失败--30秒-->第二次检查失败--30秒-->第三次检查失败,达到最大重试次数,标记设备为不健康。 - -## 五、数据类型处理机制 -### 5.1 SNMP 数据类型映射 -|SNMP PDU 类型| Go 类型| 处理方式| -|--------|------|-------| -|Integer|int|直接使用| -|Counter32|uint32|转为 uint64| -|Counter64|uint64|直接使用| -|Gauge32|uint32|直接使用| -|TimeTicks|uint32|转换为秒(÷100)| -|OctetString|[]byte|自动识别可打印字符串| -|ObjectIdentifier|string|字符串表示| -|IPAddress|string|点分十进制表示| -|Opaque|[]byte|十六进制字符串| - -### 5.2 特殊类型处理 -#### 5.2.1 CHAR/TEXT 类型作为标签 -当监控项的 value_type 为 CHAR 或 TEXT 时,插件会将其作为标签而非指标值: -```yaml -item_prototypes: - - name: Interface {#IFNAME} alias - type: SNMP_AGENT - snmp_oid: .1.3.6.1.2.1.31.1.1.1.18.{#SNMPINDEX} - key: net.if.alias[{#IFNAME}] - value_type: TEXT # 文本类型 -``` -Label Provider 机制: -- 识别 CHAR/TEXT 类型的监控项 -- 提取标签键(从 key 中解析,如 net.if.alias -> net_if_alias) -- 缓存标签值并关联到相同索引的其他监控项 -- 在输出指标时自动添加这些标签 - -对于一些枚举的CHAR或者TEXT类型的item, 最佳实践是通过预处理将其转换为数值。比如 -```yaml -items: - - name: "Interface {#IFNAME}: Operational status" - key: "net.if.status[{#IFNAME}]" - type: SNMP_AGENT - snmp_oid: ".1.3.6.1.2.1.2.2.1.8.{#SNMPINDEX}" - value_type: UNSIGNED # 改为数值类型 - preprocessing: - - type: JAVASCRIPT - parameters: - - | - // 将 SNMP 返回的状态值转换为标准数值 - // IF-MIB::ifOperStatus 值: - // 1 = up, 2 = down, 3 = testing, 4 = unknown, - // 5 = dormant, 6 = notPresent, 7 = lowerLayerDown - - var statusMap = { - '1': 1, // up -> 1 - '2': 0, 
// down -> 0 - '3': 0, // testing -> 0 - '4': 0, // unknown -> 0 - '5': 0, // dormant -> 0 - '6': 0, // notPresent -> 0 - '7': 0 // lowerLayerDown -> 0 - }; - - return statusMap[value] !== undefined ? statusMap[value] : 0; -``` - -#### 5.2.2 Counter 类型处理 -计数器类型会自动处理溢出: -```yaml -preprocessing: - - type: CHANGE_PER_SECOND # 自动处理计数器溢出 - # 32位计数器最大值:4294967295 - # 64位计数器最大值:18446744073709551615 -``` -速率计算公式: -- 正常情况:(新值 - 旧值) / 时间差 -- 溢出情况:(最大值 - 旧值 + 新值) / 时间差 -#### 5.2.3 OctetString 处理 -OctetString 的处理取决于内容: -- 可打印字符串:直接转换为 string -- 二进制数据:转换为十六进制 -- 特殊处理(通过预处理): - - MAC 地址:MAC_FORMAT - - IP 地址:IP_FORMAT - - 十六进制数值:HEX_TO_DECIMAL -## 六、自动发现功能 -### 6.1 自动发现流程 -``` -1. 执行 SNMP Walk - ↓ -2. 提取索引和宏值 - ↓ -3. 应用过滤器 - ↓ -4. 生成监控项 - ↓ -5. 动态调度采集 -``` -### 6.2 支持的发现类型 -#### 6.2.1 网络接口发现 -``` -discovery_rules: - - name: Network interfaces discovery - snmp_oid: .1.3.6.1.2.1.2.2.1.2 # ifDescr - # 或使用多 OID 发现 - snmp_oid: discovery[{#IFNAME},.1.3.6.1.2.1.2.2.1.2,{#IFTYPE},.1.3.6.1.2.1.2.2.1.3] -``` -自动生成的宏: -- {#SNMPINDEX}:接口索引 -- {#IFNAME}:接口名称 -- {#IFTYPE}:接口类型 -#### 6.2.2 文件系统发现 -```yaml -discovery_rules: - - name: Mounted filesystem discovery - snmp_oid: .1.3.6.1.2.1.25.2.3.1.3 # hrStorageDescr -``` -#### 6.2.3 自定义发现 -使用 walk[] 语法执行多个 OID walk: -```yaml -discovery_rules: - - name: Custom discovery - snmp_oid: walk[.1.3.6.1.4.1.9.9.48.1.1.1.2,.1.3.6.1.4.1.9.9.48.1.1.1.5] - preprocessing: - - type: SNMP_WALK_TO_JSON - parameters: - - '{#VLANID}' - - '.1.3.6.1.4.1.9.9.48.1.1.1.2' - - '0' - - '{#VLANNAME}' - - '.1.3.6.1.4.1.9.9.48.1.1.1.5' - - '0' -``` -举个例子,原始 SNMP Walk 数据: -``` -.1.3.6.1.2.1.2.2.1.2.1 = "lo" -.1.3.6.1.2.1.2.2.1.2.2 = "eth0" -.1.3.6.1.2.1.2.2.1.2.3 = "eth1" -.1.3.6.1.2.1.2.2.1.3.1 = 24 # loopback type -.1.3.6.1.2.1.2.2.1.3.2 = 6 # ethernet type -.1.3.6.1.2.1.2.2.1.3.3 = 6 # ethernet type -.1.3.6.1.2.1.2.2.1.5.1 = 10000000 # 10 Mbps -.1.3.6.1.2.1.2.2.1.5.2 = 1000000000 # 1 Gbps -.1.3.6.1.2.1.2.2.1.5.3 = 10000000000 # 10 Gbps -``` -配置: -``` -discovery_rules: - - 
name: Network interface discovery - type: SNMP_AGENT - key: net.if.discovery - # 使用 walk[] 语法执行多个 OID walk - snmp_oid: walk[.1.3.6.1.2.1.2.2.1.2,.1.3.6.1.2.1.2.2.1.3,.1.3.6.1.2.1.2.2.1.5] - preprocessing: - - type: SNMP_WALK_TO_JSON - parameters: - - '{#IFNAME}' # 宏名称 - - '.1.3.6.1.2.1.2.2.1.2' # OID 基础 - - '0' # 批量提取标志 (0=单个值) - - '{#IFTYPE}' # 第二个宏 - - '.1.3.6.1.2.1.2.2.1.3' # 第二个 OID - - '0' - - '{#IFSPEED}' # 第三个宏 - - '.1.3.6.1.2.1.2.2.1.5' # 第三个 OID - - '0' -``` -生成的 JSON: -``` -[ - { - "{#SNMPINDEX}": "1", - "{#IFNAME}": "lo", - "{#IFTYPE}": "24", - "{#IFSPEED}": "10000000" - }, - { - "{#SNMPINDEX}": "2", - "{#IFNAME}": "eth0", - "{#IFTYPE}": "6", - "{#IFSPEED}": "1000000000" - }, - { - "{#SNMPINDEX}": "3", - "{#IFNAME}": "eth1", - "{#IFTYPE}": "6", - "{#IFSPEED}": "10000000000" - } -] -``` - -### 6.3 发现数据处理流程 -#### 6.3.1 OID Walk 执行 -插件使用 BulkWalk 提高效率: -- 自动处理 SNMP v1 的 Walk -- v2c/v3 使用 BulkWalk -- 支持并发多个 OID walk -#### 6.3.2 宏值提取 -从 OID 结果中提取: -``` -OID: .1.3.6.1.2.1.2.2.1.2.1 = "eth0" - └─ 基础 OID ─┘└─索引─┘ └值┘ - -提取结果: -{#SNMPINDEX} = "1" -{#IFNAME} = "eth0" -``` -#### 6.3.3 过滤器应用 -按照 filter 配置筛选发现的项目(见 3.2.2 Filter 部分) - -#### 6.3.4 监控项生成 -基于 item_prototypes 和宏值生成实际监控项: -``` -模板:net.if.in[{#IFNAME}] -宏值:{#IFNAME} = "eth0" -结果:net.if.in[eth0] -``` -## 七、预处理功能详解 -### 7.1 支持的预处理类型列表 -|类型| 用途| 示例| -|--------|------|-------| -|MULTIPLIER|数值乘法 |字节转比特(×8)| -|SIMPLE_CHANGE |简单变化 |当前值-上次值| -|CHANGE_PER_SECOND |速率计算 |流量速率| -|REGEX |正则提取 |从字符串提取数值| -|JSONPATH |JSON解析 |提取JSON字段| -|TRIM/LTRIM/RTRIM|字符串修剪|去除空白| -|JAVASCRIPT |JS脚本|复杂逻辑处理| -|HEX_TO_DECIMAL |进制转换 |0xFF -> 255| -|MAC_FORMAT |MAC格式化| 标准化MAC地址| -|IP_FORMAT |IP格式化| 提取IP地址| - -### 7.2 常用预处理案例 -#### 7.2.1 数值计算 -字节转比特: -```yaml -preprocessing: - - type: MULTIPLIER - parameters: ['8'] -``` -百分比转换: -```yaml -preprocessing: - - type: MULTIPLIER - parameters: ['0.01'] # 如果原值是 0-10000,转为 0-100 -``` -计算速率: -```yaml -preprocessing: - - type: CHANGE_PER_SECOND # 自动计算每秒变化 -``` -#### 7.2.2 字符串处理 -提取温度数值: -```yaml 
-preprocessing: - - type: REGEX - parameters: - - 'Temperature: ([\d.]+)°C' - - '\1' -``` -去除空白: -```yaml -preprocessing: - - type: TRIM # 去除前后空白 -``` -#### 7.2.3 格式转换 -MAC 地址格式化: -```yaml -preprocessing: - - type: MAC_FORMAT - parameters: [':'] # 分隔符(默认冒号) -# 输入:001122334455 或 00-11-22-33-44-55 -# 输出:00:11:22:33:44:55 -``` -十六进制转十进制: -```yaml -preprocessing: - - type: HEX_TO_DECIMAL -# 输入:FF 或 0xFF -# 输出:255 -``` -7.2.4 JavaScript 脚本 - -简单计算: -```yaml -preprocessing: - - type: JAVASCRIPT - parameters: ['return value * 100 / 1024'] -``` -条件处理: -```yaml -preprocessing: - - type: JAVASCRIPT - parameters: - - | - if (value > 1000000) { - return value / 1000000; // 转为 MB - } - return value / 1000; // 转为 KB -``` -字符串处理: -```yaml -preprocessing: - - type: JAVASCRIPT - parameters: - - 'return value.toLowerCase().replace(/\s+/g, "_")' -``` -#### 7.2.5 JSONPath 提取 -提取 JSON 字段: -```yaml -preprocessing: - - type: JSONPATH - parameters: ['$.temperature.value'] -``` -提取数组元素: -```yaml -preprocessing: - - type: JSONPATH - parameters: ['$.interfaces[0].name'] -``` -## 八、实际配置示例 -### 8.1 最小化配置示例 -```toml -# /etc/categraf/conf/inputs.snmp_zabbix/snmp_zabbix.toml - -[[instances]] -# 最简配置:监控单个设备的系统信息 -agents = ["192.168.1.1"] -version = 2 -community = "public" -template_files = ["/etc/categraf/templates/basic_system.yaml"] -``` -对应的最简模板: -```yaml -# /etc/categraf/templates/basic_system.yaml -zabbix_export: - version: '6.0' - templates: - - template: Basic System - items: - - name: System uptime - type: SNMP_AGENT - snmp_oid: .1.3.6.1.2.1.1.3.0 - key: system.uptime - value_type: UNSIGNED - preprocessing: - - type: MULTIPLIER - parameters: ['0.01'] # TimeTicks to seconds -``` -### 8.2 完整配置示例 -```toml -# /etc/categraf/conf/inputs.snmp_zabbix/snmp_zabbix.toml +# Collect SNMP metrics via Zabbix Templates +# interval = 60 [[instances]] -# 基础标签 -labels = { region = "beijing", env = "production" } +# Target SNMP Agent address +agent = "192.168.1.1:161" -# SNMP 连接配置 -agents = [ - 
"192.168.1.0/24", # 扫描整个网段 - "core-switch.example.com" # 域名 -] -version = 2 +# SNMP credentials +version = "2c" community = "public" -port = 161 -timeout = "5s" -retries = 3 -max_repetitions = 25 -# 加载多个模板(会自动合并) -template_files = [ - "/etc/categraf/templates/cisco_catalyst.yaml", - "/etc/categraf/templates/custom_oids.yaml" -] +# The path to your Zabbix YAML templates directory +# The plugin will parse these templates to execute discovery and item polling +template_dir = "/opt/categraf/conf/zabbix_templates/" -# 设备映射标签 -[instances.mappings] -"192.168.1.1" = { device_name = "core-sw-01", location = "DC1" } -"192.168.1.2" = { device_name = "core-sw-02", location = "DC2" } -``` +# Specify which templates to link to this instance +templates = ["Template Net Cisco IOS SNMP"] -## 8.3 常见场景配置 -### 8.3.1 交换机端口监控 -```toml -[[instances]] -agents = ["192.168.1.1"] -version = 2 -community = "public" - -# 使用 Cisco 官方模板 -template_files = ["/etc/categraf/templates/net/cisco/cisco_snmp/template_net_cisco_snmp.yaml"] - -# 或内嵌简化模板 -[instances.template_file_contents] -switch_interfaces = ''' -zabbix_export: - version: '6.0' - templates: - - template: Switch Interfaces - discovery_rules: - - name: Interface discovery - type: SNMP_AGENT - key: net.if.discovery - delay: 1h - snmp_oid: discovery[{#IFNAME},.1.3.6.1.2.1.2.2.1.2,{#IFTYPE},.1.3.6.1.2.1.2.2.1.3,{#IFADMINSTATUS},.1.3.6.1.2.1.2.2.1.7] - filter: - evaltype: AND - conditions: - - macro: '{#IFTYPE}' - value: '6' # Ethernet - operator: EQUALS - - macro: '{#IFADMINSTATUS}' - value: '1' # UP - operator: EQUALS - item_prototypes: - - name: 'Interface {#IFNAME}: Incoming traffic' - type: SNMP_AGENT - snmp_oid: .1.3.6.1.2.1.2.2.1.10.{#SNMPINDEX} - key: net.if.in[{#IFNAME}] - value_type: UNSIGNED - units: bps - preprocessing: - - type: CHANGE_PER_SECOND - - type: MULTIPLIER - parameters: ['8'] - - name: 'Interface {#IFNAME}: Outgoing traffic' - type: SNMP_AGENT - snmp_oid: .1.3.6.1.2.1.2.2.1.16.{#SNMPINDEX} - key: net.if.out[{#IFNAME}] - 
value_type: UNSIGNED - units: bps - preprocessing: - - type: CHANGE_PER_SECOND - - type: MULTIPLIER - parameters: ['8'] - - name: 'Interface {#IFNAME}: Description' - type: SNMP_AGENT - snmp_oid: .1.3.6.1.2.1.31.1.1.1.18.{#SNMPINDEX} - key: net.if.alias[{#IFNAME}] - value_type: TEXT # 作为标签 -''' -``` -#### 8.3.2 路由器监控 -```toml -[[instances]] -agents = ["10.0.0.1"] -version = 3 -username = "snmpv3user" -security_level = "authPriv" -auth_protocol = "SHA" -auth_password = "authpass123" -priv_protocol = "AES" -priv_password = "privpass456" - -[instances.template_file_contents] -router_template = ''' -zabbix_export: - version: '6.0' - templates: - - template: Router Monitoring - items: - # CPU 使用率 - - name: CPU utilization - type: SNMP_AGENT - snmp_oid: .1.3.6.1.4.1.9.9.109.1.1.1.1.7.1 - key: system.cpu.util - value_type: FLOAT - units: '%' - # 内存使用 - - name: Memory used - type: SNMP_AGENT - snmp_oid: .1.3.6.1.4.1.9.9.48.1.1.1.5.1 - key: vm.memory.used - value_type: UNSIGNED - units: B - # 路由表大小 - - name: Routing table size - type: SNMP_AGENT - snmp_oid: .1.3.6.1.2.1.4.24.3.0 - key: net.routing.table.size - value_type: UNSIGNED - discovery_rules: - # BGP 邻居发现 - - name: BGP peer discovery - type: SNMP_AGENT - key: bgp.peer.discovery - snmp_oid: .1.3.6.1.2.1.15.3.1.2 - item_prototypes: - - name: 'BGP peer {#PEER}: State' - type: SNMP_AGENT - snmp_oid: .1.3.6.1.2.1.15.3.1.2.{#SNMPINDEX} - key: bgp.peer.state[{#PEER}] - value_type: UNSIGNED -''' -``` -#### 8.3.3 存储设备监控 -``` -[[instances]] -agents = ["storage.example.com"] -version = 2 -community = "public" - -[instances.template_file_contents] -storage_template = ''' -zabbix_export: - version: '6.0' - templates: - - template: Storage Device - discovery_rules: - - name: Storage discovery - type: SNMP_AGENT - key: storage.discovery - delay: 30m - snmp_oid: discovery[{#STORAGEDESCR},.1.3.6.1.2.1.25.2.3.1.3,{#STORAGETYPE},.1.3.6.1.2.1.25.2.3.1.2] - filter: - conditions: - - macro: '{#STORAGETYPE}' - value: 
'.1.3.6.1.2.1.25.2.1.4' # Fixed disk - operator: EQUALS - item_prototypes: - - name: '{#STORAGEDESCR}: Total space' - type: SNMP_AGENT - snmp_oid: .1.3.6.1.2.1.25.2.3.1.5.{#SNMPINDEX} - key: vfs.fs.size[{#STORAGEDESCR},total] - value_type: UNSIGNED - units: B - preprocessing: - - type: MULTIPLIER - parameters: ['4096'] # 块大小 - - name: '{#STORAGEDESCR}: Used space' - type: SNMP_AGENT - snmp_oid: .1.3.6.1.2.1.25.2.3.1.6.{#SNMPINDEX} - key: vfs.fs.size[{#STORAGEDESCR},used] - value_type: UNSIGNED - units: B - preprocessing: - - type: MULTIPLIER - parameters: ['4096'] - - name: '{#STORAGEDESCR}: Usage in %' - type: SNMP_AGENT - snmp_oid: .1.3.6.1.2.1.25.2.3.1.6.{#SNMPINDEX} - key: vfs.fs.pused[{#STORAGEDESCR}] - value_type: FLOAT - units: '%' - preprocessing: - - type: JAVASCRIPT - parameters: - - | - var used = value; - var total = 1000000; // 需要从其他地方获取 - return (used / total) * 100; -''' -``` -#### 8.3.4 打印机监控 -``` -[[instances]] -agents = ["printer.example.com"] -version = 1 # 很多打印机只支持 v1 -community = "public" - -[instances.template_file_contents] -printer_template = ''' -zabbix_export: - version: '6.0' - templates: - - template: Printer Monitoring - items: - - name: Printer status - type: SNMP_AGENT - snmp_oid: .1.3.6.1.2.1.25.3.5.1.1.1 - key: printer.status - value_type: UNSIGNED - - name: Printer error state - type: SNMP_AGENT - snmp_oid: .1.3.6.1.2.1.25.3.5.1.2.1 - key: printer.error - value_type: TEXT - - name: Toner level black - type: SNMP_AGENT - snmp_oid: .1.3.6.1.2.1.43.11.1.1.9.1.1 - key: printer.toner.black - value_type: UNSIGNED - units: '%' - - name: Pages printed - type: SNMP_AGENT - snmp_oid: .1.3.6.1.2.1.43.10.2.1.4.1.1 - key: printer.pages.total - value_type: UNSIGNED -''' -``` -## 九、故障排查 -### 9.1 常见问题及解决方案 -#### 9.1.1 连接问题 -问题:无法连接到 SNMP 设备 - -检查步骤: -```bash -# 1. 测试网络连通性 -ping 192.168.1.1 - -# 2. 测试 SNMP 端口 -nc -zvu 192.168.1.1 161 - -# 3. 使用 snmpwalk 测试 -snmpwalk -v2c -c public 192.168.1.1 system - -# 4. 
检查防火墙 -sudo iptables -L -n | grep 161 -``` -常见原因: -- 防火墙阻止 UDP 161 端口 -- SNMP 服务未启动 -- Community 字符串错误 -- ACL 限制访问 -#### 9.1.2 认证问题 -SNMPv3 认证失败 - -检查配置: -```toml -# 确保所有参数匹配 -username = "snmpuser" -security_level = "authPriv" -auth_protocol = "SHA" # 大小写敏感 -auth_password = "password" # 至少8个字符 -priv_protocol = "AES" -priv_password = "password" # 至少8个字符 -``` -测试命令: -```bash -snmpget -v3 -l authPriv -u snmpuser -a SHA -A authpass123 -x AES -X privpass456 192.168.1.1 sysDescr.0 -``` -#### 9.1.3 OID 不存在 -错误:OID not found on device - -排查方法: -```bash -# 1. 列出设备支持的所有 OID -snmpwalk -v2c -c public 192.168.1.1 .1 - -# 2. 检查特定 OID -snmpget -v2c -c public 192.168.1.1 .1.3.6.1.2.1.2.2.1.10.1 - -# 3. 查看 MIB 支持 -snmpwalk -v2c -c public 192.168.1.1 sysORTable -``` -解决方案: -- 确认设备支持该 MIB -- 使用正确的 OID -- 某些设备需要启用特定 MIB -#### 9.1.4 发现失败 -发现规则未返回任何项目 - -调试步骤: - -手动执行 walk: -```bash -snmpwalk -v2c -c public 192.168.1.1 .1.3.6.1.2.1.2.2.1.2 +# Optional: Set host macros that are referenced in the Zabbix template +# [instances.macros] +# "{$SNMP_PORT}" = "161" ``` -检查过滤器: -```yaml -filter: - conditions: - - macro: '{#IFNAME}' - value: 'eth' # 可能过滤太严格 - operator: LIKE -``` -查看日志: -```bash -tail -f /var/log/categraf/categraf.log | grep -i discovery -``` -#### 9.1.5 预处理错误 -预处理失败的常见原因: -正则表达式错误: -```yaml -# 错误:未转义特殊字符 -- type: REGEX - parameters: ['Temp: (\d+).(\d+)', '\1.\2'] - -# 正确: -- type: REGEX - parameters: ['Temp: (\d+)\.(\d+)', '\1.\2'] -``` -JavaScript 语法错误: -```yaml -# 错误:缺少 return -- type: JAVASCRIPT - parameters: ['value * 100'] - -# 正确: -- type: JAVASCRIPT - parameters: ['return value * 100'] -``` -类型不匹配: -```yaml -# 错误:对字符串使用数值运算 -- type: MULTIPLIER - parameters: ['8'] - -# 正确:先转换类型 -- type: REGEX - parameters: ['(\d+)', '\1'] -- type: MULTIPLIER - parameters: ['8'] -``` -### 9.2 调试模式使用 -启用调试模式: -``` -# 启动时添加 debug 参数 -./categraf --debug --inputs snmp_zabbix - -# 查看详细日志 -tail -f /var/log/categraf/categraf.log -``` -### 9.3 日志分析 -关键日志标识: -- E! - 错误 -- W! - 警告 -- I! - 信息 -- D! 
- 调试 -常见日志分析: -```bash -# 查看错误 -grep "E!" /var/log/categraf/categraf.log +## Metrics -# 查看发现相关 -grep -i discovery /var/log/categraf/categraf.log +Because the metrics are dynamically discovered and generated based on the chosen Zabbix template, the exact metric names will vary. Typically, the plugin automatically normalizes Zabbix item keys into Prometheus-style metric names. -# 查看特定设备 -grep "192.168.1.1" /var/log/categraf/categraf.log - -# 查看预处理错误 -grep -i preprocessing /var/log/categraf/categraf.log -``` -### 9.4 性能问题排查 -采集延迟或超时 - -优化建议: - -调整超时和重试: -```toml -timeout = "10s" # 增加超时 -retries = 2 # 减少重试 -max_repetitions = 10 # 减少批量大小 -``` -减少并发请求: -```toml -# 分散不同设备的采集时间 -[[instances]] -agents = ["192.168.1.1"] - -[[instances]] -agents = ["192.168.1.2"] -``` -优化发现规则: -```yaml -discovery_rules: - - delay: 6h # 减少发现频率 - filter: - conditions: # 严格过滤,减少生成的监控项 - - macro: '{#IFTYPE}' - value: '6' - operator: EQUALS -``` - -### 9.5 标签relabel -跟snmp插件相比,默认的设备标签从agent_host变成了snmp_agent , 如果你想修改,假如你想把key从snmp_agent修改回agent_host, 可以添加如下配置 -``` -[[instances.relabel_configs]] -source_labels = ["snmp_agent"] -target_label = "agent_host" -replacement = '$1' -action = "replace" - -[[instances.relabel_configs]] -regex = "snmp_agent" -action = "labeldrop" -``` - -## 十、限制和注意事项 -### 10.1 功能限制 -#### 10.1.1 只支持 SNMP_AGENT 类型 -插件只处理以下类型的监控项: - - SNMP_AGENT - - SNMPV1_AGENT - - SNMPV3_AGENT) -不支持的类型(会被忽略): -- ZABBIX_AGENT -- HTTP_AGENT -- CALCULATED -- DEPENDENT -- TRAP -#### 10.1.2 不支持的 Zabbix 功能 -|功能| 支持情况| 说明| -|--------|------|-------| -|Items| ✅ 部分支持| 仅 SNMP 类型| -|Discovery| ✅ 支持 |完整支持| -|Triggers| ❌ 不支持| 插件不处理告警| -|Graphs| ❌ 不支持| 忽略图表定义| -|Dashboards| ❌ 不支持 |忽略仪表板| -|Actions| ❌ 不支持| 不执行动作| -|Trends |❌ 不支持| 不存储趋势数据| -|Events |❌ 不支持 |不生成事件| - -#### 10.1.3 模板版本兼容性 -- 完全支持:Zabbix 6.0+ YAML 格式 -- 不支持:Zabbix 5.x 及以下的 XML 格式 -- 部分支持:可能无法识别最新版本的新特性 - -### 10.2 性能考虑 -建议限制: -- 单个实例最多监控 100 个设备 -- 每个设备最多 1000 个监控项 -- 发现规则生成的项目不超过 10000 个 -- 批量请求大小固定为 60 个 OID -资源消耗: -- 每个设备一个 SNMP 连接 -- 
每个监控项占用约 1KB 内存 -- CPU 使用主要在预处理阶段 -### 10.3 安全建议 -使用 SNMPv3: -```toml -version = 3 -security_level = "authPriv" -``` -限制 community 权限: -- 使用只读 community -- 配置设备 ACL - -网络隔离: -- SNMP 流量不应跨越不信任网络 -- 使用 VLAN 隔离管理网络 - -定期更新: -- 及时更新 Categraf -- 更新设备固件 - -## 十一、迁移指南 -### 11.1 从原生 SNMP 插件迁移 -步骤 1:导出现有配置 -原生 snmp插件 配置示例: -``` -[[instances]] -agents = ["192.168.1.1"] -version = 2 -community = "public" - -[[instances.field]] -oid = ".1.3.6.1.2.1.1.3.0" -name = "uptime" - -[[instances.field]] -oid = ".1.3.6.1.2.1.2.2.1.10.1" -name = "interface.eth0.in" -``` -步骤 2: 转换为模板格式 -创建模板文件 migration_template.yaml: -```yaml -zabbix_export: - version: '6.0' - templates: - - template: Migrated from SNMP - items: - - name: System Uptime - type: SNMP_AGENT - snmp_oid: .1.3.6.1.2.1.1.3.0 - key: uptime - value_type: UNSIGNED - - name: Interface eth0 In - type: SNMP_AGENT - snmp_oid: .1.3.6.1.2.1.2.2.1.10.1 - key: interface.eth0.in - value_type: UNSIGNED -``` -步骤 3: 添加snmp_zabbix插件配置文件 -```toml -# 新的 snmp_zabbix 配置 -[[instances]] -agents = ["192.168.1.1"] -version = 2 -community = "public" -template_files = ["new_template.yaml"] -``` - -### 11.2 从 Zabbix 迁移 -步骤 1:导出 Zabbix 配置 -推荐使用 Web 界面(见 2.2.1) - -步骤 2:分析和筛选模板 -```bash -# 查找包含 SNMP 项的模板 -grep -l "type: SNMP" templates/*.yaml - -# 统计每个模板的 SNMP 项数量 -for f in templates/*.yaml; do - count=$(grep -c "type: SNMP" "$f" 2>/dev/null || echo 0) - if [ $count -gt 0 ]; then - echo "$f: $count SNMP items" - fi -done -``` -步骤 3:配置映射表 -如果 Zabbix 中使用了主机变量,创建映射: -```toml -[instances.mappings] -"192.168.1.1" = { - device_name = "core-sw-01", - location = "DC1", - contact = "admin@example.com" -} - -``` -步骤 4:验证迁移 -```bash -# 测试配置 -categraf --test --inputs snmp_zabbix - -# 与zabbix对比指标 -``` - -## 十二、附录 -A. 
配置参数速查表 - -|参数 |类型 |默认值 |说明| -|--------|------|-------|-------| -|agents |[]string |必填 |目标设备列表| -|version |int |2 |SNMP 版本(1,2,3)| -|community |string |public |团体字符串| -|username |string |- |SNMPv3 用户名| -|security_level| string |noAuthNoPriv |安全级别| -|auth_protocol| string |MD5 |认证协议| -|auth_password| string |- |认证密码| -|priv_protocol| string |DES |加密协议| -|priv_password| string |- |加密密码| -|port| int |161 |SNMP 端口| -|timeout |duration |5s |超时时间| -|retries |int |3 |重试次数| -|max_repetitions |int |10 |BULK单次请求返回的数据| -|template_files| []string |- |模板文件路径| - -B. 预处理类型对照表 - -|Zabbix 类型|插件支持| 说明| -|--------|------|-------| -|MULTIPLIER|✅| 乘数| -|SIMPLE_CHANGE|✅| 简单变化| -|CHANGE_PER_SECOND|✅| 每秒变化率| -|REGEX|✅| 正则表达式| -|JSONPATH|✅| JSON 路径| -|SNMP_WALK_TO_JSON|✅| Walk 转 JSON| -|HEX_TO_DECIMAL|✅| 十六进制转十进制| -|JAVASCRIPT|✅| JavaScript| -|TRIM |✅| 去除空白| -|MAC_FORMAT|✅ |MAC 格式化| -|IP_FORMAT| - ✅ |IP 格式化| - -C. 常用 OID 列表 -系统信息: -``` -.1.3.6.1.2.1.1.1.0 - sysDescr -.1.3.6.1.2.1.1.3.0 - sysUpTime -.1.3.6.1.2.1.1.5.0 - sysName -.1.3.6.1.2.1.1.6.0 - sysLocation -.1.3.6.1.2.1.1.7.0 - sysServices -``` -网络接口: -``` -.1.3.6.1.2.1.2.2.1.2 - ifDescr -.1.3.6.1.2.1.2.2.1.3 - ifType -.1.3.6.1.2.1.2.2.1.5 - ifSpeed -.1.3.6.1.2.1.2.2.1.7 - ifAdminStatus -.1.3.6.1.2.1.2.2.1.8 - ifOperStatus -.1.3.6.1.2.1.2.2.1.10 - ifInOctets -.1.3.6.1.2.1.2.2.1.16 - ifOutOctets -``` -CPU/内存(企业 MIB): - -``` -# Cisco -.1.3.6.1.4.1.9.9.109.1.1.1.1.7 - CPU 使用率 -.1.3.6.1.4.1.9.9.48.1.1.1.5 - 内存已用 -.1.3.6.1.4.1.9.9.48.1.1.1.6 - 内存空闲 - -# HP -.1.3.6.1.4.1.11.2.14.11.5.1.9.6.1 - CPU 使用率 -``` -D. 正则表达式示例 -``` -# 提取数字 -- type: REGEX - parameters: ['(\d+)', '\1'] - -# 提取温度值 -- type: REGEX - parameters: ['Temperature:\s*(\d+\.?\d*)', '\1'] - -# 提取接口名称 -- type: REGEX - parameters: ['([\w-]+)\s*:\s*(.+)', '\1'] - -# 提取 IP 地址 -- type: REGEX - parameters: ['(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})', '\1'] - -# 提取 MAC 地址 -- type: REGEX - parameters: ['([0-9A-Fa-f]{2}[:-]){5}([0-9A-Fa-f]{2})', '\0'] -``` - -E. 
JavaScript 脚本模板 -``` -// 基本计算 -return value * 100; - -// 条件判断 -if (value > 1000) { - return value / 1000; -} else { - return value; -} - -// 字符串处理 -return value.toUpperCase(); -return value.replace(/\s+/g, '_'); - -// JSON 处理 -var obj = JSON.parse(value); -return obj.temperature; - -// 数组处理 -var parts = value.split(','); -return parts[0]; - -// 复杂逻辑 -function convertBytes(bytes) { - if (bytes >= 1099511627776) { - return (bytes / 1099511627776).toFixed(2) + " TB"; - } else if (bytes >= 1073741824) { - return (bytes / 1073741824).toFixed(2) + " GB"; - } else if (bytes >= 1048576) { - return (bytes / 1048576).toFixed(2) + " MB"; - } else if (bytes >= 1024) { - return (bytes / 1024).toFixed(2) + " KB"; - } else { - return bytes + " B"; - } -} -return convertBytes(value); -``` +For example, a Zabbix item key `net.if.in[ifInOctets.1]` might be translated to `zabbix_net_if_in` with appropriate tags for the interface name and index. -F. 术语表 +## Dashboards -|术语| 说明| -|--|--| -|OID |Object Identifier,对象标识符| -|MIB |Management Information Base,管理信息库| -|PDU |Protocol Data Unit,协议数据单元| -|LLD |Low-Level Discovery,低级别发现| -|SNMP Walk |遍历 SNMP 子树的操作| -|Community |SNMPv1/v2c 的认证字符串| -|Bulk Request |SNMPv2c/v3 的批量请求| -|Trap |SNMP 主动推送的告警| -|Counter |累加计数器,会溢出| -|Gauge| 测量值,可增可减| -|TimeTicks| 时间计数器,单位 1/100 秒| -|Item| 监控项,定义要采集的指标| -|Item Prototype| 项目原型,发现后生成监控项的模板| -|Macro |宏,用于动态替换的变量| -|Preprocessing |预处理,数据采集后的转换步骤| +Since the collected metrics are entirely dependent on the specific Zabbix template you load, a universal Dashboard cannot cover all scenarios. However, we provide a generic Dashboard (`dashboard.json`) in this directory that visualizes the fundamental network interface metrics (Traffic In/Out) which are standard across almost all Zabbix Network Templates. 
diff --git a/inputs/snmp_zabbix/README_CN.md b/inputs/snmp_zabbix/README_CN.md new file mode 100644 index 000000000..b49e0b997 --- /dev/null +++ b/inputs/snmp_zabbix/README_CN.md @@ -0,0 +1,1514 @@ +## 一、插件概述 +### 1.1 插件功能介绍 +snmp_zabbix 是一款兼容Zabbix采集模板的SNMP 数据采集插件,其最大特色是能够直接使用 Zabbix 的 YAML 格式模板文件。这意味着您可以利用 Zabbix 丰富的模板生态系统,无需重新编写监控配置。 +主要功能包括: + + - 完整的 SNMP 协议支持:支持 SNMPv1、v2c、v3 所有版本 + - Zabbix 模板兼容:直接使用 Zabbix 6.0+ 的 YAML 格式模板 + - 自动发现机制:自动发现网络接口、文件系统等资源并动态创建监控项 + - 强大的预处理:支持正则表达式、JavaScript、数值计算等多种数据预处理方式 + - 精细化的调度:支持item粒度调度采集任务 + - 健康检查与自动恢复:自动检测连接状态并重连 + +### 1.2 与snmp 插件的区别 +|特性|SNMP_Zabbix 插件| SNMP 插件| +|--------|------|-------| +|配置方式|Zabbix 模板 + 简单配置| 手动配置每个 OID| +|自动发现|支持 LLD(低级别发现)|需手动配置| +|预处理 |支持 20+ 种预处理方式|基本数值转换| +|模板复用|可直接使用 Zabbix 模板库|需从零开始| +|配置复杂度|低(使用现成模板)|高(逐个配置)| +|动态监控项|支持(通过发现规则)|不支持| + +### 1.3 适用场景 +- 从 Zabbix 迁移到 Categraf,希望复用现有监控模板 +- 需要监控大量网络设备(交换机、路由器、防火墙等) +- 需要动态发现和监控变化的资源(如网络接口) +- 需要对采集数据进行复杂预处理 +- 希望利用 Zabbix 社区丰富的模板资源 + +### 1.4 系统要求和依赖 +- Categraf 版本:开源版 >= v0.4.24,企业版 >= v0.4.40 +- 网络要求:能够访问目标 SNMP 设备的 UDP (默认161) 端口 +- 目标设备:启用 SNMP 服务的网络设备或服务器 +- 模板要求:Zabbix 6.0 及以上版本的 YAML 格式模板(不支持旧版 XML 格式) + +## 二、Zabbix 模板获取与管理 +### 2.1 什么是 Zabbix 模板 +Zabbix 模板是预定义的监控配置集合,包含了监控项、发现规则、触发器等配置。每个模板针对特定类型的设备或服务,如 "Cisco Switch"、"Linux SNMP" 等。 + +### 2.2 获取模板的方式 +#### 2.2.1 从 Zabbix Web 界面导出 +- 步骤 1:登录 Zabbix Web 界面 +比如,https://your-zabbix-server/ + +- 步骤 2:导航到模板页面 +数据采集(Data Collection) -> 模板(Templates) + +- 步骤 3:选择要导出的模板 +勾选需要导出的模板(可多选),点击底部"导出(Export)"按钮 +- 步骤 4:选择导出格式 +重要:选择 "YAML" 格式(Zabbix 6.0+),如果只有 XML 选项,说明 Zabbix 版本过低 +- 步骤 5:保存文件 +文件将自动下载,默认名称如:zbx_export_templates.yaml + +#### 2.2.2 使用 Zabbix API 导出 +方法 1:使用 curl 命令 +``` +# 1. 获取认证 token +# 7.0 以上版本, 请参考https://flashcat.cloud/blog/zabbix-to-flashcat/ 如何申请api token +# 以下接口均在7.2版本上验证, 7.0 以下版本接口和参数可能有些差异,请自行查询zabbix官网文档 + +# 2. 
获取模板 ID +curl -s -X POST \ + -H 'Content-Type: application/json' \ + -H "Authorization: Bearer $TOKEN" \ + -d "{ + \"jsonrpc\": \"2.0\", + \"method\": \"template.get\", + \"params\": { + \"output\": [\"templateid\", \"name\"], + \"filter\": { + \"name\": [\"Template Net Cisco IOS SNMPv2\"] + } + }, + \"id\": 2 + }" \ + http://your-zabbix-server/api_jsonrpc.php | jq . + +# 3. 导出模板(获取到 templateid 后) +curl -s -X POST \ + -H 'Content-Type: application/json' \ + -H "Authorization: Bearer $TOKEN" \ + -d "{ + \"jsonrpc\": \"2.0\", + \"method\": \"configuration.export\", + \"params\": { + \"format\": \"yaml\", + \"options\": { + \"templates\": [\"10255\"] + } + }, + \"id\": 3 + }" \ + http://your-zabbix-server/api_jsonrpc.php | jq -r .result > template_cisco.yaml +``` + +#### 2.2.3 使用官方/社区模板 + 访问 Zabbix Git 仓库: +```bash +# 克隆整个模板仓库,从仓库中拷贝相关 yaml 文件 +git clone git@github.com:zabbix/zabbix.git + +# 或者直接下载特定模板 +wget https://raw.githubusercontent.com/zabbix/zabbix/master/templates/net/cisco/cisco_snmp/template_net_cisco_snmp.yaml +``` +常用网络设备模板推荐: + +|设备类型|模板名称|文件路径| +|--------|------|-------| +|Cisco 交换机| Cisco IOS by SNMP|templates/net/cisco/cisco_snmp/template_net_cisco_snmp.yaml| +|Huawei 交换机|Huawei VRP by SNMP|templates/net/huawei_snmp/template_net_huawei_snmp.yaml| +|HP 交换机|HP Enterprise Switch by SNMP|templates/net/hp_hpn_snmp/template_net_hp_hpn_snmp.yaml| +|Linux 服务器|Linux by SNMP|templates/os/linux_snmp_snmp/template_os_linux_snmp_snmp.yaml| +|Windows 服务器|Windows by SNMP|templates/os/windows_snmp/template_os_windows_snmp.yaml| +|通用网络设备|Network Generic Device by SNMP|templates/net/generic_snmp/template_net_generic_snmp.yaml| + +这部分模板已经放到 https://github.com/flashcatcloud/categraf/blob/master/conf/zbx_templates +### 2.3 模板文件格式转换(XML to YAML) + +如果只有 XML 格式的模板,需要进行转换: + +1. 把 XML 模板导入 Zabbix +2. 
重新导出为 YAML 格式的模板 + + +## 三、Zabbix 模板结构详解 +### 3.1 模板基本结构 +一个典型的 Zabbix YAML 模板结构如下: +``` +zabbix_export: + version: '7.0' + date: '2024-01-15T10:00:00Z' + templates: + - template: Template Net Example Device + name: Template Net Example Device + description: Template for monitoring example network device + groups: + - name: Templates/Network devices + items: # 监控项 + - name: Interface {#IFNAME} incoming traffic + type: SNMP_AGENT + snmp_oid: .1.3.6.1.2.1.2.2.1.10.{#SNMPINDEX} + key: net.if.in[{#IFNAME}] + delay: 60s + value_type: UNSIGNED + units: bps + preprocessing: + - type: CHANGE_PER_SECOND + discovery_rules: # 发现规则 + - name: Network interfaces discovery + type: SNMP_AGENT + snmp_oid: discovery[{#IFNAME},.1.3.6.1.2.1.2.2.1.2] + key: net.if.discovery + delay: 1h + filter: + conditions: + - macro: '{#IFNAME}' + value: '{$NET.IF.NAME.MATCHES}' + #value: '^(eth|bond|eno|ens)' # 直接这么写也行,上一行的写法只是为了用户宏的说明 + operator: MATCHES_REGEX + item_prototypes: # 项目原型 + - name: 'Interface {#IFNAME}: Bits received' + type: SNMP_AGENT + snmp_oid: .1.3.6.1.2.1.2.2.1.10.{#SNMPINDEX} + key: net.if.in[{#IFNAME}] + macros: # 用户宏 + - macro: '{$NET.IF.NAME.MATCHES}' + value: '^(eth|bond|eno|ens)' +``` +### 3.2 核心配置项说明 +#### 3.2.1 Items(监控项) +监控项定义了要采集的具体指标: +```yaml +items: + - name: CPU utilization # 监控项名称(用于显示) + type: SNMP_AGENT # 类型(插件只处理 SNMP_AGENT) + snmp_oid: .1.3.6.1.4.1.9.9.109.1.1.1.1.7.1 # SNMP OID + key: system.cpu.util # 唯一标识符 + value_type: FLOAT # 数据类型 + units: '%' # 单位 + delay: 30s # 采集间隔 + preprocessing: # 预处理步骤 + - type: MULTIPLIER + parameters: ['0.01'] # 乘以 0.01 转换为百分比 +``` +Type 类型说明: +- SNMP_AGENT:SNMPv2c(插件支持) +- SNMPV1_AGENT:SNMPv1(插件支持) +- SNMPV3_AGENT:SNMPv3(插件支持) +- 其他类型(如 ZABBIX_AGENT、HTTP_AGENT):插件忽略 + +Value_type 数据类型: +- FLOAT:浮点数 +- CHAR:字符(作为标签处理) +- LOG:日志 +- UNSIGNED:无符号整数 +- TEXT:文本(作为标签处理) + +Units 单位处理: +常见单位: +- B、KB、MB、GB:字节单位 +- bps、Kbps、Mbps:速率单位 +- %:百分比 +- ms、s:时间单位 + +#### 3.2.2 Discovery Rules(发现规则) +发现规则用于动态发现资源并创建监控项: +```yaml 
+discovery_rules: + - name: Network interfaces discovery + type: SNMP_AGENT + key: net.if.discovery + delay: 1h # 发现间隔 + snmp_oid: discovery[{#IFNAME},.1.3.6.1.2.1.2.2.1.2,{#IFTYPE},.1.3.6.1.2.1.2.2.1.3] + filter: + evaltype: AND # 过滤条件组合方式 + conditions: + - macro: '{#IFNAME}' + value: '^eth' + operator: MATCHES_REGEX + - macro: '{#IFTYPE}' + value: '6' # 以太网接口 + operator: EQUALS + item_prototypes: # 基于发现结果创建的监控项 + - name: 'Interface {#IFNAME}: Incoming traffic' + type: SNMP_AGENT + snmp_oid: .1.3.6.1.2.1.2.2.1.10.{#SNMPINDEX} + key: net.if.in[{#IFNAME}] +``` +Delay 采集间隔语法: +- 30s:30 秒 +- 5m:5 分钟 +- 1h:1 小时 +- 1d:1 天 +- 30:30 秒(纯数字默认为秒) +Filter 过滤器详解: + +过滤器用于筛选发现的资源,只为符合条件的资源创建监控项。 + +操作符(operator)类型: + +|操作符|说明|示例| +|--------|------|-------| +|EQUALS|完全匹配|value: `eth0`| +|NOT_EQUALS|不等于|value: `lo`| +|LIKE|包含|value: `eth`| +|NOT_LIKE|不包含|value: `docker`| +|MATCHES_REGEX|正则匹配 |value: `^(eth\|ens)`| +|NOT_MATCHES_REGEX|正则不匹配|value: `^lo`| + +条件组合方式(evaltype): + +`AND`:所有条件都满足 +`OR`:任一条件满足 +`FORMULA`:自定义表达式 + +Filter 经典案例: + +只监控物理网络接口: +```yaml +filter: + evaltype: AND + conditions: + - macro: '{#IFNAME}' + value: '^(eth|eno|ens|em)\d+$' + operator: MATCHES_REGEX + - macro: '{#IFADMINSTATUS}' + value: '1' # 管理状态为 UP + operator: EQUALS +``` + +排除虚拟接口和环回接口: +```yaml +filter: + evaltype: AND + conditions: + - macro: '{#IFNAME}' + value: '^(lo|docker|virbr|veth)' + operator: NOT_MATCHES_REGEX + - macro: '{#IFTYPE}' + value: '24' # 排除环回接口类型 + operator: NOT_EQUALS +``` +只监控特定 VLAN 接口: +``` +filter: + conditions: + - macro: '{#IFNAME}' + value: '\.(100|200|300)$' # VLAN 100, 200, 300 + operator: MATCHES_REGEX + +``` +使用复杂表达式: +``` +filter: + evaltype: FORMULA + formula: (A and B) or C + conditions: + - macro: '{#IFNAME}' + value: '^eth' + operator: MATCHES_REGEX + formulaid: A + - macro: '{#IFSPEED}' + value: '1000000000' # 1Gbps + operator: EQUALS + formulaid: B + - macro: '{#IFALIAS}' + value: 'IMPORTANT' + operator: LIKE + formulaid: C +``` + +#### 3.2.3 
Preprocessing(预处理) +预处理步骤在数据存储前对其进行转换: +``` +preprocessing: + - type: CHANGE_PER_SECOND # 计算每秒变化率 + - type: MULTIPLIER # 乘法运算 + parameters: ['8'] # 字节转比特 + - type: REGEX # 正则提取 + parameters: + - 'Temperature: ([\d.]+)' + - '\1' +``` + +支持的预处理类型: +|类型|说明|参数示例| +|--------|------|-------| +|MULTIPLIER|乘数|`['0.001']`| +|SIMPLE_CHANGE|简单变化|无参数| +|CHANGE_PER_SECOND|每秒变化率|无参数| +|REGEX|正则表达式|`['pattern', 'output']`| +|JSONPATH|JSON路径|`['$.value']`| +|SNMP_WALK_TO_JSON|SNMP Walk转JSON|`['{#MACRO}', 'oid', '0']`| +|HEX_TO_DECIMAL|十六进制转十进制|无参数| +|JAVASCRIPT|JavaScript脚本|`['return value * 100']`| + +#### 3.2.4 Macros(宏) +宏用于动态替换配置中的值 + +用户宏 `{$MACRO}`: +``` +macros: + - macro: '{$SNMP.TIMEOUT}' + value: '5' + - macro: '{$CPU.UTIL.CRIT}' + value: '90' +``` +LLD 宏 `{#MACRO}`: + +在发现过程中自动填充: + +- `{#SNMPINDEX}`:SNMP 索引 +- `{#IFNAME}`:接口名称 +- `{#IFTYPE}`:接口类型 +- 自定义宏:通过 discovery 配置定义 +宏替换机制: + +- 发现阶段:提取 LLD 宏值 +- 展开阶段:将宏替换为实际值 +- 优先级:LLD 宏 > 用户宏 > 默认值 +## 四、插件配置详解 +### 4.1 基础配置 +#### 4.1.1 SNMP 连接参数 +``` +[[instances]] +# 目标设备列表 +agents = [ + "192.168.1.1", # 简单 IP + "192.168.1.2:161", # 指定端口 + "udp://192.168.1.3:161", # 指定协议 + "tcp://switch.example.com:161", # TCP 传输 + "192.168.1.0/24", # CIDR 网段(自动扫描) +] + +# SNMP 版本(1, 2, 3) +version = 2 + +# SNMPv1/v2c 参数 +community = "public" + +# SNMPv3 参数 +username = "snmpuser" +security_level = "authPriv" # noAuthNoPriv, authNoPriv, authPriv +auth_protocol = "SHA" # MD5, SHA, SHA224, SHA256, SHA384, SHA512 +auth_password = "auth_pass_123" +priv_protocol = "AES" # DES, AES, AES192, AES256 +priv_password = "priv_pass_456" +context_name = "" + +# 连接参数 +port = 161 # 默认 SNMP 端口 +timeout = "5s" # 超时时间 +retries = 3 # 重试次数 +max_repetitions = 10 # BULK 请求最大重复数 + +# UDP 套接字模式 +unconnected_udp_socket = false # 使用非连接模式(处理大量设备时更高效) +``` +Agents 配置格式说明: +- 支持多种格式混合使用 +- CIDR 网段会自动展开为单个 IP(全量IP) +- 默认使用 UDP 协议和 161 端口 +#### 4.1.2 模板加载 +方式一:加载外部文件 +```toml +template_files = [ + "/etc/categraf/templates/cisco_switch.yaml", + 
"/etc/categraf/templates/interface_addon.yaml" +] +``` +方式二:内嵌模板内容 +```toml +[instances.template_file_contents] +# 直接在配置中嵌入模板内容 +basic_template = ''' +zabbix_export: + version: '6.0' + templates: + - template: Embedded Template + items: + - name: System uptime + type: SNMP_AGENT + snmp_oid: .1.3.6.1.2.1.1.3.0 + key: system.uptime +''' +``` +多模板合并规则: +- 后加载的模板覆盖前面的同名项 +- 宏定义按名称合并 +- 发现规则独立处理 +### 4.2 高级配置 +#### 4.2.1 发现功能配置 + +发现功能会自动从模板中读取 discovery_rules,发现调度机制: +- 每个发现规则按其 delay 独立调度 +- 首次启动时立即执行一次发现 +- 发现结果会缓存,避免重复执行 + +#### 4.2.2 健康检查 + +健康检查自动进行,默认参数: +- 检查间隔:30秒 +- 检查超时:5秒 +- 最大重试:3次 +- 自动重连:启用 + +健康检查是通过计数标记方式进行的,默认参数下,设备需要连续 3 次检查失败才会被标记为不健康,如下: + +第一次检查失败--30秒-->第二次检查失败--30秒-->第三次检查失败,达到最大重试次数,标记设备为不健康。 + +## 五、数据类型处理机制 +### 5.1 SNMP 数据类型映射 +|SNMP PDU 类型| Go 类型| 处理方式| +|--------|------|-------| +|Integer|int|直接使用| +|Counter32|uint32|转为 uint64| +|Counter64|uint64|直接使用| +|Gauge32|uint32|直接使用| +|TimeTicks|uint32|转换为秒(÷100)| +|OctetString|[]byte|自动识别可打印字符串| +|ObjectIdentifier|string|字符串表示| +|IPAddress|string|点分十进制表示| +|Opaque|[]byte|十六进制字符串| + +### 5.2 特殊类型处理 +#### 5.2.1 CHAR/TEXT 类型作为标签 +当监控项的 value_type 为 CHAR 或 TEXT 时,插件会将其作为标签而非指标值: +```yaml +item_prototypes: + - name: Interface {#IFNAME} alias + type: SNMP_AGENT + snmp_oid: .1.3.6.1.2.1.31.1.1.1.18.{#SNMPINDEX} + key: net.if.alias[{#IFNAME}] + value_type: TEXT # 文本类型 +``` +Label Provider 机制: +- 识别 CHAR/TEXT 类型的监控项 +- 提取标签键(从 key 中解析,如 net.if.alias -> net_if_alias) +- 缓存标签值并关联到相同索引的其他监控项 +- 在输出指标时自动添加这些标签 + +对于一些枚举的CHAR或者TEXT类型的item, 最佳实践是通过预处理将其转换为数值。比如 +```yaml +items: + - name: "Interface {#IFNAME}: Operational status" + key: "net.if.status[{#IFNAME}]" + type: SNMP_AGENT + snmp_oid: ".1.3.6.1.2.1.2.2.1.8.{#SNMPINDEX}" + value_type: UNSIGNED # 改为数值类型 + preprocessing: + - type: JAVASCRIPT + parameters: + - | + // 将 SNMP 返回的状态值转换为标准数值 + // IF-MIB::ifOperStatus 值: + // 1 = up, 2 = down, 3 = testing, 4 = unknown, + // 5 = dormant, 6 = notPresent, 7 = lowerLayerDown + + var statusMap = { + '1': 1, // up -> 1 + '2': 0, 
// down -> 0 + '3': 0, // testing -> 0 + '4': 0, // unknown -> 0 + '5': 0, // dormant -> 0 + '6': 0, // notPresent -> 0 + '7': 0 // lowerLayerDown -> 0 + }; + + return statusMap[value] !== undefined ? statusMap[value] : 0; +``` + +#### 5.2.2 Counter 类型处理 +计数器类型会自动处理溢出: +```yaml +preprocessing: + - type: CHANGE_PER_SECOND # 自动处理计数器溢出 + # 32位计数器最大值:4294967295 + # 64位计数器最大值:18446744073709551615 +``` +速率计算公式: +- 正常情况:(新值 - 旧值) / 时间差 +- 溢出情况:(最大值 - 旧值 + 新值) / 时间差 +#### 5.2.3 OctetString 处理 +OctetString 的处理取决于内容: +- 可打印字符串:直接转换为 string +- 二进制数据:转换为十六进制 +- 特殊处理(通过预处理): + - MAC 地址:MAC_FORMAT + - IP 地址:IP_FORMAT + - 十六进制数值:HEX_TO_DECIMAL +## 六、自动发现功能 +### 6.1 自动发现流程 +``` +1. 执行 SNMP Walk + ↓ +2. 提取索引和宏值 + ↓ +3. 应用过滤器 + ↓ +4. 生成监控项 + ↓ +5. 动态调度采集 +``` +### 6.2 支持的发现类型 +#### 6.2.1 网络接口发现 +``` +discovery_rules: + - name: Network interfaces discovery + snmp_oid: .1.3.6.1.2.1.2.2.1.2 # ifDescr + # 或使用多 OID 发现 + snmp_oid: discovery[{#IFNAME},.1.3.6.1.2.1.2.2.1.2,{#IFTYPE},.1.3.6.1.2.1.2.2.1.3] +``` +自动生成的宏: +- {#SNMPINDEX}:接口索引 +- {#IFNAME}:接口名称 +- {#IFTYPE}:接口类型 +#### 6.2.2 文件系统发现 +```yaml +discovery_rules: + - name: Mounted filesystem discovery + snmp_oid: .1.3.6.1.2.1.25.2.3.1.3 # hrStorageDescr +``` +#### 6.2.3 自定义发现 +使用 walk[] 语法执行多个 OID walk: +```yaml +discovery_rules: + - name: Custom discovery + snmp_oid: walk[.1.3.6.1.4.1.9.9.48.1.1.1.2,.1.3.6.1.4.1.9.9.48.1.1.1.5] + preprocessing: + - type: SNMP_WALK_TO_JSON + parameters: + - '{#VLANID}' + - '.1.3.6.1.4.1.9.9.48.1.1.1.2' + - '0' + - '{#VLANNAME}' + - '.1.3.6.1.4.1.9.9.48.1.1.1.5' + - '0' +``` +举个例子,原始 SNMP Walk 数据: +``` +.1.3.6.1.2.1.2.2.1.2.1 = "lo" +.1.3.6.1.2.1.2.2.1.2.2 = "eth0" +.1.3.6.1.2.1.2.2.1.2.3 = "eth1" +.1.3.6.1.2.1.2.2.1.3.1 = 24 # loopback type +.1.3.6.1.2.1.2.2.1.3.2 = 6 # ethernet type +.1.3.6.1.2.1.2.2.1.3.3 = 6 # ethernet type +.1.3.6.1.2.1.2.2.1.5.1 = 10000000 # 10 Mbps +.1.3.6.1.2.1.2.2.1.5.2 = 1000000000 # 1 Gbps +.1.3.6.1.2.1.2.2.1.5.3 = 10000000000 # 10 Gbps +``` +配置: +``` +discovery_rules: + - 
name: Network interface discovery + type: SNMP_AGENT + key: net.if.discovery + # 使用 walk[] 语法执行多个 OID walk + snmp_oid: walk[.1.3.6.1.2.1.2.2.1.2,.1.3.6.1.2.1.2.2.1.3,.1.3.6.1.2.1.2.2.1.5] + preprocessing: + - type: SNMP_WALK_TO_JSON + parameters: + - '{#IFNAME}' # 宏名称 + - '.1.3.6.1.2.1.2.2.1.2' # OID 基础 + - '0' # 批量提取标志 (0=单个值) + - '{#IFTYPE}' # 第二个宏 + - '.1.3.6.1.2.1.2.2.1.3' # 第二个 OID + - '0' + - '{#IFSPEED}' # 第三个宏 + - '.1.3.6.1.2.1.2.2.1.5' # 第三个 OID + - '0' +``` +生成的 JSON: +``` +[ + { + "{#SNMPINDEX}": "1", + "{#IFNAME}": "lo", + "{#IFTYPE}": "24", + "{#IFSPEED}": "10000000" + }, + { + "{#SNMPINDEX}": "2", + "{#IFNAME}": "eth0", + "{#IFTYPE}": "6", + "{#IFSPEED}": "1000000000" + }, + { + "{#SNMPINDEX}": "3", + "{#IFNAME}": "eth1", + "{#IFTYPE}": "6", + "{#IFSPEED}": "10000000000" + } +] +``` + +### 6.3 发现数据处理流程 +#### 6.3.1 OID Walk 执行 +插件使用 BulkWalk 提高效率: +- 自动处理 SNMP v1 的 Walk +- v2c/v3 使用 BulkWalk +- 支持并发多个 OID walk +#### 6.3.2 宏值提取 +从 OID 结果中提取: +``` +OID: .1.3.6.1.2.1.2.2.1.2.1 = "eth0" + └─ 基础 OID ─┘└─索引─┘ └值┘ + +提取结果: +{#SNMPINDEX} = "1" +{#IFNAME} = "eth0" +``` +#### 6.3.3 过滤器应用 +按照 filter 配置筛选发现的项目(见 3.2.2 Filter 部分) + +#### 6.3.4 监控项生成 +基于 item_prototypes 和宏值生成实际监控项: +``` +模板:net.if.in[{#IFNAME}] +宏值:{#IFNAME} = "eth0" +结果:net.if.in[eth0] +``` +## 七、预处理功能详解 +### 7.1 支持的预处理类型列表 +|类型| 用途| 示例| +|--------|------|-------| +|MULTIPLIER|数值乘法 |字节转比特(×8)| +|SIMPLE_CHANGE |简单变化 |当前值-上次值| +|CHANGE_PER_SECOND |速率计算 |流量速率| +|REGEX |正则提取 |从字符串提取数值| +|JSONPATH |JSON解析 |提取JSON字段| +|TRIM/LTRIM/RTRIM|字符串修剪|去除空白| +|JAVASCRIPT |JS脚本|复杂逻辑处理| +|HEX_TO_DECIMAL |进制转换 |0xFF -> 255| +|MAC_FORMAT |MAC格式化| 标准化MAC地址| +|IP_FORMAT |IP格式化| 提取IP地址| + +### 7.2 常用预处理案例 +#### 7.2.1 数值计算 +字节转比特: +```yaml +preprocessing: + - type: MULTIPLIER + parameters: ['8'] +``` +百分比转换: +```yaml +preprocessing: + - type: MULTIPLIER + parameters: ['0.01'] # 如果原值是 0-10000,转为 0-100 +``` +计算速率: +```yaml +preprocessing: + - type: CHANGE_PER_SECOND # 自动计算每秒变化 +``` +#### 7.2.2 字符串处理 +提取温度数值: +```yaml 
+preprocessing: + - type: REGEX + parameters: + - 'Temperature: ([\d.]+)°C' + - '\1' +``` +去除空白: +```yaml +preprocessing: + - type: TRIM # 去除前后空白 +``` +#### 7.2.3 格式转换 +MAC 地址格式化: +```yaml +preprocessing: + - type: MAC_FORMAT + parameters: [':'] # 分隔符(默认冒号) +# 输入:001122334455 或 00-11-22-33-44-55 +# 输出:00:11:22:33:44:55 +``` +十六进制转十进制: +```yaml +preprocessing: + - type: HEX_TO_DECIMAL +# 输入:FF 或 0xFF +# 输出:255 +``` +#### 7.2.4 JavaScript 脚本 + +简单计算: +```yaml +preprocessing: + - type: JAVASCRIPT + parameters: ['return value * 100 / 1024'] +``` +条件处理: +```yaml +preprocessing: + - type: JAVASCRIPT + parameters: + - | + if (value > 1000000) { + return value / 1000000; // 转为 MB + } + return value / 1000; // 转为 KB +``` +字符串处理: +```yaml +preprocessing: + - type: JAVASCRIPT + parameters: + - 'return value.toLowerCase().replace(/\s+/g, "_")' +``` +#### 7.2.5 JSONPath 提取 +提取 JSON 字段: +```yaml +preprocessing: + - type: JSONPATH + parameters: ['$.temperature.value'] +``` +提取数组元素: +```yaml +preprocessing: + - type: JSONPATH + parameters: ['$.interfaces[0].name'] +``` +## 八、实际配置示例 +### 8.1 最小化配置示例 +```toml +# /etc/categraf/conf/inputs.snmp_zabbix/snmp_zabbix.toml + +[[instances]] +# 最简配置:监控单个设备的系统信息 +agents = ["192.168.1.1"] +version = 2 +community = "public" +template_files = ["/etc/categraf/templates/basic_system.yaml"] +``` +对应的最简模板: +```yaml +# /etc/categraf/templates/basic_system.yaml +zabbix_export: + version: '6.0' + templates: + - template: Basic System + items: + - name: System uptime + type: SNMP_AGENT + snmp_oid: .1.3.6.1.2.1.1.3.0 + key: system.uptime + value_type: UNSIGNED + preprocessing: + - type: MULTIPLIER + parameters: ['0.01'] # TimeTicks to seconds +``` +### 8.2 完整配置示例 +```toml +# /etc/categraf/conf/inputs.snmp_zabbix/snmp_zabbix.toml + +[[instances]] +# 基础标签 +labels = { region = "beijing", env = "production" } + +# SNMP 连接配置 +agents = [ + "192.168.1.0/24", # 扫描整个网段 + "core-switch.example.com" # 域名 +] +version = 2 +community = "public" +port = 161 +timeout = "5s" 
+retries = 3 +max_repetitions = 25 + +# 加载多个模板(会自动合并) +template_files = [ + "/etc/categraf/templates/cisco_catalyst.yaml", + "/etc/categraf/templates/custom_oids.yaml" +] + +# 设备映射标签 +[instances.mappings] +"192.168.1.1" = { device_name = "core-sw-01", location = "DC1" } +"192.168.1.2" = { device_name = "core-sw-02", location = "DC2" } +``` + +### 8.3 常见场景配置 +#### 8.3.1 交换机端口监控 +```toml +[[instances]] +agents = ["192.168.1.1"] +version = 2 +community = "public" + +# 使用 Cisco 官方模板 +template_files = ["/etc/categraf/templates/net/cisco/cisco_snmp/template_net_cisco_snmp.yaml"] + +# 或内嵌简化模板 +[instances.template_file_contents] +switch_interfaces = ''' +zabbix_export: + version: '6.0' + templates: + - template: Switch Interfaces + discovery_rules: + - name: Interface discovery + type: SNMP_AGENT + key: net.if.discovery + delay: 1h + snmp_oid: discovery[{#IFNAME},.1.3.6.1.2.1.2.2.1.2,{#IFTYPE},.1.3.6.1.2.1.2.2.1.3,{#IFADMINSTATUS},.1.3.6.1.2.1.2.2.1.7] + filter: + evaltype: AND + conditions: + - macro: '{#IFTYPE}' + value: '6' # Ethernet + operator: EQUALS + - macro: '{#IFADMINSTATUS}' + value: '1' # UP + operator: EQUALS + item_prototypes: + - name: 'Interface {#IFNAME}: Incoming traffic' + type: SNMP_AGENT + snmp_oid: .1.3.6.1.2.1.2.2.1.10.{#SNMPINDEX} + key: net.if.in[{#IFNAME}] + value_type: UNSIGNED + units: bps + preprocessing: + - type: CHANGE_PER_SECOND + - type: MULTIPLIER + parameters: ['8'] + - name: 'Interface {#IFNAME}: Outgoing traffic' + type: SNMP_AGENT + snmp_oid: .1.3.6.1.2.1.2.2.1.16.{#SNMPINDEX} + key: net.if.out[{#IFNAME}] + value_type: UNSIGNED + units: bps + preprocessing: + - type: CHANGE_PER_SECOND + - type: MULTIPLIER + parameters: ['8'] + - name: 'Interface {#IFNAME}: Description' + type: SNMP_AGENT + snmp_oid: .1.3.6.1.2.1.31.1.1.1.18.{#SNMPINDEX} + key: net.if.alias[{#IFNAME}] + value_type: TEXT # 作为标签 +''' +``` +#### 8.3.2 路由器监控 +```toml +[[instances]] +agents = ["10.0.0.1"] +version = 3 +username = "snmpv3user" +security_level = "authPriv" 
+auth_protocol = "SHA" +auth_password = "authpass123" +priv_protocol = "AES" +priv_password = "privpass456" + +[instances.template_file_contents] +router_template = ''' +zabbix_export: + version: '6.0' + templates: + - template: Router Monitoring + items: + # CPU 使用率 + - name: CPU utilization + type: SNMP_AGENT + snmp_oid: .1.3.6.1.4.1.9.9.109.1.1.1.1.7.1 + key: system.cpu.util + value_type: FLOAT + units: '%' + # 内存使用 + - name: Memory used + type: SNMP_AGENT + snmp_oid: .1.3.6.1.4.1.9.9.48.1.1.1.5.1 + key: vm.memory.used + value_type: UNSIGNED + units: B + # 路由表大小 + - name: Routing table size + type: SNMP_AGENT + snmp_oid: .1.3.6.1.2.1.4.24.3.0 + key: net.routing.table.size + value_type: UNSIGNED + discovery_rules: + # BGP 邻居发现 + - name: BGP peer discovery + type: SNMP_AGENT + key: bgp.peer.discovery + snmp_oid: .1.3.6.1.2.1.15.3.1.2 + item_prototypes: + - name: 'BGP peer {#PEER}: State' + type: SNMP_AGENT + snmp_oid: .1.3.6.1.2.1.15.3.1.2.{#SNMPINDEX} + key: bgp.peer.state[{#PEER}] + value_type: UNSIGNED +''' +``` +#### 8.3.3 存储设备监控 +``` +[[instances]] +agents = ["storage.example.com"] +version = 2 +community = "public" + +[instances.template_file_contents] +storage_template = ''' +zabbix_export: + version: '6.0' + templates: + - template: Storage Device + discovery_rules: + - name: Storage discovery + type: SNMP_AGENT + key: storage.discovery + delay: 30m + snmp_oid: discovery[{#STORAGEDESCR},.1.3.6.1.2.1.25.2.3.1.3,{#STORAGETYPE},.1.3.6.1.2.1.25.2.3.1.2] + filter: + conditions: + - macro: '{#STORAGETYPE}' + value: '.1.3.6.1.2.1.25.2.1.4' # Fixed disk + operator: EQUALS + item_prototypes: + - name: '{#STORAGEDESCR}: Total space' + type: SNMP_AGENT + snmp_oid: .1.3.6.1.2.1.25.2.3.1.5.{#SNMPINDEX} + key: vfs.fs.size[{#STORAGEDESCR},total] + value_type: UNSIGNED + units: B + preprocessing: + - type: MULTIPLIER + parameters: ['4096'] # 块大小 + - name: '{#STORAGEDESCR}: Used space' + type: SNMP_AGENT + snmp_oid: .1.3.6.1.2.1.25.2.3.1.6.{#SNMPINDEX} + key: 
vfs.fs.size[{#STORAGEDESCR},used] + value_type: UNSIGNED + units: B + preprocessing: + - type: MULTIPLIER + parameters: ['4096'] + - name: '{#STORAGEDESCR}: Usage in %' + type: SNMP_AGENT + snmp_oid: .1.3.6.1.2.1.25.2.3.1.6.{#SNMPINDEX} + key: vfs.fs.pused[{#STORAGEDESCR}] + value_type: FLOAT + units: '%' + preprocessing: + - type: JAVASCRIPT + parameters: + - | + var used = value; + var total = 1000000; // 需要从其他地方获取 + return (used / total) * 100; +''' +``` +#### 8.3.4 打印机监控 +``` +[[instances]] +agents = ["printer.example.com"] +version = 1 # 很多打印机只支持 v1 +community = "public" + +[instances.template_file_contents] +printer_template = ''' +zabbix_export: + version: '6.0' + templates: + - template: Printer Monitoring + items: + - name: Printer status + type: SNMP_AGENT + snmp_oid: .1.3.6.1.2.1.25.3.5.1.1.1 + key: printer.status + value_type: UNSIGNED + - name: Printer error state + type: SNMP_AGENT + snmp_oid: .1.3.6.1.2.1.25.3.5.1.2.1 + key: printer.error + value_type: TEXT + - name: Toner level black + type: SNMP_AGENT + snmp_oid: .1.3.6.1.2.1.43.11.1.1.9.1.1 + key: printer.toner.black + value_type: UNSIGNED + units: '%' + - name: Pages printed + type: SNMP_AGENT + snmp_oid: .1.3.6.1.2.1.43.10.2.1.4.1.1 + key: printer.pages.total + value_type: UNSIGNED +''' +``` +## 九、故障排查 +### 9.1 常见问题及解决方案 +#### 9.1.1 连接问题 +问题:无法连接到 SNMP 设备 + +检查步骤: +```bash +# 1. 测试网络连通性 +ping 192.168.1.1 + +# 2. 测试 SNMP 端口 +nc -zvu 192.168.1.1 161 + +# 3. 使用 snmpwalk 测试 +snmpwalk -v2c -c public 192.168.1.1 system + +# 4. 
检查防火墙 +sudo iptables -L -n | grep 161 +``` +常见原因: +- 防火墙阻止 UDP 161 端口 +- SNMP 服务未启动 +- Community 字符串错误 +- ACL 限制访问 +#### 9.1.2 认证问题 +SNMPv3 认证失败 + +检查配置: +```toml +# 确保所有参数匹配 +username = "snmpuser" +security_level = "authPriv" +auth_protocol = "SHA" # 大小写敏感 +auth_password = "password" # 至少8个字符 +priv_protocol = "AES" +priv_password = "password" # 至少8个字符 +``` +测试命令: +```bash +snmpget -v3 -l authPriv -u snmpuser -a SHA -A authpass123 -x AES -X privpass456 192.168.1.1 sysDescr.0 +``` +#### 9.1.3 OID 不存在 +错误:OID not found on device + +排查方法: +```bash +# 1. 列出设备支持的所有 OID +snmpwalk -v2c -c public 192.168.1.1 .1 + +# 2. 检查特定 OID +snmpget -v2c -c public 192.168.1.1 .1.3.6.1.2.1.2.2.1.10.1 + +# 3. 查看 MIB 支持 +snmpwalk -v2c -c public 192.168.1.1 sysORTable +``` +解决方案: +- 确认设备支持该 MIB +- 使用正确的 OID +- 某些设备需要启用特定 MIB +#### 9.1.4 发现失败 +发现规则未返回任何项目 + +调试步骤: + +手动执行 walk: +```bash +snmpwalk -v2c -c public 192.168.1.1 .1.3.6.1.2.1.2.2.1.2 +``` +检查过滤器: +```yaml +filter: + conditions: + - macro: '{#IFNAME}' + value: 'eth' # 可能过滤太严格 + operator: LIKE +``` +查看日志: +```bash +tail -f /var/log/categraf/categraf.log | grep -i discovery +``` +#### 9.1.5 预处理错误 +预处理失败的常见原因: + +正则表达式错误: +```yaml +# 错误:未转义特殊字符 +- type: REGEX + parameters: ['Temp: (\d+).(\d+)', '\1.\2'] + +# 正确: +- type: REGEX + parameters: ['Temp: (\d+)\.(\d+)', '\1.\2'] +``` +JavaScript 语法错误: +```yaml +# 错误:缺少 return +- type: JAVASCRIPT + parameters: ['value * 100'] + +# 正确: +- type: JAVASCRIPT + parameters: ['return value * 100'] +``` +类型不匹配: +```yaml +# 错误:对字符串使用数值运算 +- type: MULTIPLIER + parameters: ['8'] + +# 正确:先转换类型 +- type: REGEX + parameters: ['(\d+)', '\1'] +- type: MULTIPLIER + parameters: ['8'] +``` +### 9.2 调试模式使用 +启用调试模式: +``` +# 启动时添加 debug 参数 +./categraf --debug --inputs snmp_zabbix + +# 查看详细日志 +tail -f /var/log/categraf/categraf.log +``` +### 9.3 日志分析 +关键日志标识: +- E! - 错误 +- W! - 警告 +- I! - 信息 +- D! - 调试 +常见日志分析: +```bash +# 查看错误 +grep "E!" 
/var/log/categraf/categraf.log + +# 查看发现相关 +grep -i discovery /var/log/categraf/categraf.log + +# 查看特定设备 +grep "192.168.1.1" /var/log/categraf/categraf.log + +# 查看预处理错误 +grep -i preprocessing /var/log/categraf/categraf.log +``` +### 9.4 性能问题排查 +采集延迟或超时 + +优化建议: + +调整超时和重试: +```toml +timeout = "10s" # 增加超时 +retries = 2 # 减少重试 +max_repetitions = 10 # 减少批量大小 +``` +减少并发请求: +```toml +# 分散不同设备的采集时间 +[[instances]] +agents = ["192.168.1.1"] + +[[instances]] +agents = ["192.168.1.2"] +``` +优化发现规则: +```yaml +discovery_rules: + - delay: 6h # 减少发现频率 + filter: + conditions: # 严格过滤,减少生成的监控项 + - macro: '{#IFTYPE}' + value: '6' + operator: EQUALS +``` + +### 9.5 标签relabel +跟snmp插件相比,默认的设备标签从agent_host变成了snmp_agent。如果你想把标签key从snmp_agent改回agent_host,可以添加如下配置: +``` +[[instances.relabel_configs]] +source_labels = ["snmp_agent"] +target_label = "agent_host" +replacement = '$1' +action = "replace" + +[[instances.relabel_configs]] +regex = "snmp_agent" +action = "labeldrop" +``` + +## 十、限制和注意事项 +### 10.1 功能限制 +#### 10.1.1 只支持 SNMP_AGENT 类型 +插件只处理以下类型的监控项: + - SNMP_AGENT + - SNMPV1_AGENT + - SNMPV3_AGENT +不支持的类型(会被忽略): +- ZABBIX_AGENT +- HTTP_AGENT +- CALCULATED +- DEPENDENT +- TRAP +#### 10.1.2 不支持的 Zabbix 功能 +|功能| 支持情况| 说明| +|--------|------|-------| +|Items| ✅ 部分支持| 仅 SNMP 类型| +|Discovery| ✅ 支持 |完整支持| +|Triggers| ❌ 不支持| 插件不处理告警| +|Graphs| ❌ 不支持| 忽略图表定义| +|Dashboards| ❌ 不支持 |忽略仪表板| +|Actions| ❌ 不支持| 不执行动作| +|Trends |❌ 不支持| 不存储趋势数据| +|Events |❌ 不支持 |不生成事件| + +#### 10.1.3 模板版本兼容性 +- 完全支持:Zabbix 6.0+ YAML 格式 +- 不支持:Zabbix 5.x 及以下的 XML 格式 +- 部分支持:可能无法识别最新版本的新特性 + +### 10.2 性能考虑 +建议限制: +- 单个实例最多监控 100 个设备 +- 每个设备最多 1000 个监控项 +- 发现规则生成的项目不超过 10000 个 +- 批量请求大小固定为 60 个 OID +资源消耗: +- 每个设备一个 SNMP 连接 +- 每个监控项占用约 1KB 内存 +- CPU 使用主要在预处理阶段 +### 10.3 安全建议 +使用 SNMPv3: +```toml +version = 3 +security_level = "authPriv" +``` +限制 community 权限: +- 使用只读 community +- 配置设备 ACL + +网络隔离: +- SNMP 流量不应跨越不信任网络 +- 使用 VLAN 隔离管理网络 + +定期更新: +- 及时更新 Categraf +- 更新设备固件 + +## 十一、迁移指南 +### 11.1 从原生 SNMP 插件迁移 +步骤 
1:导出现有配置 +原生 snmp插件 配置示例: +``` +[[instances]] +agents = ["192.168.1.1"] +version = 2 +community = "public" + +[[instances.field]] +oid = ".1.3.6.1.2.1.1.3.0" +name = "uptime" + +[[instances.field]] +oid = ".1.3.6.1.2.1.2.2.1.10.1" +name = "interface.eth0.in" +``` +步骤 2: 转换为模板格式 +创建模板文件 migration_template.yaml: +```yaml +zabbix_export: + version: '6.0' + templates: + - template: Migrated from SNMP + items: + - name: System Uptime + type: SNMP_AGENT + snmp_oid: .1.3.6.1.2.1.1.3.0 + key: uptime + value_type: UNSIGNED + - name: Interface eth0 In + type: SNMP_AGENT + snmp_oid: .1.3.6.1.2.1.2.2.1.10.1 + key: interface.eth0.in + value_type: UNSIGNED +``` +步骤 3: 添加snmp_zabbix插件配置文件 +```toml +# 新的 snmp_zabbix 配置 +[[instances]] +agents = ["192.168.1.1"] +version = 2 +community = "public" +template_files = ["migration_template.yaml"] +``` + +### 11.2 从 Zabbix 迁移 +步骤 1:导出 Zabbix 配置 +推荐使用 Web 界面(见 2.2.1) + +步骤 2:分析和筛选模板 +```bash +# 查找包含 SNMP 项的模板 +grep -l "type: SNMP" templates/*.yaml + +# 统计每个模板的 SNMP 项数量 +for f in templates/*.yaml; do + count=$(grep -c "type: SNMP" "$f" 2>/dev/null || true) + if [ "${count:-0}" -gt 0 ]; then + echo "$f: $count SNMP items" + fi +done +``` +步骤 3:配置映射表 +如果 Zabbix 中使用了主机变量,创建映射: +```toml +[instances.mappings] +"192.168.1.1" = { + device_name = "core-sw-01", + location = "DC1", + contact = "admin@example.com" +} + +``` +步骤 4:验证迁移 +```bash +# 测试配置 +categraf --test --inputs snmp_zabbix + +# 与zabbix对比指标 +``` + +## 十二、附录 +A. 
配置参数速查表 + +|参数 |类型 |默认值 |说明| +|--------|------|-------|-------| +|agents |[]string |必填 |目标设备列表| +|version |int |2 |SNMP 版本(1,2,3)| +|community |string |public |团体字符串| +|username |string |- |SNMPv3 用户名| +|security_level| string |noAuthNoPriv |安全级别| +|auth_protocol| string |MD5 |认证协议| +|auth_password| string |- |认证密码| +|priv_protocol| string |DES |加密协议| +|priv_password| string |- |加密密码| +|port| int |161 |SNMP 端口| +|timeout |duration |5s |超时时间| +|retries |int |3 |重试次数| +|max_repetitions |int |10 |BULK单次请求返回的数据| +|template_files| []string |- |模板文件路径| + +B. 预处理类型对照表 + +|Zabbix 类型|插件支持| 说明| +|--------|------|-------| +|MULTIPLIER|✅| 乘数| +|SIMPLE_CHANGE|✅| 简单变化| +|CHANGE_PER_SECOND|✅| 每秒变化率| +|REGEX|✅| 正则表达式| +|JSONPATH|✅| JSON 路径| +|SNMP_WALK_TO_JSON|✅| Walk 转 JSON| +|HEX_TO_DECIMAL|✅| 十六进制转十进制| +|JAVASCRIPT|✅| JavaScript| +|TRIM |✅| 去除空白| +|MAC_FORMAT|✅ |MAC 格式化| +|IP_FORMAT|✅ |IP 格式化| + +C. 常用 OID 列表 +系统信息: +``` +.1.3.6.1.2.1.1.1.0 - sysDescr +.1.3.6.1.2.1.1.3.0 - sysUpTime +.1.3.6.1.2.1.1.5.0 - sysName +.1.3.6.1.2.1.1.6.0 - sysLocation +.1.3.6.1.2.1.1.7.0 - sysServices +``` +网络接口: +``` +.1.3.6.1.2.1.2.2.1.2 - ifDescr +.1.3.6.1.2.1.2.2.1.3 - ifType +.1.3.6.1.2.1.2.2.1.5 - ifSpeed +.1.3.6.1.2.1.2.2.1.7 - ifAdminStatus +.1.3.6.1.2.1.2.2.1.8 - ifOperStatus +.1.3.6.1.2.1.2.2.1.10 - ifInOctets +.1.3.6.1.2.1.2.2.1.16 - ifOutOctets +``` +CPU/内存(企业 MIB): + +``` +# Cisco +.1.3.6.1.4.1.9.9.109.1.1.1.1.7 - CPU 使用率 +.1.3.6.1.4.1.9.9.48.1.1.1.5 - 内存已用 +.1.3.6.1.4.1.9.9.48.1.1.1.6 - 内存空闲 + +# HP +.1.3.6.1.4.1.11.2.14.11.5.1.9.6.1 - CPU 使用率 +``` +D. 正则表达式示例 +``` +# 提取数字 +- type: REGEX + parameters: ['(\d+)', '\1'] + +# 提取温度值 +- type: REGEX + parameters: ['Temperature:\s*(\d+\.?\d*)', '\1'] + +# 提取接口名称 +- type: REGEX + parameters: ['([\w-]+)\s*:\s*(.+)', '\1'] + +# 提取 IP 地址 +- type: REGEX + parameters: ['(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})', '\1'] + +# 提取 MAC 地址 +- type: REGEX + parameters: ['([0-9A-Fa-f]{2}[:-]){5}([0-9A-Fa-f]{2})', '\0'] +``` + +E. 
JavaScript 脚本模板 +``` +// 基本计算 +return value * 100; + +// 条件判断 +if (value > 1000) { + return value / 1000; +} else { + return value; +} + +// 字符串处理 +return value.toUpperCase(); +return value.replace(/\s+/g, '_'); + +// JSON 处理 +var obj = JSON.parse(value); +return obj.temperature; + +// 数组处理 +var parts = value.split(','); +return parts[0]; + +// 复杂逻辑 +function convertBytes(bytes) { + if (bytes >= 1099511627776) { + return (bytes / 1099511627776).toFixed(2) + " TB"; + } else if (bytes >= 1073741824) { + return (bytes / 1073741824).toFixed(2) + " GB"; + } else if (bytes >= 1048576) { + return (bytes / 1048576).toFixed(2) + " MB"; + } else if (bytes >= 1024) { + return (bytes / 1024).toFixed(2) + " KB"; + } else { + return bytes + " B"; + } +} +return convertBytes(value); +``` + +F. 术语表 + +|术语| 说明| +|--|--| +|OID |Object Identifier,对象标识符| +|MIB |Management Information Base,管理信息库| +|PDU |Protocol Data Unit,协议数据单元| +|LLD |Low-Level Discovery,低级别发现| +|SNMP Walk |遍历 SNMP 子树的操作| +|Community |SNMPv1/v2c 的认证字符串| +|Bulk Request |SNMPv2c/v3 的批量请求| +|Trap |SNMP 主动推送的告警| +|Counter |累加计数器,会溢出| +|Gauge| 测量值,可增可减| +|TimeTicks| 时间计数器,单位 1/100 秒| +|Item| 监控项,定义要采集的指标| +|Item Prototype| 项目原型,发现后生成监控项的模板| +|Macro |宏,用于动态替换的变量| +|Preprocessing |预处理,数据采集后的转换步骤| diff --git a/inputs/snmp_zabbix/dashboard.json b/inputs/snmp_zabbix/dashboard.json new file mode 100644 index 000000000..9fe94f277 --- /dev/null +++ b/inputs/snmp_zabbix/dashboard.json @@ -0,0 +1,70 @@ +{ + "title": "Zabbix SNMP Template General", + "uid": "3d0440af", + "tags": [ + "zabbix snmp template general" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Zabbix SNMP - Inbound Traffic", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "rate(net_if_in[5m]) * 8", + "legendFormat": "{{ifName}}", + "refId": "A" + } + ] + }, + { + "title": "Zabbix SNMP - Outbound Traffic", + "type": "timeseries", + "gridPos": { + "x": 12, + 
"y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "rate(net_if_out[5m]) * 8", + "legendFormat": "{{ifName}}", + "refId": "A" + } + ] + }, + { + "title": "Zabbix SNMP - CPU Utilization", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "system_cpu_util", + "legendFormat": "CPU {{cpu}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/sockstat/README.md b/inputs/sockstat/README.md index b0fe259bc..72e6c27a0 100644 --- a/inputs/sockstat/README.md +++ b/inputs/sockstat/README.md @@ -1,27 +1,42 @@ -# Read sockstat info from /proc/net/sockstat and /proc/net/sockstat6 - -## example file -```shell -sockets: used 211 -TCP: inuse 9 orphan 0 tw 19 alloc 47 mem 22 -UDP: inuse 2 mem 0 -UDPLITE: inuse 0 -RAW: inuse 0 -FRAG: inuse 0 memory 0 +# Sockstat Input Plugin + +This plugin collects global socket usage statistics and memory allocation information of the operating system by reading the `/proc/net/sockstat` and `/proc/net/sockstat6` files on Linux systems. + +**Supported Platforms:** Linux + +*Note: On non-Linux platforms (like Windows, macOS) where `/proc/net/sockstat` does not exist, this plugin will not collect meaningful data.* + +## Configuration + +Generally, no special configuration is needed; just enable the plugin. + +```toml +# Collect Linux sockstat metrics +# interval = 15 + +[[instances]] +# No specific configuration parameters required ``` -The content of "/proc/net/sockstat" in a Linux system provides information about the socket usage on the system. The fields and their meaning are as follows: - -sockets: used: Total number of used sockets on the system. -TCP: inuse: Number of currently established TCP sockets. -orphan: Number of orphaned TCP sockets. -tw: Number of sockets in TIME_WAIT state. -alloc: Number of sockets allocated. 
-mem: Memory used by TCP sockets. -UDP: inuse: Number of currently established UDP sockets. -mem: Memory used by UDP sockets. -UDPLITE: inuse: Number of currently established UDP-Lite sockets. -RAW: inuse: Number of currently established raw sockets. -FRAG: inuse: Number of currently established fragment sockets. -memory: Memory used by fragment sockets. -These fields provide a snapshot of the socket usage on the system, including the number of sockets in use and memory usage, which can be useful for monitoring and troubleshooting network issues. \ No newline at end of file +## Metrics + +All metrics are prefixed with `sockstat_`. Common core metrics include: + +- `sockstat_sockets_used`: Total number of used sockets on the system +- `sockstat_tcp_inuse`: Number of currently established TCP sockets +- `sockstat_tcp_orphan`: Number of orphaned TCP sockets +- `sockstat_tcp_tw`: Number of TCP sockets in `TIME_WAIT` state +- `sockstat_tcp_alloc`: Number of TCP sockets allocated +- `sockstat_tcp_mem`: Memory used by the TCP stack (in Pages) +- `sockstat_udp_inuse`: Number of currently established UDP sockets +- `sockstat_udp_mem`: Memory used by the UDP stack (in Pages) +- `sockstat_raw_inuse`: Number of currently established RAW sockets +- `sockstat_frag_inuse`: Number of currently established IP fragment sockets +- `sockstat_frag_memory`: Memory used by fragment sockets + +These fields provide a snapshot of the socket usage on the system. This is extremely useful for monitoring the network connection pressure on the OS and troubleshooting network issues caused by excessive `TIME_WAIT` or orphan connections under high concurrency. + +## Dashboards + +These metrics are part of basic host monitoring and are typically integrated into global **System** or **Network** dashboards. +A dedicated basic Dashboard focusing exclusively on the sockstat socket state distribution and memory usage is also provided in this directory. 
\ No newline at end of file diff --git a/inputs/sockstat/README_CN.md b/inputs/sockstat/README_CN.md new file mode 100644 index 000000000..95bf817e1 --- /dev/null +++ b/inputs/sockstat/README_CN.md @@ -0,0 +1,42 @@ +# Sockstat 采集插件 + +该插件通过读取 Linux 系统的 `/proc/net/sockstat` 和 `/proc/net/sockstat6` 文件,采集操作系统的全局 Socket 使用情况和内存分配信息。 + +**支持平台:** Linux + +*注意:非 Linux 平台(如 Windows, macOS)由于不存在 `/proc/net/sockstat`,此插件将不会采集到有效数据。* + +## 配置说明 + +通常无需任何特殊配置,直接启用该插件即可。 + +```toml +# 采集 Linux sockstat 状态 +# interval = 15 + +[[instances]] +# 无需任何特定配置参数 +``` + +## 采集指标 + +所有指标默认会附带 `sockstat_` 作为前缀。常见的核心指标包括: + +- `sockstat_sockets_used`: 系统中当前正在被使用的 Socket 总数 +- `sockstat_tcp_inuse`: 当前建立的 TCP Socket 数量 +- `sockstat_tcp_orphan`: 当前处于孤儿状态 (Orphan) 的 TCP Socket 数量 +- `sockstat_tcp_tw`: 当前处于 `TIME_WAIT` 状态的 TCP Socket 数量 +- `sockstat_tcp_alloc`: 当前已分配的 TCP Socket 数量 +- `sockstat_tcp_mem`: TCP 协议栈所消耗的内存量 (单位为 Page 页数) +- `sockstat_udp_inuse`: 当前建立的 UDP Socket 数量 +- `sockstat_udp_mem`: UDP 协议栈所消耗的内存量 (单位为 Page 页数) +- `sockstat_raw_inuse`: 当前建立的 RAW Socket 数量 +- `sockstat_frag_inuse`: 当前正在处理的 IP 分片 (Fragment) 数量 +- `sockstat_frag_memory`: IP 分片重组所消耗的内存量 + +这些字段提供了系统上套接字使用情况的一个快照,对于监控操作系统的网络连接压力、排查高并发下的 `TIME_WAIT` 或孤儿连接过多导致的网络问题非常有用。 + +## 监控大盘 + +这些指标是主机基础监控的一部分,通常会被整合在 **System (主机系统)** 或 **Network (网络)** 全局大盘中。 +本目录下也为您提供了一个仅针对 sockstat Socket 状态分布与内存占用的专属基础 Dashboard。 \ No newline at end of file diff --git a/inputs/sockstat/dashboard.json b/inputs/sockstat/dashboard.json new file mode 100644 index 000000000..94b15f636 --- /dev/null +++ b/inputs/sockstat/dashboard.json @@ -0,0 +1,108 @@ +{ + "title": "Sockstat Network Connections", + "uid": "4dc40a41", + "tags": [ + "sockstat network connections" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Socket Connection States", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "sockstat_tcp_inuse", + "legendFormat": "TCP Inuse", + 
"refId": "A" + }, + { + "expr": "sockstat_tcp_tw", + "legendFormat": "TCP TIME_WAIT", + "refId": "B" + }, + { + "expr": "sockstat_tcp_orphan", + "legendFormat": "TCP Orphan", + "refId": "C" + }, + { + "expr": "sockstat_udp_inuse", + "legendFormat": "UDP Inuse", + "refId": "D" + } + ] + }, + { + "title": "Socket Memory (Pages)", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "sockstat_tcp_mem", + "legendFormat": "TCP Mem", + "refId": "A" + }, + { + "expr": "sockstat_udp_mem", + "legendFormat": "UDP Mem", + "refId": "B" + } + ] + }, + { + "title": "Total Used Sockets", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "sockstat_sockets_used", + "legendFormat": "Total Sockets Used", + "refId": "A" + } + ] + }, + { + "title": "Allocated TCP Sockets", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "sockstat_tcp_alloc", + "legendFormat": "TCP Allocated", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/sqlserver/README.md b/inputs/sqlserver/README.md index 8ce8ad002..5ac682cbf 100644 --- a/inputs/sqlserver/README.md +++ b/inputs/sqlserver/README.md @@ -1,6 +1,6 @@ -# SQL Server +# SQL Server -forked from [telegraf/sqlserver](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/sqlserver). 这个插件的作用是获取sqlserver的监控指标,这里去掉了Azure相关部分监控,只保留了本地部署sqlserver情况。 +forked from telegraf/sqlserver. 
这个插件的作用是获取sqlserver的监控指标,这里去掉了Azure相关部分监控,只保留了本地部署sqlserver情况。 # 按照下面方法创建监控账号,用于读取监控数据 USE master; diff --git a/inputs/supervisor/README.md b/inputs/supervisor/README.md index 9b156fb8c..e2415523e 100644 --- a/inputs/supervisor/README.md +++ b/inputs/supervisor/README.md @@ -1,12 +1,18 @@ -# Supervisor +# Supervisor Input Plugin -此插件通过使用XML-RPC API收集在supervisor下运行的进程信息。 +This plugin gathers information about processes that are +running under supervisor using XML-RPC API. -supervisor的最低测试版本为3.3.2。 +Minimum tested version of supervisor: 3.3.2 -## Supervisor 配置 +## Supervisor configuration -这个插件需要在supervisor中启用HTTP服务器,同时建议在HTTP服务器上启用基本身份验证。使用基本认证时,请确保在插件的url设置中包含用户名和密码。下面是一个`inet_http_server`部分的supervisor配置示例,该配置可以与默认插件配置一起工作: +This plugin needs an HTTP server to be enabled in supervisor, +also it's recommended to enable basic authentication on the +HTTP server. When using basic authentication make sure to +include the username and password in the plugin's url setting. +Here is an example of the `inet_http_server` section in supervisor's +config that will work with default plugin configuration: ```ini [inet_http_server] @@ -15,13 +21,15 @@ username = user password = pass ``` -## 全局配置选项 +## Global configuration options -除了特定于插件的配置设置外,插件还支持额外的全局和插件配置设置。这些设置用于修改指标、标签和字段或创建别名和配置排序等。 +In addition to the plugin-specific configuration settings, plugins support +additional global and plugin configuration settings. These settings are used to +modify metrics, tags, and fields or create aliases and configure ordering. 
-## 配置 +## Configuration -```toml +```toml # Gathers information about processes that running under supervisor using XML-RPC API [[instances]] ## Url of supervisor's XML-RPC endpoint if basic auth enabled in supervisor http server, @@ -34,94 +42,103 @@ password = pass # metrics_exclude = ["pid", "rc"] ``` -注意,`url = "http://login:pass@localhost:9001/RPC2"`中的`login:pass`是用户名和密码。相关信息可以参见您的supervisor配置文件。 - -### 可选指标 +### Optional Metrics -通过在配置文件中设置`metrics_include`和`metrics_exclude`参数,用于控制哪些指标(metrics)应该被包括(`include`)或排除(`exclude`)在监控数据中。这两个配置选项为用户提供了细粒度控制,以便根据特定需要定制收集的数据。这在处理大量指标或只关心某些特定指标的情况下尤其有用。 +By setting the `metrics_include` and `metrics_exclude` parameters in the configuration file, you can control which metrics should be included or excluded in the monitoring data. These two configuration options provide users with fine-grained control, allowing for customized data collection based on specific needs. This is especially useful when dealing with a large number of metrics or only being interested in certain specific metrics. #### metrics_include -- `metrics_include` 选项允许你指定一个指标名称列表,仅这些指标会被收集和发送。如果设置了这个选项,那么只有列表中的指标会被包含,其他所有指标都会被忽略。 -- 这个选项通常用于限制数据的收集范围,以减少网络流量、存储需求或者仅仅关注一小部分重要指标。 -- 格式通常是一个指标名称的数组,例如:`metrics_include = ["cpu_usage_idle", "cpu_usage_user"]`。 +- The `metrics_include` option allows you to specify a list of metric names, only these metrics will be collected and sent. If this option is set, then only the metrics listed will be included, all other metrics will be ignored. +- This option is typically used to limit the scope of data collection, reducing network traffic, storage requirements, or simply focusing on a small set of important metrics. +- The format is usually an array of metric names, for example: `metrics_include = ["cpu_usage_idle", "cpu_usage_user"]`. 
#### metrics_exclude -- 相反,`metrics_exclude` 选项允许你指定一个指标名称列表,这些指标将不会被收集和发送。如果设置了这个选项,那么列表中的指标会被排除,其他所有指标都会被包含。 -- 这个选项用于从收集的数据中排除不感兴趣或不相关的指标,有助于减少处理和存储无用数据的负担。 -- 格式同样是一个指标名称的数组,例如:`metrics_exclude = ["memory_free", "memory_cached"]`。 +- Conversely, the `metrics_exclude` option allows you to specify a list of metric names, these metrics will not be collected and sent. If this option is set, then the metrics listed will be excluded, all other metrics will be included. +- This option is used to exclude uninteresting or irrelevant metrics from the collected data, helping to reduce the burden of processing and storing useless data. +- The format is also an array of metric names, for example: `metrics_exclude = ["memory_free", "memory_cached"]`. -#### 使用注意事项 +#### Usage Notes -- 如果同时使用`metrics_include`和`metrics_exclude`,首先应用`metrics_include`过滤规则,然后应用`metrics_exclude`。这意味着如果一个指标在`metrics_include`中被明确包含,在`metrics_exclude`中也被明确排除,那么这个指标最终将被排除。 -- 这两个配置选项的工作原理和具体可用值可能依赖于具体的插件。有的插件可能允许根据指标的某些属性或标签来进行包含或排除。 -- 正确使用这两个配置选项可以显著改善Telegraf的性能和效率,特别是在资源受限的环境中或当监控系统规模较大时。 +- If `metrics_include` and `metrics_exclude` are used simultaneously, the `metrics_include` filter rules are applied first, followed by `metrics_exclude`. This means that if a metric is explicitly included in `metrics_include` and also explicitly excluded in `metrics_exclude`, then the metric will ultimately be excluded. +- The workings and specific available values of these two configuration options may depend on the specific plugin. Some plugins may allow inclusion or exclusion based on certain properties or tags of the metrics. +- Properly using these two configuration options can significantly improve the performance and efficiency of Telegraf, especially in resource-constrained environments or when the monitoring system is large in scale. 
-#### 示例 +#### Example -假设你使用Categraf监控系统性能,并使用`cpu`插件收集CPU使用情况的指标。如果你只对CPU的闲置时间和用户时间感兴趣,可以使用以下配置: +Suppose you are using Categraf to monitor system performance and are using the `cpu` plugin to collect CPU usage metrics. If you are only interested in the CPU's idle time and user time, you could use the following configuration: ```toml [[instances]] - ## 仅收集CPU的闲置时间和用户使用时间的指标 + ## Only collect metrics of CPU's idle time and user time metrics_include = ["cpu_usage_idle", "cpu_usage_user"] ``` -或者,如果你想收集所有CPU相关指标,但排除闲置时间和用户时间,可以使用: +Alternatively, if you want to collect all CPU-related metrics but exclude idle time and user time, you could use: ```toml [[instances]] - ## 排除CPU的闲置时间和用户使用时间的指标 + ## Exclude metrics of CPU's idle time and user time metrics_exclude = ["cpu_usage_idle", "cpu_usage_user"] ``` -通过精细控制指标的收集,你可以优化监控设置,确保只处理对你最重要的信息。 +By finely controlling the collection of metrics, you can optimize your monitoring setup to ensure only the most important information is processed. -### 服务器标签 +### Server tag -服务器标签用于标识指标源服务器。你可以选择默认使用supervisor的http端点的`host:port`,或者你可以使用在supervisor配置文件中设置的supervisor的标识字符串。 +Server tag is used to identify metrics source server. You have an option +to use host:port pair of supervisor's http endpoint by default or you +can use supervisor's identification string, which is set in supervisor's +configuration file. 
-## 指标 +## Metrics - supervisor_processes - - tags: - - source(supervisor实例的主机名或IP地址) - - port(supervisor的HTTP服务器端口号) - - id(supervisor的标识字符串) - - name(进程名) - - group(进程组) - - fields: - - state(int,参见参考表) - - uptime(int,秒) - - pid(int,可选) - - exitCode(int,可选) + - Tags: + - source (Hostname or IP address of supervisor's instance) + - port (Port number of supervisor's HTTP server) + - id (Supervisor's identification string) + - name (Process name) + - group (Process group) + - Fields: + - state (int, see reference) + - uptime (int, seconds) + - pid (int, optional) + - exitCode (int, optional) - supervisor_instance - - tags: - - source(supervisor实例的主机名或IP地址) - - port(supervisor的HTTP服务器端口号) - - id(supervisor的标识字符串) - - fields: - - state(int,参见参考表) - -### Supervisor进程状态字段参考表 - -| 状态码 | 状态名 | 描述 | -|------|----------|---------------------------------------| -| 0 | STOPPED | 进程因停止请求停止了,或者从未启动。 | -| 10 | STARTING | 进程因启动请求正在启动。 | -| 20 | RUNNING | 进程正在运行。 | -| 30 | BACKOFF | 进程进入STARTING状态但随后过快退出,未能移动到RUNNING状态。 | -| 40 | STOPPING | 进程因停止请求正在停止。 | -| 100 | EXITED | 进程已从RUNNING状态退出(预期地或意外地)。 | -| 200 | FATAL | 无法成功启动进程。 | -| 1000 | UNKNOWN | 进程处于未知状态(supervisord编程错误)。 | - -### Supervisor实例状态字段参考 - -| 状态码 | 状态名 | 描述 | -|-----|---------|--------------------| -| 2 | FATAL | Supervisor遇到了严重错误。 | -| 1 | RUNNING | Supervisor正在正常工作。 | -| 0 | | | \ No newline at end of file + - Tags: + - source (Hostname or IP address of supervisor's instance) + - port (Port number of supervisor's HTTP server) + - id (Supervisor's identification string) + - Fields: + - state (int, see reference) + +### Supervisor process state field reference table + +| Statecode | Statename | Description | +|-----------|-----------|----------------------------------------------------------------------------------------------------------| +| 0 | STOPPED | The process has been stopped due to a stop request or has never been started. | +| 10 | STARTING | The process is starting due to a start request. 
| +| 20 | RUNNING | The process is running. | +| 30 | BACKOFF | The process entered the STARTING state but subsequently exited too quickly to move to the RUNNING state. | +| 40 | STOPPING | The process is stopping due to a stop request. | +| 100 | EXITED | The process exited from the RUNNING state (expectedly or unexpectedly). | +| 200 | FATAL | The process could not be started successfully. | +| 1000 | UNKNOWN | The process is in an unknown state (supervisord programming error). | + +### Supervisor instance state field reference + +| Statecode | Statename | Description | +|-----------|------------|------------------------------------------------| +| 2 | FATAL | Supervisor has experienced a serious error. | +| 1 | RUNNING | Supervisor is working normally. | +| 0 | RESTARTING | Supervisor is in the process of restarting. | +| -1 | SHUTDOWN | Supervisor is in the process of shutting down. | + +## Example Output + +```text +supervisor_processes,group=ExampleGroup,id=supervisor,port=9001,process=ExampleProcess,source=localhost state=20i,uptime=75958i 1659786637000000000 +supervisor_instance,id=supervisor,port=9001,source=localhost state=1i 1659786637000000000 +``` diff --git a/inputs/supervisor/README_CN.md b/inputs/supervisor/README_CN.md new file mode 100644 index 000000000..9b156fb8c --- /dev/null +++ b/inputs/supervisor/README_CN.md @@ -0,0 +1,127 @@ +# Supervisor + +此插件通过使用XML-RPC API收集在supervisor下运行的进程信息。 + +supervisor的最低测试版本为3.3.2。 + +## Supervisor 配置 + +这个插件需要在supervisor中启用HTTP服务器,同时建议在HTTP服务器上启用基本身份验证。使用基本认证时,请确保在插件的url设置中包含用户名和密码。下面是一个`inet_http_server`部分的supervisor配置示例,该配置可以与默认插件配置一起工作: + +```ini +[inet_http_server] +port = 127.0.0.1:9001 +username = user +password = pass +``` + +## 全局配置选项 + +除了特定于插件的配置设置外,插件还支持额外的全局和插件配置设置。这些设置用于修改指标、标签和字段或创建别名和配置排序等。 + +## 配置 + +```toml +# Gathers information about processes that running under supervisor using XML-RPC API +[[instances]] + ## Url of supervisor's XML-RPC endpoint if basic auth enabled in supervisor http 
server, + ## then you have to add credentials to url (ex. http://login:pass@localhost:9001/RPC2) + # url = "http://login:pass@localhost:9001/RPC2" + ## With settings below you can manage gathering additional information about processes + ## If both of them empty, then all additional information will be collected. + ## Currently supported additional metrics are: pid, rc + # metrics_include = [] + # metrics_exclude = ["pid", "rc"] +``` + +注意,`url = "http://login:pass@localhost:9001/RPC2"`中的`login:pass`是用户名和密码。相关信息可以参见您的supervisor配置文件。 + +### 可选指标 + +通过在配置文件中设置`metrics_include`和`metrics_exclude`参数,用于控制哪些指标(metrics)应该被包括(`include`)或排除(`exclude`)在监控数据中。这两个配置选项为用户提供了细粒度控制,以便根据特定需要定制收集的数据。这在处理大量指标或只关心某些特定指标的情况下尤其有用。 + +#### metrics_include + +- `metrics_include` 选项允许你指定一个指标名称列表,仅这些指标会被收集和发送。如果设置了这个选项,那么只有列表中的指标会被包含,其他所有指标都会被忽略。 +- 这个选项通常用于限制数据的收集范围,以减少网络流量、存储需求或者仅仅关注一小部分重要指标。 +- 格式通常是一个指标名称的数组,例如:`metrics_include = ["cpu_usage_idle", "cpu_usage_user"]`。 + +#### metrics_exclude + +- 相反,`metrics_exclude` 选项允许你指定一个指标名称列表,这些指标将不会被收集和发送。如果设置了这个选项,那么列表中的指标会被排除,其他所有指标都会被包含。 +- 这个选项用于从收集的数据中排除不感兴趣或不相关的指标,有助于减少处理和存储无用数据的负担。 +- 格式同样是一个指标名称的数组,例如:`metrics_exclude = ["memory_free", "memory_cached"]`。 + +#### 使用注意事项 + +- 如果同时使用`metrics_include`和`metrics_exclude`,首先应用`metrics_include`过滤规则,然后应用`metrics_exclude`。这意味着如果一个指标在`metrics_include`中被明确包含,在`metrics_exclude`中也被明确排除,那么这个指标最终将被排除。 +- 这两个配置选项的工作原理和具体可用值可能依赖于具体的插件。有的插件可能允许根据指标的某些属性或标签来进行包含或排除。 +- 正确使用这两个配置选项可以显著改善Categraf的性能和效率,特别是在资源受限的环境中或当监控系统规模较大时。 + +#### 示例 + +假设你使用Categraf监控系统性能,并使用`cpu`插件收集CPU使用情况的指标。如果你只对CPU的闲置时间和用户时间感兴趣,可以使用以下配置: + +```toml +[[instances]] + ## 仅收集CPU的闲置时间和用户使用时间的指标 + metrics_include = ["cpu_usage_idle", "cpu_usage_user"] +``` + +或者,如果你想收集所有CPU相关指标,但排除闲置时间和用户时间,可以使用: + +```toml +[[instances]] + ## 排除CPU的闲置时间和用户使用时间的指标 + metrics_exclude = ["cpu_usage_idle", "cpu_usage_user"] +``` + +通过精细控制指标的收集,你可以优化监控设置,确保只处理对你最重要的信息。 + +### 服务器标签 +
+服务器标签用于标识指标源服务器。你可以选择默认使用supervisor的http端点的`host:port`,或者你可以使用在supervisor配置文件中设置的supervisor的标识字符串。 + +## 指标 + +- supervisor_processes + - tags: + - source(supervisor实例的主机名或IP地址) + - port(supervisor的HTTP服务器端口号) + - id(supervisor的标识字符串) + - name(进程名) + - group(进程组) + - fields: + - state(int,参见参考表) + - uptime(int,秒) + - pid(int,可选) + - exitCode(int,可选) + +- supervisor_instance + - tags: + - source(supervisor实例的主机名或IP地址) + - port(supervisor的HTTP服务器端口号) + - id(supervisor的标识字符串) + - fields: + - state(int,参见参考表) + +### Supervisor进程状态字段参考表 + +| 状态码 | 状态名 | 描述 | +|------|----------|---------------------------------------| +| 0 | STOPPED | 进程因停止请求停止了,或者从未启动。 | +| 10 | STARTING | 进程因启动请求正在启动。 | +| 20 | RUNNING | 进程正在运行。 | +| 30 | BACKOFF | 进程进入STARTING状态但随后过快退出,未能移动到RUNNING状态。 | +| 40 | STOPPING | 进程因停止请求正在停止。 | +| 100 | EXITED | 进程已从RUNNING状态退出(预期地或意外地)。 | +| 200 | FATAL | 无法成功启动进程。 | +| 1000 | UNKNOWN | 进程处于未知状态(supervisord编程错误)。 | + +### Supervisor实例状态字段参考 + +| 状态码 | 状态名 | 描述 | +|-----|---------|--------------------| +| 2 | FATAL | Supervisor遇到了严重错误。 | +| 1 | RUNNING | Supervisor正在正常工作。 | +| 0 | RESTARTING | Supervisor正在重启。 | \ No newline at end of file diff --git a/inputs/supervisor/dashboard.json b/inputs/supervisor/dashboard.json new file mode 100644 index 000000000..a1c47b6f6 --- /dev/null +++ b/inputs/supervisor/dashboard.json @@ -0,0 +1,52 @@ +{ + "title": "Supervisor Process Monitor", + "uid": "6a519a0c", + "tags": [ + "supervisor process monitor" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Supervisor Process State (20=RUNNING)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "supervisor_state", + "legendFormat": "{{process}} @ {{server}}", + "refId": "A" + } + ] + }, + { + "title": "Supervisor Process Uptime", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "supervisor_uptime", + "legendFormat":
"{{process}} @ {{server}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/systemd/README.md b/inputs/systemd/README.md index 44ab3db4d..a6ea86733 100644 --- a/inputs/systemd/README.md +++ b/inputs/systemd/README.md @@ -1,12 +1,53 @@ -# systemd 插件 -自 [node_exporter](https://github.com/prometheus/node_exporter/blob/master/collector/systemd_linux.go)fork 并改动 +# Systemd Input Plugin + +This plugin collects metrics about the running state of `systemd` on Linux systems. It gathers the statuses of various units (services, sockets, timers, etc.), restart counts, startup times, and task counts. + +The implementation of this plugin is forked and adapted from [node_exporter](https://github.com/prometheus/node_exporter/blob/master/collector/systemd_linux.go). +**Note:** This plugin interacts with D-Bus using pure Go. It does not require CGO to compile and run on Linux. On non-Linux systems, it compiles down to an empty implementation. ## Configuration + +You can enable and configure the systemd plugin in your Categraf configuration file: + ```toml -enable=false # 设置为true 打开采集 -#unit_include=".+" -#unit_exclude="" -enable_start_time_metrics=true #是否采集service unit的启动时间信息 单位秒 -enable_task_metrics=true # 是否采集service unit task的metrics -enable_restarts_metrics=true #是否采集service unit重启的次数信息 +# Collect systemd unit metrics +# interval = 15 + +[[instances]] +# Regex: Used to match the unit names to be collected. Default is all (".+"). +# unit_include = ".+" + +# Regex: Used to exclude specific unit names from collection. +# If a unit matches both include and exclude regexes, it will be excluded. +# By default, automount, device, mount, scope, and slice units are excluded. +# unit_exclude = ".+\\.(automount|device|mount|scope|slice)" + +# Whether to establish a private, non-D-Bus direct connection to systemd. 
+# (Strongly discouraged for production, requires root privileges, mainly for testing). +# systemd_private = false + +# Enable gathering metrics for task counts and maximum tasks allowed per unit. +# enable_task_metrics = false + +# Enable gathering metrics regarding the restart counts of units. +# enable_restarts_metrics = false + +# Enable gathering start time metrics for units. +# enable_start_time_metrics = false ``` + +## Metrics + +All metrics reported by this plugin are prefixed with `systemd_`. The core metrics include: + +- `systemd_system_running`: A boolean indicating if systemd is fully running and not in a degraded or initializing state. +- `systemd_version`: systemd version info metric (value is 1, with version info in labels). +- `systemd_units`: The total count of systemd units by state (active, activating, inactive, failed, etc.). +- `systemd_unit_state`: Indicates the state of each specific unit (e.g., `state="active"`, `state="failed"`). This is extremely useful for alerting on failed services. +- `systemd_unit_tasks_current` / `systemd_unit_tasks_max`: The current and maximum allowed tasks for the unit (if `enable_task_metrics` is true). +- `systemd_service_restart_total`: The total number of times the service has restarted (if `enable_restarts_metrics` is true). +- `systemd_unit_start_time_seconds`: The timestamp when the unit started (if `enable_start_time_metrics` is true). + +## Dashboards + +A basic companion Dashboard (`dashboard.json`) is provided in this directory. It visualizes the overall health of systemd (`system_running`), identifies any failed units, and tracks the number of services running on the node. 
diff --git a/inputs/systemd/README_CN.md b/inputs/systemd/README_CN.md new file mode 100644 index 000000000..33077b62d --- /dev/null +++ b/inputs/systemd/README_CN.md @@ -0,0 +1,54 @@ +# systemd 插件 + +该插件用于采集 Linux 系统上 `systemd` 的运行状态、各个 unit (service, socket, timer 等) 的状态、重启次数、启动时间以及任务数等关键指标。 + +本插件的实现自 [node_exporter](https://github.com/prometheus/node_exporter/blob/master/collector/systemd_linux.go) fork 并经过修改适配。 +**注意**:该插件通过纯 Go 语言实现与 D-Bus 的交互,不需要开启 CGO 即可在 Linux 环境下编译和运行。在非 Linux 系统下编译会退化为空实现。 + +## Configuration + +在 Categraf 的配置文件中,可以通过以下选项来开启和配置 systemd 插件的采集(位于 `conf/input.systemd/systemd.toml`): + +```toml +# 是否启用该插件 +enable = false + +# 正则表达式:用于匹配需要采集的 unit 名称,默认为匹配所有 (".+") +# unit_include = ".+" + +# 正则表达式:用于匹配需要排除采集的 unit 名称。 +# 如果一个 unit 同时符合 include 和 exclude 的正则,它将会被排除。 +# 默认排除了 automount, device, mount, scope, slice 类型的 unit。 +# unit_exclude = ".+\\.(automount|device|mount|scope|slice)" + +# 是否建立一个私有的、不经过 dbus 的直连到 systemd (强烈不建议开启,需要 root 权限,主要用于测试) +# systemd_private = false + +# 是否采集 service unit 的启动时间信息 (单位:秒) +enable_start_time_metrics = true + +# 是否采集 service unit task (任务数) 的指标 +enable_task_metrics = true + +# 是否采集 service unit 重启的次数信息 +enable_restarts_metrics = true +``` + +## Metrics + +插件成功采集后,会上报以下系统指标(所有的指标名称在系统中都会自动附带 `systemd_` 测量前缀): + +| 指标名称 | 类型 | 标签 (Tags) | 说明 | +| :--- | :--- | :--- | :--- | +| `systemd_version` | Gauge | `version` | 检测到的 systemd 版本。指标值为版本号浮点数,完整的版本字符串记录在 `version` 标签中。 | +| `systemd_system_running` | Gauge | 无 | 整个系统是否在正常运行 (类似命令 `systemctl is-system-running`),值为 1.0 表示 running。 | +| `systemd_units` | Gauge | `state` | 处于不同系统状态 (`active`, `activating`, `deactivating`, `inactive`, `failed`) 的 unit 总计数量。 | +| `systemd_unit_state` | Gauge | `name`, `state`, `type` | 特定 unit 的状态指示器。如果该 unit 正处于对应的 `state` 则值为 1.0,否则为 0.0。 | +| `systemd_service_restart_total` | Counter | `name` | service 类型的 unit 所触发的重启总次数。 | +| `systemd_unit_start_time_seconds` | Gauge | `name` | unit 的启动时间点 (表示为自 Unix epoch 以来的秒数)。 | +| 
`systemd_unit_tasks_current` | Gauge | `name` | 当前 unit 内部正在运行的任务数量。 | +| `systemd_unit_tasks_max` | Gauge | `name` | 当前 unit 允许的最大任务数量。 | +| `systemd_socket_accepted_connections_total` | Counter | `name` | socket 类型的 unit 累计已接受的连接总数。 | +| `systemd_socket_current_connections` | Gauge | `name` | socket 类型的 unit 当前活动的连接数。 | +| `systemd_socket_refused_connections_total` | Gauge | `name` | socket 类型的 unit 累计被拒绝的连接总数 (需要 systemd >= 239)。 | +| `systemd_timer_last_trigger_seconds` | Gauge | `name` | timer 类型的 unit 上一次触发的时间点 (自 Unix epoch 以来的秒数)。 | \ No newline at end of file diff --git a/inputs/systemd/dashboard.json b/inputs/systemd/dashboard.json new file mode 100644 index 000000000..07356ba73 --- /dev/null +++ b/inputs/systemd/dashboard.json @@ -0,0 +1,70 @@ +{ + "title": "Systemd Services Monitor", + "uid": "0ee1df6d", + "tags": [ + "systemd services monitor" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Systemd Overall Running State (1=OK)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "systemd_system_running", + "legendFormat": "{{agent_hostname}}", + "refId": "A" + } + ] + }, + { + "title": "Failed Services (Alert Candidate)", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "systemd_unit_state{state=\"failed\"} > 0", + "legendFormat": "{{name}} @ {{agent_hostname}}", + "refId": "A" + } + ] + }, + { + "title": "Total Units by State", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "systemd_units", + "legendFormat": "{{state}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/tengine/README.md b/inputs/tengine/README.md index c268529ac..8211b80b8 100644 --- a/inputs/tengine/README.md +++ 
b/inputs/tengine/README.md @@ -1,87 +1,61 @@ # Tengine Input Plugin -The tengine plugin gathers metrics from the -[Tengine Web Server](http://tengine.taobao.org/) via the -[reqstat](http://tengine.taobao.org/document/http_reqstat.html) module. +This plugin collects advanced statistics from Tengine (or Nginx with the `ngx_http_reqstat_module` compiled and enabled). +Compared to the basic `nginx` stub_status plugin, this Tengine plugin provides detailed, host-level (virtual server) metrics including traffic, connections, HTTP status codes (2xx, 3xx, 4xx, 5xx, and detailed codes like 499), and Upstream interaction latencies. -## Tengine Configuration Example +## Prerequisites -``` -http { - - req_status_zone server "$host,$server_addr:$server_port" 10M; - #req_status_zone_add_indicator server $limit; - req_status server; - - server { - location /us { - req_status_show; - #req_status_show_field req_total $limit; - #allow 127.0.0.1/32; - #deny all; - } - - #set $limit 0; - #if ($arg_limit = '1') { - # set $limit 1; - #} +The target Tengine/Nginx server must have the `ngx_http_reqstat_module` compiled and enabled. +Your Nginx/Tengine configuration should contain a block similar to the following: + +```nginx +req_status_zone server_name $server_name 256k; +req_status server_name; + +server { + location /reqstat { + req_status_show; + allow 127.0.0.1; + deny all; } } ``` +## Configuration + +Configure your request URLs in Categraf's `conf/input.tengine/tengine.toml`: + +```toml +# Collect Tengine HTTP reqstat metrics +# interval = 15 + +[[instances]] +# The HTTP URL to the Tengine reqstat endpoint. Multiple URLs can be configured. 
+urls = ["http://127.0.0.1/reqstat"] + +# HTTP request timeout +# timeout = "5s" + +# Optional TLS configuration +# ca_file = "/etc/telegraf/ca.pem" +# cert_file = "/etc/telegraf/cert.pem" +# key_file = "/etc/telegraf/key.pem" +# insecure_skip_verify = false +``` + ## Metrics -- Measurement - - tags: - - target - - target_port - - server_name - - server_schema - - fields: - - bytes_in (integer, total number of bytes received from client) - - bytes_out (integer, total number of bytes sent to client) - - conn_total (integer, total number of accepted connections) - - req_total (integer, total number of processed requests) - - http_2xx (integer, total number of 2xx requests) - - http_3xx (integer, total number of 3xx requests) - - http_4xx (integer, total number of 4xx requests) - - http_5xx (integer, total number of 5xx requests) - - http_other_status (integer, total number of other requests) - - rt (integer, accumulation or rt) - - ups_req (integer, total number of requests calling for upstream) - - ups_rt (integer, accumulation or upstream rt) - - ups_tries (integer, total number of times calling for upstream) - - http_200 (integer, total number of 200 requests) - - http_206 (integer, total number of 206 requests) - - http_302 (integer, total number of 302 requests) - - http_304 (integer, total number of 304 requests) - - http_403 (integer, total number of 403 requests) - - http_404 (integer, total number of 404 requests) - - http_416 (integer, total number of 416 requests) - - http_499 (integer, total number of 499 requests) - - http_500 (integer, total number of 500 requests) - - http_502 (integer, total number of 502 requests) - - http_503 (integer, total number of 503 requests) - - http_504 (integer, total number of 504 requests) - - http_508 (integer, total number of 508 requests) - - http_other_detail_status (integer, total number of requests of other status codes*http_ups_4xx total number of requests of upstream 4xx) - - http_ups_5xx (integer, total number of 
requests of upstream 5xx) - -## Example Output - -```text -tengine_rt agent_hostname=zy-fat project=matrix server_name=www.baidu.com server_schema=http target=127.0.0.1 target_port=80 37634 -tengine_ups_rt agent_hostname=zy-fat project=matrix server_name=www.baidu.com server_schema=http target=127.0.0.1 target_port=80 37394 -tengine_http_499 agent_hostname=zy-fat project=matrix server_name=www.baidu.com server_schema=http target=127.0.0.1 target_port=80 0 -tengine_http_504 agent_hostname=zy-fat project=matrix server_name=www.baidu.com server_schema=http target=127.0.0.1 target_port=80 0 -tengine_bytes_in agent_hostname=zy-fat project=matrix server_name=www.baidu.com server_schema=http target=127.0.0.1 target_port=80 129592 -tengine_http_4xx agent_hostname=zy-fat project=matrix server_name=www.baidu.com server_schema=http target=127.0.0.1 target_port=80 535 -tengine_http_other_status agent_hostname=zy-fat project=matrix server_name=www.baidu.com server_schema=https target=127.0.0.1 target_port=80 0 -tengine_http_200 agent_hostname=zy-fat project=matrix server_name=www.baidu.com server_schema=https target=127.0.0.1 target_port=80 14452 -tengine_http_499 agent_hostname=zy-fat project=matrix server_name=www.baidu.com server_schema=https target=127.0.0.1 target_port=80 0 -tengine_http_503 agent_hostname=zy-fat project=matrix server_name=www.baidu.com server_schema=https target=127.0.0.1 target_port=80 0 -tengine_http_504 agent_hostname=zy-fat project=matrix server_name=www.baidu.com server_schema=https target=127.0.0.1 target_port=80 0 -tengine_http_500 agent_hostname=zy-fat project=matrix server_name=www.baidu.com server_schema=https target=127.0.0.1 target_port=80 0 -tengine_http_ups_4xx agent_hostname=zy-fat project=matrix server_name=www.baidu.com server_schema=https target=127.0.0.1 target_port=80 13 -tengine_http_ups_5xx agent_hostname=zy-fat project=matrix server_name=www.baidu.com server_schema=https target=127.0.0.1 target_port=80 1 -``` \ No newline at end of 
file +The plugin converts each virtual host's statistics into metrics prefixed by `tengine_`. All metrics will carry the `host` tag, which corresponds to the virtual server name processing the request. + +Core metrics include: +- `tengine_bytes_in` / `tengine_bytes_out`: Inbound and outbound traffic in bytes. +- `tengine_conn_total`: Total connections established. +- `tengine_req_total`: Total requests processed. +- `tengine_http_2xx` / `tengine_http_3xx` / `tengine_http_4xx` / `tengine_http_5xx`: Counter for major HTTP status code categories. +- `tengine_http_499` / `tengine_http_502` / `tengine_http_504`: Counter for specific detailed error status codes. +- `tengine_rt`: Total request response time (RT). +- `tengine_ups_req` / `tengine_ups_rt` / `tengine_ups_tries`: Number of requests forwarded to the Upstream, Upstream latency, and retry counts. + +## Dashboards + +A companion basic Dashboard (`dashboard.json`) is provided in this directory. It provides a breakdown by virtual host (`host`) showing HTTP traffic, Request Rate (QPS), HTTP status code distributions, and backend Upstream latencies. 
diff --git a/inputs/tengine/README_CN.md b/inputs/tengine/README_CN.md new file mode 100644 index 000000000..f31a4f43d --- /dev/null +++ b/inputs/tengine/README_CN.md @@ -0,0 +1,61 @@ +# Tengine 采集插件 + +该插件用于采集 Tengine (或配置了 `ngx_http_reqstat_module` 的 Nginx) 的高级统计指标。 +与基础的 `nginx` stub_status 相比,Tengine 提供了基于主机 (Host) 维度的详细流量、连接和 HTTP 状态码(2xx, 3xx, 4xx, 5xx, 甚至是 499 等细分状态码)的分布情况,还包含了与 Upstream 交互的耗时等指标。 + +## 前置要求 + +目标 Tengine/Nginx 必须编译并开启了 `ngx_http_reqstat_module` 模块。 +在 Nginx/Tengine 的配置文件中,需要包含类似以下的配置: + +```nginx +req_status_zone server_name $server_name 256k; +req_status server_name; + +server { + location /reqstat { + req_status_show; + allow 127.0.0.1; + deny all; + } +} +``` + +## 配置说明 + +在 Categraf 的 `conf/input.tengine/tengine.toml` 中配置你的请求 URL: + +```toml +# 采集 Tengine 的高级 HTTP 状态指标 +# interval = 15 + +[[instances]] +# Tengine reqstat 的 HTTP 访问地址,可以配置多个 +urls = ["http://127.0.0.1/reqstat"] + +# HTTP 请求超时时间 +# timeout = "5s" + +# 可选 TLS 配置 +# ca_file = "/etc/telegraf/ca.pem" +# cert_file = "/etc/telegraf/cert.pem" +# key_file = "/etc/telegraf/key.pem" +# insecure_skip_verify = false +``` + +## 采集指标 + +该插件会将每个域名的状态转化为指标,默认前缀为 `tengine_`。所有指标会携带 `host` (请求所对应的虚拟主机名) 作为核心标签。 + +核心指标包含: +- `tengine_bytes_in` / `tengine_bytes_out`: 进出流量字节数。 +- `tengine_conn_total`: 建立的总连接数。 +- `tengine_req_total`: 处理的总请求数。 +- `tengine_http_2xx` / `tengine_http_3xx` / `tengine_http_4xx` / `tengine_http_5xx`: 各种主类 HTTP 状态码的数量统计。 +- `tengine_http_499` / `tengine_http_502` / `tengine_http_504`: 细分错误状态码的统计。 +- `tengine_rt`: 请求总耗时 (RT)。 +- `tengine_ups_req` / `tengine_ups_rt` / `tengine_ups_tries`: 发送给 Upstream 后端的请求数、耗时以及重试次数。 + +## 监控大盘 + +本目录下提供了一个配套的基础 Dashboard (`dashboard.json`),用于按域名 (Host) 拆解展示 HTTP 流量、请求率 (QPS)、各种 HTTP 状态码的分布以及后端响应延迟情况。 diff --git a/inputs/tengine/dashboard.json b/inputs/tengine/dashboard.json new file mode 100644 index 000000000..198291223 --- /dev/null +++ b/inputs/tengine/dashboard.json @@ -0,0 +1,98 @@ +{ + "title": "Tengine Metrics Dashboard", + 
"uid": "b453a515", + "tags": [ + "tengine metrics dashboard" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Tengine Requests (QPS)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "rate(tengine_req_total[5m])", + "legendFormat": "{{host}}", + "refId": "A" + } + ] + }, + { + "title": "Tengine Traffic Rate (Bytes/s)", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "rate(tengine_bytes_in[5m])", + "legendFormat": "In - {{host}}", + "refId": "A" + }, + { + "expr": "rate(tengine_bytes_out[5m])", + "legendFormat": "Out - {{host}}", + "refId": "B" + } + ] + }, + { + "title": "Tengine Errors Rate (4xx, 5xx)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "rate(tengine_http_4xx[5m])", + "legendFormat": "4xx - {{host}}", + "refId": "A" + }, + { + "expr": "rate(tengine_http_5xx[5m])", + "legendFormat": "5xx - {{host}}", + "refId": "B" + } + ] + }, + { + "title": "Tengine Upstream Request Latency", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 4, + "targets": [ + { + "expr": "rate(tengine_ups_rt[5m]) / rate(tengine_ups_req[5m])", + "legendFormat": "{{host}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/tpl/README.md b/inputs/tpl/README.md index 51e557ccf..9574433c8 100644 --- a/inputs/tpl/README.md +++ b/inputs/tpl/README.md @@ -1,3 +1,13 @@ -# tpl +# Tpl Input Plugin Template -这不是个插件,这是插件开发模板,如果要开发某个采集插件,可以拷贝一份tpl的代码,改一下作为基础代码使用 \ No newline at end of file +This is **NOT** an actual input plugin. +It serves as a **Plugin Development Template**. 
If you want to develop a new, custom Categraf input plugin, you can simply copy the `tpl` directory, rename it to your desired plugin name, and use the existing code as a boilerplate. + +## Development Guide + +1. Copy `inputs/tpl` to `inputs/your_plugin_name`. +2. Change the package name to `package your_plugin_name`. +3. Modify the `inputName` constant to reflect your plugin's name. +4. Implement the logic to fetch metrics inside the `Gather(slist *types.SampleList)` function. +5. Create a corresponding configuration template under the `conf/` directory. +6. Modify the main entry file `metrics_agent.go` to anonymously import your new plugin (or configure build tags as needed). diff --git a/inputs/tpl/README_CN.md b/inputs/tpl/README_CN.md new file mode 100644 index 000000000..db062fc7a --- /dev/null +++ b/inputs/tpl/README_CN.md @@ -0,0 +1,13 @@ +# Tpl 采集插件模板 + +这**不是**一个真正的采集插件。 +这是一个**插件开发模板**。如果您要开发或编写一个属于自己的自定义 Categraf 采集插件,您可以直接将 `tpl` 目录复制一份,重命名为您所需的插件名称,并以其中的代码为基础骨架进行二次开发。 + +## 开发指南 + +1. 复制 `inputs/tpl` 到 `inputs/你的插件名`。 +2. 修改代码包名为 `package 你的插件名`。 +3. 修改代码中的 `inputName` 常量为您插件的名字。 +4. 实现 `Gather(slist *types.SampleList)` 函数以拉取指标数据。 +5. 在 `conf/` 下建立对应的配置模板文件。 +6. 
修改主入口 `metrics_agent.go`,将新插件匿名 import 进去(或者按需配置编译 Tags)。 \ No newline at end of file diff --git a/inputs/tpl/dashboard.json b/inputs/tpl/dashboard.json new file mode 100644 index 000000000..322708125 --- /dev/null +++ b/inputs/tpl/dashboard.json @@ -0,0 +1,34 @@ +{ + "title": "Custom Plugin Template", + "uid": "fdf9c02c", + "tags": [ + "custom plugin template" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Placeholder Metric", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 24, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "your_custom_metric", + "legendFormat": "{{instance}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/vsphere/README.md b/inputs/vsphere/README.md new file mode 100644 index 000000000..0b6d62601 --- /dev/null +++ b/inputs/vsphere/README.md @@ -0,0 +1,63 @@ +# vSphere Input Plugin + +This plugin uses the VMware vSphere API to collect performance metrics from ESXi hosts and virtual machines by connecting to your vCenter server. +It features automatic discovery of resources at various levels (Datacenters, Clusters, Hosts, VMs, and Datastores) and allows flexible filtering of resources and metrics using include/exclude rules. 
+ +## Configuration + +```toml +# Collect VMware vSphere metrics +# interval = 60 + +[[instances]] +# vCenter connection URL (must include http/https) +vcenter = "https://vcenter.local/sdk" +username = "administrator@vsphere.local" +password = "yourpassword" + +# TLS Configuration +# insecure_skip_verify = true + +# ======================================================== +# Resource Discovery & Filtering Controls +# You can enable/disable specific levels and configure filters +# ======================================================== + +# Datacenter level +datacenter_instances = true +# datacenter_include = ["/*"] +# datacenter_exclude = [] +# datacenter_metric_include = ["/*"] + +# Cluster level +cluster_instances = true +# cluster_include = ["/*/host/**"] + +# Physical Host (ESXi) level +host_instances = true +# host_include = ["/*/host/**"] + +# Virtual Machine (VM) level +vm_instances = true +# vm_include = ["/*/vm/**"] + +# Datastore level +datastore_instances = true +# datastore_include = ["/*/datastore/**"] + +# Use include and exclude rules to precisely control which resources are scraped, or to exclude unnecessary heavy metrics. +``` + +## Metrics + +The vSphere performance metrics ecosystem is massive. This plugin generates metrics with different prefixes depending on the resource entity, such as: +- `vsphere_vm_*`: Virtual Machine metrics (e.g., `cpu_usage_average`, `mem_active_average`) +- `vsphere_host_*`: ESXi Host metrics +- `vsphere_datastore_*`: Datastore metrics +- `vsphere_cluster_*` / `vsphere_datacenter_*`: Macroscopic cluster and datacenter summary metrics + +All metrics carry detailed topological context labels such as `vcenter`, `dcname`, `clustername`, `esxhostname`, `vmname`, `dsname`, etc. + +## Dashboards + +A comprehensive, pre-built `dashboard.json` is already provided in this directory.
By importing this Dashboard, you gain immediate visibility into the overall capacity of your host clusters, resource utilization of individual ESXi hosts, and detailed CPU, Memory, and Disk I/O metrics for individual VMs. diff --git a/inputs/vsphere/README_CN.md b/inputs/vsphere/README_CN.md new file mode 100644 index 000000000..51bd1318c --- /dev/null +++ b/inputs/vsphere/README_CN.md @@ -0,0 +1,63 @@ +# vSphere 采集插件 + +该插件使用 VMware vSphere API,通过连接到 vCenter 从 ESXi 主机和虚拟机集群中采集性能指标。 +它支持自动发现 Datacenter、Cluster、Host、VM 和 Datastore 等各层级资源,并支持通过 include/exclude 规则对采集范围和指标进行灵活过滤。 + +## 配置说明 + +```toml +# 采集 VMware vSphere 指标 +# interval = 60 + +[[instances]] +# vCenter 访问地址,需带协议 (http/https) +vcenter = "https://vcenter.local/sdk" +username = "administrator@vsphere.local" +password = "yourpassword" + +# TLS 配置 +# insecure_skip_verify = true + +# ======================================================== +# 资源层级发现与过滤控制 +# 以下分为不同的资源层级,你可以开启/关闭该层级并配置过滤规则 +# ======================================================== + +# 数据中心 (Datacenter) +datacenter_instances = true +# datacenter_include = ["/*"] +# datacenter_exclude = [] +# datacenter_metric_include = ["/*"] + +# 集群 (Cluster) +cluster_instances = true +# cluster_include = ["/*/host/**"] + +# 物理主机 (Host) +host_instances = true +# host_include = ["/*/host/**"] + +# 虚拟机 (Virtual Machine) +vm_instances = true +# vm_include = ["/*/vm/**"] + +# 数据存储 (Datastore) +datastore_instances = true +# datastore_include = ["/*/datastore/**"] + +# 你可以利用 include 和 exclude 来精细控制采集特定目录下的资源,或排除不需要的指标 +``` + +## 采集指标 + +vSphere 的性能指标系统非常庞大。该插件会依据不同的资源实体,生成带不同前缀的指标,例如: +- `vsphere_vm_*`: 虚拟机指标 (例如 `cpu_usage_average`, `mem_active_average`) +- `vsphere_host_*`: ESXi 宿主机指标 +- `vsphere_datastore_*`: 数据存储指标 +- `vsphere_cluster_*` / `vsphere_datacenter_*`: 宏观集群与数据中心汇总指标 + +所有指标将附带详细的拓扑位置标签,例如 `vcenter`, `dcname`, `clustername`, `esxhostname`, `vmname`, `dsname` 等。 + +## 监控大盘 + +本目录下已经包含了一个非常丰富且完整的预置 `dashboard.json`。导入该 Dashboard,你可以直接查看宿主机集群的整体容量、各 ESXi 主机的水位,以及单个虚拟机的
CPU、内存和磁盘 I/O 监控。 diff --git a/inputs/vsphere/dashboard.json b/inputs/vsphere/dashboard.json new file mode 100644 index 000000000..ea18a914a --- /dev/null +++ b/inputs/vsphere/dashboard.json @@ -0,0 +1,1116 @@ +{ + "name": "Vsphere", + "tags": "", + "ident": "", + "configs": { + "var": [ + { + "name": "vcenter", + "type": "query", + "datasource": { + "cate": "prometheus" + }, + "definition": "vsphere_host_cpu_usage_average", + "reg": "/.*vcenter=\"(.*?)\".*/", + "multi": false + }, + { + "name": "Cluster", + "type": "query", + "datasource": { + "cate": "prometheus" + }, + "definition": "vsphere_host_cpu_usage_average{vcenter=\"$vcenter\"}", + "reg": "/.*clustername=\"(.*?)\".*/" + }, + { + "name": "esxi", + "type": "query", + "datasource": { + "cate": "prometheus" + }, + "definition": "vsphere_host_cpu_usage_average{vcenter=\"$vcenter\"}", + "reg": "/.*esxhostname=\"(.*?)\".*/" + }, + { + "name": "vmname", + "type": "query", + "datasource": { + "cate": "prometheus" + }, + "definition": "vsphere_vm_sys_uptime_latest{vcenter=\"$vcenter\"}", + "reg": "/.*vmname=\"(.*?)\".*/" + }, + { + "name": "datastore", + "type": "query", + "datasource": { + "cate": "prometheus" + }, + "definition": "vsphere_datastore_disk_provisioned_latest{vcenter=\"$vcenter\"}", + "reg": "/.*dsname=\"(.*?)\".*/" + } + ], + "panels": [ + { + "type": "row", + "id": "0a149fdd-5c4c-4d09-857c-b16ca3e60f1f", + "name": "vSphere Overview", + "collapsed": true, + "layout": { + "h": 1, + "w": 24, + "x": 0, + "y": 0, + "i": "0a149fdd-5c4c-4d09-857c-b16ca3e60f1f", + "isResizable": false + }, + "panels": [] + }, + { + "type": "stat", + "id": "89ae4416-32f6-4cbd-97dc-12aa11c1363e", + "layout": { + "h": 2, + "w": 24, + "x": 0, + "y": 1, + "i": "89ae4416-32f6-4cbd-97dc-12aa11c1363e", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "refId": "A", + "expr": "count(count(vsphere_host_cpu_usage_average) by(vcenter))", + "legend": "vCenter Summary" + }, + { + 
"expr": "count(count(vsphere_host_cpu_usage_average) by(clustername))", + "refId": "B", + "legend": "Cluster Summary" + }, + { + "expr": "count(count(vsphere_host_cpu_usage_average) by(esxhostname))", + "refId": "C", + "legend": "ESXi Summary" + }, + { + "expr": "count(count(vsphere_vm_cpu_used_summation) by(vmname))", + "refId": "D", + "legend": "VM Summary" + }, + { + "expr": "count(count(vsphere_datastore_disk_used_latest) by(dsname))", + "refId": "E", + "legend": "Datastore Summary" + } + ], + "name": "", + "custom": { + "textMode": "valueAndName", + "colorMode": "value", + "calc": "lastNotNull", + "valueField": "Value", + "colSpan": 5, + "textSize": { + "title": 12, + "value": 36 + } + }, + "options": { + "standardOptions": {}, + "thresholds": { + "steps": [ + { + "color": "#9470ff", + "value": null, + "type": "base" + } + ] + } + } + }, + { + "type": "barGauge", + "id": "6e2b4ee6-1f73-413a-a174-840490177541", + "layout": { + "h": 5, + "w": 12, + "x": 0, + "y": 3, + "i": "6e2b4ee6-1f73-413a-a174-840490177541", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "refId": "A", + "expr": "(sum(vsphere_host_cpu_usage_average{}) by(esxhostname)/count(vsphere_host_cpu_usage_average{}) by(esxhostname))", + "legend": "{{esxhostname}}" + } + ], + "name": "EXSI cpu使用率排名", + "custom": { + "calc": "lastNotNull", + "baseColor": "#9470ff", + "serieWidth": 20, + "sortOrder": "desc" + }, + "options": { + "valueMappings": [ + { + "type": "range", + "result": { + "color": "#ff656b" + }, + "match": { + "from": 60, + "to": 100 + } + } + ], + "standardOptions": {} + } + }, + { + "type": "barGauge", + "id": "ff88cb4a-c396-415c-ad8d-bbb3af289427", + "layout": { + "h": 5, + "w": 12, + "x": 12, + "y": 3, + "i": "905a1562-f7dc-4f3e-bbf6-b1697db9d489", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "refId": "A", + "expr": "(sum(vsphere_host_mem_usage_average{}) 
by(esxhostname)/count(vsphere_host_mem_usage_average{}) by(esxhostname))", + "legend": "{{esxhostname}}" + } + ], + "name": "ESXi mem使用率排名", + "custom": { + "calc": "lastNotNull", + "baseColor": "#9470ff", + "serieWidth": 20, + "sortOrder": "desc" + }, + "options": { + "valueMappings": [ + { + "type": "range", + "result": { + "color": "#ff656b" + }, + "match": { + "from": 60, + "to": 100 + } + } + ], + "standardOptions": {} + } + }, + { + "type": "barGauge", + "id": "d651d6eb-5e30-408c-ba7f-1823d960251a", + "layout": { + "h": 5, + "w": 12, + "x": 0, + "y": 8, + "i": "70cc38e7-2ded-48c2-bb75-fe1d71d4a770", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "refId": "A", + "expr": "(sum(vsphere_vm_cpu_usage_average{}) by(vmname)/count(vsphere_vm_cpu_usage_average{}) by(vmname))", + "legend": "{{vmname}}" + } + ], + "name": "VM cpu使用率排名", + "custom": { + "calc": "lastNotNull", + "baseColor": "#9470ff", + "serieWidth": 20, + "sortOrder": "desc" + }, + "options": { + "valueMappings": [ + { + "type": "range", + "result": { + "color": "#ff656b" + }, + "match": { + "from": 60, + "to": 100 + } + } + ], + "standardOptions": {} + } + }, + { + "type": "barGauge", + "id": "cdd8f673-750e-408e-80ab-3990c16b1da5", + "layout": { + "h": 5, + "w": 12, + "x": 12, + "y": 8, + "i": "f6e55bcf-6d39-425e-8c6d-efc6b8350501", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "refId": "A", + "expr": "(sum(vsphere_vm_mem_usage_average{}) by(vmname)/count(vsphere_vm_mem_usage_average{}) by(vmname))", + "legend": "{{vmname}}" + } + ], + "name": "VM mem使用率排名", + "custom": { + "calc": "lastNotNull", + "baseColor": "#9470ff", + "serieWidth": 20, + "sortOrder": "desc" + }, + "options": { + "valueMappings": [ + { + "type": "range", + "result": { + "color": "#ff656b" + }, + "match": { + "from": 60, + "to": 100 + } + } + ], + "standardOptions": {} + } + }, + { + "type": "row", + "id":
"5329376c-2084-4f4d-b5f4-372cd702b643", + "name": "Cluster Status", + "collapsed": true, + "layout": { + "h": 1, + "w": 24, + "x": 0, + "y": 13, + "i": "5329376c-2084-4f4d-b5f4-372cd702b643", + "isResizable": false + }, + "panels": [] + }, + { + "type": "stat", + "id": "046163e4-031e-44e8-b592-7dc606496922", + "layout": { + "h": 5, + "w": 2, + "x": 0, + "y": 14, + "i": "046163e4-031e-44e8-b592-7dc606496922", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "refId": "A", + "expr": "(sum(vsphere_host_sys_uptime_latest{clustername=\"$Cluster\",vcenter=\"$vcenter\"}) by(clustername)/count(vsphere_host_sys_uptime_latest{clustername=\"$Cluster\",vcenter=\"$vcenter\"}) by(clustername))/86400", + "legend": "" + } + ], + "name": "uptime", + "custom": { + "textMode": "valueAndName", + "colorMode": "value", + "calc": "lastNotNull", + "valueField": "Value", + "colSpan": 3, + "textSize": {} + }, + "options": { + "standardOptions": { + "util": "none" + }, + "thresholds": { + "steps": [ + { + "color": "#634CD9", + "value": null, + "type": "base" + } + ] + } + } + }, + { + "type": "timeseries", + "id": "8aca78d4-2869-44c4-a702-ad46f5c89443", + "layout": { + "h": 5, + "w": 4, + "x": 2, + "y": 14, + "i": "af0d1102-cf29-45b9-b647-1bc0b605ac04", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "expr": "(sum(vsphere_host_cpu_usage_average{clustername=\"$Cluster\",vcenter=\"$vcenter\"}) by(clustername)/count(vsphere_host_cpu_usage_average{clustername=\"$Cluster\",vcenter=\"$vcenter\"}) by(clustername))", + "refId": "B", + "legend": "{{clustername}}" + } + ], + "name": "Cluster CPU Usage %", + "options": { + "tooltip": { + "mode": "single" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": { + "steps": [ + { + "color": "#634CD9", + "value": null, + "type": "base" + } + ] + } + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation":
"smooth", + "spanNulls": false, + "lineWidth": 1, + "fillOpacity": 0.5, + "gradientMode": "none", + "stack": "off", + "scaleDistribution": { + "type": "linear" + } + } + }, + { + "type": "timeseries", + "id": "8309ec72-51cb-4b9d-ba7e-e22acfd6e461", + "layout": { + "h": 5, + "w": 4, + "x": 6, + "y": 14, + "i": "69332b5d-8841-4572-92f5-6ad237fb6ad5", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "expr": "(sum(vsphere_host_mem_usage_average{clustername=\"$Cluster\",vcenter=\"$vcenter\"}) by(clustername)/count(vsphere_host_mem_usage_average{clustername=\"$Cluster\",vcenter=\"$vcenter\"}) by(clustername))", + "refId": "B", + "legend": "{{clustername}}" + } + ], + "name": "Cluster RAM Usage in %", + "options": { + "tooltip": { + "mode": "single" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": { + "steps": [ + { + "color": "#634CD9", + "value": null, + "type": "base" + } + ] + } + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "spanNulls": false, + "lineWidth": 1, + "fillOpacity": 0.5, + "gradientMode": "none", + "stack": "off", + "scaleDistribution": { + "type": "linear" + } + } + }, + { + "type": "timeseries", + "id": "7b43af1c-0814-42eb-ba31-14ed6eb07dc9", + "layout": { + "h": 5, + "w": 5, + "x": 10, + "y": 14, + "i": "7dad6d09-2d17-41aa-845f-30f916ee344d", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "expr": "(sum(vsphere_host_net_bytesRx_average{clustername=\"$Cluster\",vcenter=\"$vcenter\"}) by(clustername))/1000 ", + "refId": "B", + "legend": "{{clustername}}-net_bytesRx" + }, + { + "expr": "(sum(vsphere_host_net_bytesTx_average{clustername=\"$Cluster\",vcenter=\"$vcenter\"}) by(clustername))/1000", + "refId": "A", + "legend": "{{clustername}}-net_bytesTx" + } + ], + "name": "Cluster Network Usage", + "options": { + "tooltip": { + "mode": "all", + "sort": "none" + }, + "legend": 
{ + "displayMode": "hidden" + }, + "standardOptions": { + "util": "none" + }, + "thresholds": { + "steps": [ + { + "color": "#634CD9", + "value": null, + "type": "base" + } + ] + } + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "spanNulls": false, + "lineWidth": 1, + "fillOpacity": 0.5, + "gradientMode": "none", + "stack": "off", + "scaleDistribution": { + "type": "linear" + } + } + }, + { + "type": "table", + "id": "6dffc761-7805-47c5-b82d-cf34dd7b8b11", + "layout": { + "h": 5, + "w": 9, + "x": 15, + "y": 14, + "i": "6dffc761-7805-47c5-b82d-cf34dd7b8b11", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "expr": "sum(vsphere_datastore_disk_capacity_latest{vcenter=\"$vcenter\"}) by(source)", + "refId": "A", + "legend": "总量" + }, + { + "expr": "sum(vsphere_datastore_disk_used_latest{vcenter=\"$vcenter\"}) by(source)", + "refId": "B", + "legend": "使用量" + }, + { + "expr": "sum(vsphere_datastore_disk_used_latest{vcenter=\"$vcenter\"}/vsphere_datastore_disk_capacity_latest{vcenter=\"$vcenter\"}) by(source)*100", + "refId": "C", + "legend": "使用率(%)" + } + ], + "custom": { + "showHeader": true, + "colorMode": "background", + "calc": "lastNotNull", + "displayMode": "labelValuesToRows", + "aggrDimension": "source", + "sortColumn": "source", + "sortOrder": "descend" + }, + "options": { + "valueMappings": [], + "standardOptions": {} + }, + "overrides": [ + {} + ] + }, + { + "type": "row", + "id": "fe54e096-8e11-406b-98f6-d2c5d76d9d8d", + "name": "ESXi status", + "collapsed": true, + "layout": { + "h": 1, + "w": 24, + "x": 0, + "y": 19, + "i": "fe54e096-8e11-406b-98f6-d2c5d76d9d8d", + "isResizable": false + }, + "panels": [] + }, + { + "type": "stat", + "id": "4cca929b-8a04-4c0c-924f-240ad5cf08d9", + "layout": { + "h": 5, + "w": 2, + "x": 0, + "y": 20, + "i": "4cca929b-8a04-4c0c-924f-240ad5cf08d9", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets":
[ + { + "refId": "A", + "expr": "vsphere_host_sys_uptime_latest{esxhostname=\"$esxi\"}", + "legend": "{{esxhostname}}" + } + ], + "name": "uptime\n", + "custom": { + "textMode": "valueAndName", + "colorMode": "value", + "calc": "lastNotNull", + "valueField": "Value", + "colSpan": 1, + "textSize": {} + }, + "options": { + "standardOptions": { + "util": "humantimeSeconds" + }, + "thresholds": { + "steps": [ + { + "color": "#634CD9", + "value": null, + "type": "base" + } + ] + } + } + }, + { + "type": "stat", + "id": "c097de32-5d94-4b32-8f93-0ac8cfe32657", + "layout": { + "h": 5, + "w": 3, + "x": 2, + "y": 20, + "i": "7f2edbdb-d890-4799-89b0-fad87ebf1c22", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "expr": "(sum(vsphere_host_cpu_ready_summation{vcenter=\"$vcenter\",esxhostname=\"$esxi\"}) by(esxhostname)/count(vsphere_host_cpu_ready_summation{vcenter=\"$vcenter\",esxhostname=\"$esxi\"}) by(esxhostname))", + "refId": "B", + "legend": "cpu usage" + } + ], + "name": "Host CPU Ready Time", + "custom": { + "textMode": "valueAndName", + "colorMode": "value", + "calc": "lastNotNull", + "valueField": "Value", + "colSpan": 1, + "textSize": {} + }, + "options": { + "standardOptions": { + "util": "milliseconds" + }, + "thresholds": { + "steps": [ + { + "color": "#634CD9", + "value": null, + "type": "base" + } + ] + } + } + }, + { + "type": "timeseries", + "id": "25ed7e6c-1d20-4d3f-a2d3-de1e9bb2fb17", + "layout": { + "h": 5, + "w": 5, + "x": 5, + "y": 20, + "i": "9e349e57-b55e-462b-b63f-faed76213544", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "expr": "(sum(vsphere_host_cpu_usage_average{vcenter=\"$vcenter\",esxhostname=\"$esxi\"}) by(esxhostname)/count(vsphere_host_cpu_usage_average{vcenter=\"$vcenter\",esxhostname=\"$esxi\"}) by(esxhostname))", + "refId": "B", + "legend": "{{esxhostname}}" + } + ], + "name": "Host CPU Usage %", + "options": { + "tooltip": { 
+ "mode": "single" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": { + "steps": [ + { + "color": "#634CD9", + "value": null, + "type": "base" + } + ] + } + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "spanNulls": false, + "lineWidth": 1, + "fillOpacity": 0.5, + "gradientMode": "none", + "stack": "off", + "scaleDistribution": { + "type": "linear" + } + } + }, + { + "type": "timeseries", + "id": "2f77a4cc-0950-4fe5-86d7-0d6ebfed3d0c", + "layout": { + "h": 5, + "w": 5, + "x": 10, + "y": 20, + "i": "01d79496-7b09-44c1-8e5e-8430509295f3", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "expr": "(sum(vsphere_host_mem_usage_average{vcenter=\"$vcenter\",esxhostname=\"$esxi\"}) by(esxhostname)/count(vsphere_host_mem_usage_average{vcenter=\"$vcenter\",esxhostname=\"$esxi\"}) by(esxhostname))", + "refId": "B", + "legend": "{{esxhostname}}" + } + ], + "name": "Host RAM Usage in %", + "options": { + "tooltip": { + "mode": "single" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": { + "steps": [ + { + "color": "#634CD9", + "value": null, + "type": "base" + } + ] + } + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "spanNulls": false, + "lineWidth": 1, + "fillOpacity": 0.5, + "gradientMode": "none", + "stack": "off", + "scaleDistribution": { + "type": "linear" + } + } + }, + { + "type": "timeseries", + "id": "af8a3cca-ad6e-4650-838f-c38c1cf2c7fa", + "layout": { + "h": 5, + "w": 9, + "x": 15, + "y": 20, + "i": "446ea897-c338-40f7-a146-2cda7bbca311", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "expr": "(sum(vsphere_host_net_bytesRx_average{clustername=\"$Cluster\",vcenter=\"$vcenter\"}) by(esxhostname))/1000", + "refId": "B", + "legend": "{{esxhostname}}-net_bytesRx" + }, + { + "expr": 
"(sum(vsphere_host_net_bytesTx_average{clustername=\"$Cluster\",vcenter=\"$vcenter\"}) by(esxhostname))/1000", + "refId": "A", + "legend": "{{esxhostname}}-net_bytesTx" + } + ], + "name": "Cluster Network Usage", + "options": { + "tooltip": { + "mode": "single" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "util": "none" + }, + "thresholds": { + "steps": [ + { + "color": "#634CD9", + "value": null, + "type": "base" + } + ] + } + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "spanNulls": false, + "lineWidth": 1, + "fillOpacity": 0.5, + "gradientMode": "none", + "stack": "off", + "scaleDistribution": { + "type": "linear" + } + } + }, + { + "type": "row", + "id": "d2f2839c-11d2-470f-85a8-da9e81e72ad3", + "name": "VMs status", + "collapsed": true, + "layout": { + "h": 1, + "w": 24, + "x": 0, + "y": 25, + "i": "d2f2839c-11d2-470f-85a8-da9e81e72ad3", + "isResizable": false + }, + "panels": [] + }, + { + "type": "stat", + "id": "772d5173-cde1-4e3f-a72d-864f737e07b4", + "layout": { + "h": 5, + "w": 2, + "x": 0, + "y": 26, + "i": "7a9fe621-aca3-4a32-aae0-c3f3cf951ba3", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "refId": "A", + "expr": "vsphere_vm_sys_uptime_latest{vmname=\"$vmname\"}", + "legend": "{{vmname}}" + } + ], + "name": "uptime", + "custom": { + "textMode": "valueAndName", + "colorMode": "value", + "calc": "lastNotNull", + "valueField": "Value", + "colSpan": 1, + "textSize": {} + }, + "options": { + "standardOptions": { + "util": "humantimeSeconds" + }, + "thresholds": { + "steps": [ + { + "color": "#634CD9", + "value": null, + "type": "base" + } + ] + } + } + }, + { + "type": "stat", + "id": "6a28ae90-8a63-45a1-9ceb-ab660f0a0d75", + "layout": { + "h": 5, + "w": 3, + "x": 2, + "y": 26, + "i": "f1ba46e5-558d-483e-bc9b-5cffff7343a8", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "expr": 
"(sum(vsphere_vm_cpu_ready_summation{vcenter=\"$vcenter\",vmname=\"$vmname\"}) by(vmname)/count(vsphere_vm_cpu_ready_summation{vcenter=\"$vcenter\",vmname=\"$vmname\"}) by(vmname))", + "refId": "B", + "legend": "cpu usage" + } + ], + "name": "VM CPU Ready Time", + "custom": { + "textMode": "valueAndName", + "colorMode": "value", + "calc": "lastNotNull", + "valueField": "Value", + "colSpan": 1, + "textSize": {} + }, + "options": { + "standardOptions": { + "util": "milliseconds" + }, + "thresholds": { + "steps": [ + { + "color": "#634CD9", + "value": null, + "type": "base" + } + ] + } + } + }, + { + "type": "timeseries", + "id": "0cd4f9df-81c6-4a31-a15e-060a3b0c7e65", + "layout": { + "h": 5, + "w": 5, + "x": 5, + "y": 26, + "i": "96826cc9-cbe0-4e4e-9144-90d9052904cd", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "expr": "(sum(vsphere_vm_cpu_usage_average{vcenter=\"$vcenter\",vmname=\"$vmname\"}) by(vmname)/count(vsphere_vm_cpu_usage_average{vcenter=\"$vcenter\",vmname=\"$vmname\"}) by(vmname))", + "refId": "B", + "legend": "{{vmname}}" + } + ], + "name": "VM CPU Usage %", + "options": { + "tooltip": { + "mode": "single" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "util": "none" + }, + "thresholds": { + "steps": [ + { + "color": "#634CD9", + "value": null, + "type": "base" + } + ] + } + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "spanNulls": false, + "lineWidth": 1, + "fillOpacity": 0.5, + "gradientMode": "none", + "stack": "off", + "scaleDistribution": { + "type": "linear" + } + } + }, + { + "type": "timeseries", + "id": "2beeb800-bacd-49ec-b44a-4b19d6497808", + "layout": { + "h": 5, + "w": 5, + "x": 10, + "y": 26, + "i": "7651144f-7b05-4a21-a595-2816f108b23d", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "expr": "(sum(vsphere_vm_mem_usage_average{vcenter=\"$vcenter\",vmname=\"$vmname\"}) 
by(vmname)/count(vsphere_vm_mem_usage_average{vcenter=\"$vcenter\",vmname=\"$vmname\"}) by(vmname))", + "refId": "B", + "legend": "{{vmname}}" + } + ], + "name": "VM RAM Usage in %", + "options": { + "tooltip": { + "mode": "single" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": { + "steps": [ + { + "color": "#634CD9", + "value": null, + "type": "base" + } + ] + } + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "spanNulls": false, + "lineWidth": 1, + "fillOpacity": 0.5, + "gradientMode": "none", + "stack": "off", + "scaleDistribution": { + "type": "linear" + } + } + }, + { + "type": "timeseries", + "id": "c2804d43-f9b0-4a74-a3fe-fb1f8b5f4a01", + "layout": { + "h": 5, + "w": 9, + "x": 15, + "y": 26, + "i": "06122a9b-53b1-4ef9-93be-2419197600c8", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "expr": "(sum(vsphere_vm_net_bytesRx_average{vmname=\"$vmname\",vcenter=\"$vcenter\"}) by(vmname))/1000", + "refId": "B", + "legend": "{{vmname}}-net_bytesRx" + }, + { + "expr": "(sum(vsphere_vm_net_bytesTx_average{vmname=\"$vmname\",vcenter=\"$vcenter\"}) by(vmname))/1000", + "refId": "A", + "legend": "{{vmname}}-net_bytesTx" + } + ], + "name": "VM Network Usage", + "options": { + "tooltip": { + "mode": "all", + "sort": "none" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "util": "none" + }, + "thresholds": { + "steps": [ + { + "color": "#634CD9", + "value": null, + "type": "base" + } + ] + } + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "spanNulls": false, + "lineWidth": 1, + "fillOpacity": 0.5, + "gradientMode": "none", + "stack": "off", + "scaleDistribution": { + "type": "linear" + } + } + } + ], + "version": "2.0.0", + "datasourceValue": "Default" + } +} diff --git a/inputs/weblogic/README.md b/inputs/weblogic/README.md index d09140f4a..8add866e7 100644 --- a/inputs/weblogic/README.md
+++ b/inputs/weblogic/README.md @@ -1,3 +1,15 @@ -# weblogic +# WebLogic Input Plugin (via Jolokia) -weblogic 当前可以使用 jolokia_agent 插件来监控,通过读取 jmx 数据的方式获取监控指标,配置文件可以参考:[weblogic.toml](../../conf/input.jolokia_agent_misc/weblogic.toml) +Native monitoring data for WebLogic is typically exposed through JMX. In Categraf, instead of developing a dedicated native WebLogic Go plugin, we strongly recommend using the universal **Jolokia** approach. + +## How to Monitor + +WebLogic can be monitored using the `jolokia_agent` plugin. It fetches WebLogic metrics (such as JVM memory, thread pools, JDBC connection pools, etc.) by reading JMX data exposed over HTTP by the Jolokia agent. + +For specific configurations and pre-defined WebLogic JMX metrics collection items, please refer directly to our provided example configuration file: +[weblogic.toml](../../conf/input.jolokia_agent_misc/weblogic.toml) + +## Dashboards + +Since the data is collected via `jolokia_agent`, all metrics and tagging systems will follow the Jolokia standards. +A placeholder `dashboard.json` is provided in this directory. For actual JVM monitoring dashboards, it is recommended to use the generic Dashboards associated with `jolokia` or `jvm`. 
diff --git a/inputs/weblogic/README_CN.md b/inputs/weblogic/README_CN.md new file mode 100644 index 000000000..859ab31cb --- /dev/null +++ b/inputs/weblogic/README_CN.md @@ -0,0 +1,15 @@ +# WebLogic 采集插件 (基于 Jolokia) + +WebLogic 的原生监控数据通常暴露在 JMX 中。在 Categraf 中,我们并没有专门开发一个原生的 WebLogic Go 插件,而是推荐使用通用的 **Jolokia** 方式来采集。 + +## 采集方法 + +WebLogic 当前可以使用 `jolokia_agent` 插件来监控,通过 HTTP 请求读取 Jolokia 代理暴露出的 JMX 数据,从而获取 WebLogic 的各种监控指标(如 JVM 内存、线程池、JDBC 连接池等)。 + +具体配置和预置的 WebLogic JMX 采集项,请直接参考我们提供的示例配置文件: +[weblogic.toml](../../conf/input.jolokia_agent_misc/weblogic.toml) + +## 监控大盘 + +既然数据是通过 `jolokia_agent` 采集的,所有的指标和标签体系将遵循 Jolokia 规范。 +本目录下提供了一个占位用的 `dashboard.json`。如果你想查看真正的 JVM 监控大盘,建议直接使用 `jolokia` 或 `jvm` 相关的通用 Dashboard。 diff --git a/inputs/weblogic/dashboard.json b/inputs/weblogic/dashboard.json new file mode 100644 index 000000000..160018711 --- /dev/null +++ b/inputs/weblogic/dashboard.json @@ -0,0 +1,52 @@ +{ + "title": "WebLogic via Jolokia", + "uid": "a5a78c5a", + "tags": [ + "weblogic via jolokia" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "WebLogic JVM Heap Usage", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "java_lang_Memory_HeapMemoryUsage_used", + "legendFormat": "{{agent_hostname}}", + "refId": "A" + } + ] + }, + { + "title": "WebLogic Thread Count", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + "expr": "java_lang_Threading_ThreadCount", + "legendFormat": "{{agent_hostname}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/whois/README.md b/inputs/whois/README.md index 82fb52e01..56a9f3de7 100644 --- a/inputs/whois/README.md +++ b/inputs/whois/README.md @@ -1,25 +1,34 @@ -# whois - -域名探测插件,用于探测域名的注册时间和到期时间,值为UTC0时间戳 +# Whois Input Plugin +This plugin acts as a 
domain probe, collecting domain registration and expiration information using the Whois protocol. All returned values are Unix timestamps (seconds since the epoch, UTC). ## Configuration -最核心的配置就是 domain 配置,配置目标地址,比如想要监控一个地址: -默认保持注释状态,注释状态下,插件默认不启用 +The core configuration involves setting the `domain` parameter for the target address you wish to monitor. ```toml -# [[instances]] -## Used to collect domain name information. -# domain = "baidu.com" +# Collect domain whois information +# interval = 86400 + +[[instances]] +# Target domain name. Please note that this must be a domain (e.g., "baidu.com"), NOT a URL (like "https://baidu.com"). +domain = "baidu.com" ``` -请注意这里配置的是域名不是URL -## 指标解释 +## Metrics + +The plugin outputs the following timestamp metrics: + +- `whois_domain_createddate`: Domain creation timestamp (Unix epoch). +- `whois_domain_updateddate`: Domain last update timestamp (Unix epoch). +- `whois_domain_expirationdate`: Domain expiration timestamp (Unix epoch). + +All metrics include the `domain` tag for identification. + +## Important Note + +**Do NOT** set the `interval` too short (e.g., 10 seconds). Frequent Whois queries are unnecessary and will very likely lead to rate limiting, connection timeouts, or IP bans by the Whois servers. Please keep the collection cycle long (e.g., once a day, `interval = 86400`). -whois_domain_createddate 域名创建时间戳 -whois_domain_updateddate 域名更新时间戳 -whois_domain_expirationdate 域名到期时间戳 +## Dashboards -## 注意事项 -请不要将interval设置过短,会导致频繁请求timeout,没太大必要性,请尽量放长请求周期 \ No newline at end of file +A companion basic Dashboard (`dashboard.json`) is provided in this directory. It visualizes the days remaining until domain expiration, enabling proactive alerts before crucial domains lapse.
diff --git a/inputs/whois/README_CN.md b/inputs/whois/README_CN.md new file mode 100644 index 000000000..82fb52e01 --- /dev/null +++ b/inputs/whois/README_CN.md @@ -0,0 +1,25 @@ +# whois + +域名探测插件,用于探测域名的注册时间和到期时间,值为UTC0时间戳 + + +## Configuration + +最核心的配置就是 domain 配置,配置目标地址,比如想要监控一个地址: +默认保持注释状态,注释状态下,插件默认不启用 + +```toml +# [[instances]] +## Used to collect domain name information. +# domain = "baidu.com" +``` +请注意这里配置的是域名不是URL + +## 指标解释 + +whois_domain_createddate 域名创建时间戳 +whois_domain_updateddate 域名更新时间戳 +whois_domain_expirationdate 域名到期时间戳 + +## 注意事项 +请不要将interval设置过短,会导致频繁请求timeout,没太大必要性,请尽量放长请求周期 \ No newline at end of file diff --git a/inputs/whois/dashboard.json b/inputs/whois/dashboard.json new file mode 100644 index 000000000..0a4e99331 --- /dev/null +++ b/inputs/whois/dashboard.json @@ -0,0 +1,34 @@ +{ + "title": "Whois Domain Expiry Tracker", + "uid": "ad10a5ca", + "tags": [ + "whois domain expiry tracker" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Domain Days to Expiration", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 24, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "(whois_domain_expirationdate - time()) / 86400", + "legendFormat": "{{domain}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file diff --git a/inputs/xskyapi/README.md b/inputs/xskyapi/README.md new file mode 100644 index 000000000..022bd04ca --- /dev/null +++ b/inputs/xskyapi/README.md @@ -0,0 +1,43 @@ +# XSKY API Input Plugin + +This plugin collects capacity and performance monitoring data from XSKY software-defined storage systems (e.g., XEBS, XEOS) by directly querying their REST APIs using `XmsAuthTokens`. It monitors clusters, storage pools, volumes, nodes, and physical disks. + +## Configuration + +You can configure multiple XSKY Management Server (XMS) API endpoints and their corresponding tokens. 
+ +```toml +# Collect XSKY storage metrics +# interval = 60 + +[[instances]] +# XSKY storage type +# dss_type = "xsky" + +# List of XSKY XMS API endpoints +servers = ["http://10.10.10.10:8056"] + +# List of access tokens corresponding to the servers +xms_auth_tokens = ["xxxxxxxxxxxxx"] + +# Request timeout +# response_timeout = "5s" + +# (Optional) Specify JSON keys to be converted into labels instead of metrics +# tag_keys = ["pool_id", "volume_id"] +``` + +## Metrics + +By default, the plugin queries endpoints such as `/api/v1/clusters`, `/api/v1/pools`, `/api/v1/volumes`, `/api/v1/hosts`, and `/api/v1/disks`, mapping the returned status codes and counter values directly to metrics. +All metrics are prefixed with `xskyapi_`. + +Typical metrics include: +- `xskyapi_cluster_status`: Overall cluster health status. +- `xskyapi_pool_allocated_capacity`: Allocated capacity of storage pools. +- `xskyapi_volume_iops` / `xskyapi_volume_bandwidth`: IOPS and bandwidth performance data for volumes (exact names depend on API returns). +- `xskyapi_disk_status`: Disk presence and health status. + +## Dashboards + +A basic companion Dashboard (`dashboard.json`) is provided in this directory to monitor the overall capacity of the XSKY storage cluster, the utilization rate of individual storage pools, and the distribution of disk errors, helping administrators proactively detect storage bottlenecks and hardware failures. 
diff --git a/inputs/xskyapi/README_CN.md b/inputs/xskyapi/README_CN.md new file mode 100644 index 000000000..c61d0b6f3 --- /dev/null +++ b/inputs/xskyapi/README_CN.md @@ -0,0 +1,43 @@ +# XSKY API 采集插件 + +该插件通过调用 XSKY 星辰天合存储系统 (XEBS/XEOS 等) 的 REST API (`XmsAuthTokens`),直接收集存储集群、存储池、卷 (Volume)、节点以及硬盘状态的相关容量与性能监控数据。 + +## 配置说明 + +你可以配置多个 XSKY 管理节点的 API Server 和对应的 Token。 + +```toml +# 采集 XSKY 存储指标 +# interval = 60 + +[[instances]] +# XSKY 存储类型 +# dss_type = "xsky" + +# XSKY 管理节点 (XMS) 的 API 地址列表 +servers = ["http://10.10.10.10:8056"] + +# 与 API 地址对应的访问 Token 列表 +xms_auth_tokens = ["xxxxxxxxxxxxx"] + +# 请求超时时间 +# response_timeout = "5s" + +# (可选) 指定将哪些 JSON 字段转化为 Label (而非指标字段) +# tag_keys = ["pool_id", "volume_id"] +``` + +## 采集指标 + +插件默认会将从 `/api/v1/clusters`, `/api/v1/pools`, `/api/v1/volumes`, `/api/v1/hosts` 和 `/api/v1/disks` 等 API 获取到的状态码和计数值直接映射为指标。 +所有的指标统一带有 `xskyapi_` 前缀。 + +典型指标举例: +- `xskyapi_cluster_status`: 集群整体健康状态。 +- `xskyapi_pool_allocated_capacity`: 存储池已分配容量。 +- `xskyapi_volume_iops` / `xskyapi_volume_bandwidth`: 卷的 IOPS 与吞吐性能数据(具体字段名依赖 API 实际返回)。 +- `xskyapi_disk_status`: 硬盘的在位与健康状态。 + +## 监控大盘 + +本目录下提供了一个基础的 Dashboard (`dashboard.json`),用于监控 XSKY 存储集群的整体容量、各存储池的使用率以及硬盘错误状态分布,帮助管理员提前发现存储瓶颈和硬件故障。 diff --git a/inputs/xskyapi/dashboard.json b/inputs/xskyapi/dashboard.json new file mode 100644 index 000000000..7d221fd8d --- /dev/null +++ b/inputs/xskyapi/dashboard.json @@ -0,0 +1,70 @@ +{ + "title": "XSKY Storage Cluster", + "uid": "b710a0dc", + "tags": [ + "xsky storage cluster" + ], + "timezone": "browser", + "schemaVersion": 30, + "panels": [ + { + "title": "Cluster Status (1=OK)", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 1, + "targets": [ + { + "expr": "xskyapi_cluster_status", + "legendFormat": "{{server}}", + "refId": "A" + } + ] + }, + { + "title": "Pool Allocated Capacity", + "type": "timeseries", + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "id": 2, + "targets": [ + { + 
"expr": "xskyapi_pool_allocated_capacity", + "legendFormat": "{{pool_id}} @ {{server}}", + "refId": "A" + } + ] + }, + { + "title": "Host Disk Status", + "type": "timeseries", + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "id": 3, + "targets": [ + { + "expr": "xskyapi_disk_status", + "legendFormat": "{{disk_id}} @ {{server}}", + "refId": "A" + } + ] + } + ], + "refresh": "1m", + "time": { + "from": "now-1h", + "to": "now" + } +} \ No newline at end of file From 426d1668fdd3490933bea5c2375b026caced66df Mon Sep 17 00:00:00 2001 From: kongfei605 Date: Thu, 11 Jun 2026 16:25:46 +0800 Subject: [PATCH 2/8] docs: sync enterprise dashboards and fix review issues --- inputs/aliyun/README.md | 2 +- inputs/clickhouse/README.md | 6 +++--- inputs/exec/README.md | 2 +- inputs/exec/README_CN.md | 6 +++--- inputs/hadoop_hdfs/README.md | 4 ++-- inputs/kube_proxy/README.md | 4 ++-- inputs/kube_proxy/README_CN.md | 4 ++-- inputs/kubelet/README.md | 6 +++--- inputs/kubelet/README_CN.md | 6 +++--- inputs/nginx/README.md | 11 +++++------ inputs/sqlserver/README.md | 4 ++-- inputs/tpl/README.md | 2 +- 12 files changed, 28 insertions(+), 29 deletions(-) diff --git a/inputs/aliyun/README.md b/inputs/aliyun/README.md index e4003c117..52fa9add6 100644 --- a/inputs/aliyun/README.md +++ b/inputs/aliyun/README.md @@ -38,7 +38,7 @@ RAM 用户授权。RAM 用户调用云监控 API 前,需要所属的阿里云 [[instances]] ## 阿里云资源所处的region ## endpoint region 参考 https://help.aliyun.com/document_detail/28616.html#section-72p-xhs-6qt -regions=["cn-beijing","cn-shanghai"] +region="cn-beijing" endpoint="metrics.cn-hangzhou.aliyuncs.com" ## 填入你的acces_key_id access_key_id="" diff --git a/inputs/clickhouse/README.md b/inputs/clickhouse/README.md index 135307e20..1727c2288 100644 --- a/inputs/clickhouse/README.md +++ b/inputs/clickhouse/README.md @@ -74,9 +74,9 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details. 
# cluster_exclude = [] ## Optional TLS Config - # tls_ca = "/etc/telegraf/ca.pem" - # tls_cert = "/etc/telegraf/cert.pem" - # tls_key = "/etc/telegraf/key.pem" + # tls_ca = "/etc/categraf/ca.pem" + # tls_cert = "/etc/categraf/cert.pem" + # tls_key = "/etc/categraf/key.pem" ## Use TLS but skip chain & host verification # insecure_skip_verify = false diff --git a/inputs/exec/README.md b/inputs/exec/README.md index 389d840be..6552252b3 100644 --- a/inputs/exec/README.md +++ b/inputs/exec/README.md @@ -9,7 +9,7 @@ The executed script must print the monitoring data to stdout in one of the follo ### 1. influx ```text -mesurement,labelkey1=labelval1,labelkey2=labelval2 field1=1.2,field2=2.3 +measurement,labelkey1=labelval1,labelkey2=labelval2 field1=1.2,field2=2.3 ``` - Measurement and tags are separated by a comma. - Tags are separated by commas. diff --git a/inputs/exec/README_CN.md b/inputs/exec/README_CN.md index 94cd167f7..c27290299 100644 --- a/inputs/exec/README_CN.md +++ b/inputs/exec/README_CN.md @@ -9,12 +9,12 @@ Exec 插件主要用于执行用户自定义的监控脚本或程序,并将脚 ### 1. influx ```text -mesurement,labelkey1=labelval1,labelkey2=labelval2 field1=1.2,field2=2.3 +measurement,labelkey1=labelval1,labelkey2=labelval2 field1=1.2,field2=2.3 ``` -- 指标名 (mesurement) 和标签 (Tags) 之间用逗号分隔 +- 指标名 (measurement) 和标签 (Tags) 之间用逗号分隔 - 标签之间用逗号分隔 - 标签和属性字段 (Fields) 之间用**空格**分隔 -- 最终的指标名会根据 `mesurement` 和 `field` 组合生成 +- 最终的指标名会根据 `measurement` 和 `field` 组合生成 ### 2. prometheus 直接输出 Prometheus 的标准 Exposition 格式: diff --git a/inputs/hadoop_hdfs/README.md b/inputs/hadoop_hdfs/README.md index b1579563b..72cd489b7 100644 --- a/inputs/hadoop_hdfs/README.md +++ b/inputs/hadoop_hdfs/README.md @@ -6,10 +6,10 @@ Categraf does not require a dedicated, standalone native plugin to monitor Hadoo To monitor HDFS, please configure the `jolokia_agent` plugin directly. We have already prepared a template configuration suitable for Hadoop HDFS in the example configuration directory. 
-Please refer to: [hadoop-hdfs.toml](../../conf/example.input.jolokia_agent/hadoop-hdfs.toml) +Please refer to: [hadoop-hdfs.toml](../../conf/input.jolokia_agent_misc/hadoop-hdfs.toml) Steps: -1. Copy the reference configuration above into your Categraf `conf/input.jolokia_agent/` directory. +1. Copy the reference configuration above into your Categraf `conf/input.jolokia_agent_misc/` directory. 2. Ensure that Jolokia Agent is enabled on your Hadoop NameNode or DataNode. 3. Modify the `urls` in the configuration file to point to your real Jolokia JMX HTTP Endpoint (e.g., `http://localhost:8778/jolokia/`). diff --git a/inputs/kube_proxy/README.md b/inputs/kube_proxy/README.md index d3f2322ae..8f5c5a9f2 100644 --- a/inputs/kube_proxy/README.md +++ b/inputs/kube_proxy/README.md @@ -6,10 +6,10 @@ This component is not an independent Go native input plugin. Instead, it leverag To scrape Kube-Proxy metrics, you should configure the `prometheus` plugin. We have prepared a dedicated scraping template for Kube-Proxy in the example configuration directory. -Reference configuration: [kube_proxy.toml](../../conf/input.prometheus/kube_proxy.toml) +Reference configuration: `prometheus.toml` Steps: -1. Copy the reference configuration `kube_proxy.toml` to your Categraf `conf/input.prometheus/` directory. +1. Add a new `[[instances]]` block in your `conf/input.prometheus/prometheus.toml` for kube-proxy. 2. Ensure that Categraf can access the kube-proxy metrics endpoint (typically `127.0.0.1:10249/metrics` or `NodeIP:10249`). When running as a DaemonSet, this is usually accessed via the Node IP. 3. Modify the `urls` in the configuration to point to the correct address. 
diff --git a/inputs/kube_proxy/README_CN.md b/inputs/kube_proxy/README_CN.md index 34e2845fc..bab0444c0 100644 --- a/inputs/kube_proxy/README_CN.md +++ b/inputs/kube_proxy/README_CN.md @@ -6,10 +6,10 @@ 要采集 Kube-Proxy 的指标,请使用并修改 Categraf 的 `prometheus` 插件配置。我们在示例配置中已经准备好了一个专用于 Kube-Proxy 的抓取模板。 -参考配置:[kube_proxy.toml](../../conf/input.prometheus/kube_proxy.toml) +参考配置:`prometheus.toml` 具体步骤: -1. 将参考配置文件 `kube_proxy.toml` 复制到您的 Categraf `conf/input.prometheus/` 目录下。 +1. 在 `conf/input.prometheus/prometheus.toml` 中新增一个用于抓取 kube-proxy 的 `[[instances]]` 配置块。 2. 确保您的 Kubernetes 集群中,kube-proxy 的 metrics 接口 (通常是 `127.0.0.1:10249/metrics` 或者节点 IP 的 `10249` 端口) 可以被 Categraf 访问到。如果在 DaemonSet 模式下,通常通过 Node IP 访问。 3. 修改配置中的 `urls` 指向正确的地址。 diff --git a/inputs/kubelet/README.md b/inputs/kubelet/README.md index 4370e69d5..3c4c77f44 100644 --- a/inputs/kubelet/README.md +++ b/inputs/kubelet/README.md @@ -6,12 +6,12 @@ This component is not an independent Go native input plugin. Instead, it leverag To scrape Kubelet metrics, you should configure the `prometheus` plugin. We have prepared a dedicated scraping template for Kubelet in the example configuration directory. -Reference configuration: [kubelet.toml](../../conf/input.prometheus/kubelet.toml) +Reference configuration: `prometheus.toml` Steps: -1. Copy the reference configuration `kubelet.toml` to your Categraf `conf/input.prometheus/` directory. +1. Add a new `[[instances]]` block in your `conf/input.prometheus/prometheus.toml` for Kubelet. 2. Ensure that Categraf (usually deployed as a DaemonSet on each Node) can access the Kubelet API on the current node. This often involves using the Node IP and a service account token. -3. Configure the correct authentication in `kubelet.toml` according to your Kubernetes cluster's security setup (e.g., TLS settings, token file paths). +3. 
Configure the correct authentication in your prometheus configuration according to your Kubernetes cluster's security setup (e.g., TLS settings, token file paths). ## Metrics and Dashboards diff --git a/inputs/kubelet/README_CN.md b/inputs/kubelet/README_CN.md index a7261ae91..304aa9277 100644 --- a/inputs/kubelet/README_CN.md +++ b/inputs/kubelet/README_CN.md @@ -6,12 +6,12 @@ 要采集 Kubelet 的指标,请使用并修改 Categraf 的 `prometheus` 插件配置。我们在示例配置中已经准备好了一个专用于 Kubelet 的抓取模板。 -参考配置:[kubelet.toml](../../conf/input.prometheus/kubelet.toml) +参考配置:`prometheus.toml` 具体步骤: -1. 将参考配置文件 `kubelet.toml` 复制到您的 Categraf `conf/input.prometheus/` 目录下。 +1. 在 `conf/input.prometheus/prometheus.toml` 中新增一个用于抓取 Kubelet 的 `[[instances]]` 配置块。 2. 确保 Categraf 作为 DaemonSet 部署在每个 Node 上时,可以访问到当前节点的 Kubelet API(通常通过挂载 Node 的 IP 和相应的认证 Token 获取)。 -3. 根据您的 Kubernetes 集群的安全配置(如是否需要 TLS,Token 文件路径),在 `kubelet.toml` 中配置正确的认证信息。 +3. 根据您的 Kubernetes 集群的安全配置(如是否需要 TLS,Token 文件路径),在相应的配置块中配置正确的认证信息。 ## 采集指标与监控大盘 diff --git a/inputs/nginx/README.md b/inputs/nginx/README.md index 734409be9..5d4f53ead 100644 --- a/inputs/nginx/README.md +++ b/inputs/nginx/README.md @@ -1,4 +1,4 @@ -- 该插件依赖**nginx**的 **http_stub_status_module +- 该插件依赖**nginx**的 **http_stub_status_module** # 应用场景 一般用于业务系统做对外或对外路由映射时使用代理服务,是运维最常见且最重要的代理工具。 @@ -187,12 +187,11 @@ journalctl -f -n 500 -u categraf | grep "E\!" | grep "W\!" # 监控告警规则配置 -``` -``` + 个人经验仅供参考: -超过2000毫秒,为P2级别,启用企业微信应用推送告警,3分钟内恢复发出恢复告警。 -超过5000毫秒,为P1级别,启用电话语音告警&企业微信应用告警,3分钟内恢复发出恢复告警。 -``` + +- 超过2000毫秒,为P2级别,启用企业微信应用推送告警,3分钟内恢复发出恢复告警。 +- 超过5000毫秒,为P1级别,启用电话语音告警&企业微信应用告警,3分钟内恢复发出恢复告警。 # 监控图表配置 diff --git a/inputs/sqlserver/README.md b/inputs/sqlserver/README.md index 5ac682cbf..8c08f8246 100644 --- a/inputs/sqlserver/README.md +++ b/inputs/sqlserver/README.md @@ -1,6 +1,6 @@ -# kubernetes +# sqlserver -forked from telegraf/sqlserver. 
这个插件的作用是获取sqlserver的监控指标,这里去掉了Azure相关部分监控,只保留了本地部署sqlserver情况。 +forked from [telegraf/sqlserver](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/sqlserver). 这个插件的作用是获取sqlserver的监控指标,这里去掉了Azure相关部分监控,只保留了本地部署sqlserver情况。 # 按照下面方法创建监控账号,用于读取监控数据 USE master; diff --git a/inputs/tpl/README.md b/inputs/tpl/README.md index 9574433c8..615d34e29 100644 --- a/inputs/tpl/README.md +++ b/inputs/tpl/README.md @@ -10,4 +10,4 @@ It serves as a **Plugin Development Template**. If you want to develop a new, cu 3. Modify the `inputName` constant to reflect your plugin's name. 4. Implement the logic to fetch metrics inside the `Gather(slist *types.SampleList)` function. 5. Create a corresponding configuration template under the `conf/` directory. -6. Modify the main entry file `metrics_agent.go` to anonymously import your new plugin (or configure build tags as needed). +6. Modify the main entry file `agent/metrics_agent.go` to anonymously import your new plugin (or configure build tags as needed). From 68383645e3b56bccebf652924e0479be8b1cb17a Mon Sep 17 00:00:00 2001 From: kongfei605 Date: Thu, 11 Jun 2026 16:28:24 +0800 Subject: [PATCH 3/8] docs(kube_proxy,kubelet): remove contradictory template mention --- inputs/kube_proxy/README.md | 2 +- inputs/kube_proxy/README_CN.md | 2 +- inputs/kubelet/README.md | 2 +- inputs/kubelet/README_CN.md | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/inputs/kube_proxy/README.md b/inputs/kube_proxy/README.md index 8f5c5a9f2..3e255d18e 100644 --- a/inputs/kube_proxy/README.md +++ b/inputs/kube_proxy/README.md @@ -4,7 +4,7 @@ This component is not an independent Go native input plugin. Instead, it leverag ## Configuration -To scrape Kube-Proxy metrics, you should configure the `prometheus` plugin. We have prepared a dedicated scraping template for Kube-Proxy in the example configuration directory. +To scrape Kube-Proxy metrics, you should configure the `prometheus` plugin. 
Reference configuration: `prometheus.toml` diff --git a/inputs/kube_proxy/README_CN.md b/inputs/kube_proxy/README_CN.md index bab0444c0..3569b96cf 100644 --- a/inputs/kube_proxy/README_CN.md +++ b/inputs/kube_proxy/README_CN.md @@ -4,7 +4,7 @@ ## 配置说明 -要采集 Kube-Proxy 的指标,请使用并修改 Categraf 的 `prometheus` 插件配置。我们在示例配置中已经准备好了一个专用于 Kube-Proxy 的抓取模板。 +要采集 Kube-Proxy 的指标,请使用并修改 Categraf 的 `prometheus` 插件配置。 参考配置:`prometheus.toml` diff --git a/inputs/kubelet/README.md b/inputs/kubelet/README.md index 3c4c77f44..0787f136b 100644 --- a/inputs/kubelet/README.md +++ b/inputs/kubelet/README.md @@ -4,7 +4,7 @@ This component is not an independent Go native input plugin. Instead, it leverag ## Configuration -To scrape Kubelet metrics, you should configure the `prometheus` plugin. We have prepared a dedicated scraping template for Kubelet in the example configuration directory. +To scrape Kubelet metrics, you should configure the `prometheus` plugin. Reference configuration: `prometheus.toml` diff --git a/inputs/kubelet/README_CN.md b/inputs/kubelet/README_CN.md index 304aa9277..0b9dde830 100644 --- a/inputs/kubelet/README_CN.md +++ b/inputs/kubelet/README_CN.md @@ -4,7 +4,7 @@ ## 配置说明 -要采集 Kubelet 的指标,请使用并修改 Categraf 的 `prometheus` 插件配置。我们在示例配置中已经准备好了一个专用于 Kubelet 的抓取模板。 +要采集 Kubelet 的指标,请使用并修改 Categraf 的 `prometheus` 插件配置。 参考配置:`prometheus.toml` From c8892e0648ce108c7ae4543f7380c125c7199bf3 Mon Sep 17 00:00:00 2001 From: kongfei605 Date: Thu, 11 Jun 2026 16:41:31 +0800 Subject: [PATCH 4/8] docs,dashboards: fix metric normalizations and endpoint typos per review --- inputs/appdynamics/dashboard.json | 2 +- inputs/linux_sysctl_fs/README.md | 16 ++++++++-------- inputs/linux_sysctl_fs/README_CN.md | 16 ++++++++-------- inputs/linux_sysctl_fs/dashboard.json | 14 +++++++------- inputs/nginx/README.md | 2 +- inputs/self_metrics/README.md | 10 +++++----- inputs/self_metrics/README_CN.md | 10 +++++----- inputs/self_metrics/dashboard.json | 4 ++-- inputs/xskyapi/README.md | 14 
++++++-------- inputs/xskyapi/dashboard.json | 12 ++++++------ 10 files changed, 49 insertions(+), 51 deletions(-) diff --git a/inputs/appdynamics/dashboard.json b/inputs/appdynamics/dashboard.json index f8876d4f4..948b9aa20 100644 --- a/inputs/appdynamics/dashboard.json +++ b/inputs/appdynamics/dashboard.json @@ -19,7 +19,7 @@ "id": 1, "targets": [ { - "expr": "up{job=\"appdynamics\"}", + "expr": "appdynamics_up", "legendFormat": "{{metric_path}}", "refId": "A" } diff --git a/inputs/linux_sysctl_fs/README.md b/inputs/linux_sysctl_fs/README.md index f236d8514..68372ae5e 100644 --- a/inputs/linux_sysctl_fs/README.md +++ b/inputs/linux_sysctl_fs/README.md @@ -18,14 +18,14 @@ It is highly recommended for monitoring system-wide file descriptor limits (file All collected metrics are prefixed with `linux_sysctl_fs_`. Key metrics include: -- `linux_sysctl_fs_file-nr`: Number of allocated file handles -- `linux_sysctl_fs_file-max`: Maximum number of allowed file handles -- `linux_sysctl_fs_inode-nr`: Number of allocated inodes -- `linux_sysctl_fs_inode-free-nr`: Number of free inodes -- `linux_sysctl_fs_dentry-nr`: Number of dentry cache entries -- `linux_sysctl_fs_dentry-unused-nr`: Number of unused dentry cache entries -- `linux_sysctl_fs_aio-nr`: Current number of asynchronous I/O (AIO) requests -- `linux_sysctl_fs_aio-max-nr`: Maximum allowed number of AIO requests +- `linux_sysctl_fs_file_nr`: Number of allocated file handles +- `linux_sysctl_fs_file_max`: Maximum number of allowed file handles +- `linux_sysctl_fs_inode_nr`: Number of allocated inodes +- `linux_sysctl_fs_inode_free_nr`: Number of free inodes +- `linux_sysctl_fs_dentry_nr`: Number of dentry cache entries +- `linux_sysctl_fs_dentry_unused_nr`: Number of unused dentry cache entries +- `linux_sysctl_fs_aio_nr`: Current number of asynchronous I/O (AIO) requests +- `linux_sysctl_fs_aio_max_nr`: Maximum allowed number of AIO requests ## Dashboards diff --git a/inputs/linux_sysctl_fs/README_CN.md 
b/inputs/linux_sysctl_fs/README_CN.md index 737ff3fd3..e464ccef8 100644 --- a/inputs/linux_sysctl_fs/README_CN.md +++ b/inputs/linux_sysctl_fs/README_CN.md @@ -18,14 +18,14 @@ 所有收集到的指标名称前缀为 `linux_sysctl_fs_`。 主要指标如下: -- `linux_sysctl_fs_file-nr`: 系统当前已经分配的文件句柄数 -- `linux_sysctl_fs_file-max`: 系统允许分配的最大文件句柄数 -- `linux_sysctl_fs_inode-nr`: 当前分配的 inode 数量 -- `linux_sysctl_fs_inode-free-nr`: 当前空闲的 inode 数量 -- `linux_sysctl_fs_dentry-nr`: dentry 缓存的数量 -- `linux_sysctl_fs_dentry-unused-nr`: 未使用的 dentry 缓存数量 -- `linux_sysctl_fs_aio-nr`: 当前的异步 I/O (AIO) 请求数量 -- `linux_sysctl_fs_aio-max-nr`: 允许的最大异步 I/O 请求数量 +- `linux_sysctl_fs_file_nr`: 系统当前已经分配的文件句柄数 +- `linux_sysctl_fs_file_max`: 系统允许分配的最大文件句柄数 +- `linux_sysctl_fs_inode_nr`: 当前分配的 inode 数量 +- `linux_sysctl_fs_inode_free_nr`: 当前空闲的 inode 数量 +- `linux_sysctl_fs_dentry_nr`: dentry 缓存的数量 +- `linux_sysctl_fs_dentry_unused_nr`: 未使用的 dentry 缓存数量 +- `linux_sysctl_fs_aio_nr`: 当前的异步 I/O (AIO) 请求数量 +- `linux_sysctl_fs_aio_max_nr`: 允许的最大异步 I/O 请求数量 ## 监控大盘 diff --git a/inputs/linux_sysctl_fs/dashboard.json b/inputs/linux_sysctl_fs/dashboard.json index dac1f9389..48793a8b8 100644 --- a/inputs/linux_sysctl_fs/dashboard.json +++ b/inputs/linux_sysctl_fs/dashboard.json @@ -19,7 +19,7 @@ "id": 1, "targets": [ { - "expr": "linux_sysctl_fs_file\\-nr / linux_sysctl_fs_file\\-max * 100", + "expr": "linux_sysctl_fs_file_nr / linux_sysctl_fs_file_max * 100", "legendFormat": "{{agent_hostname}}", "refId": "A" } @@ -37,12 +37,12 @@ "id": 2, "targets": [ { - "expr": "linux_sysctl_fs_file\\-nr", + "expr": "linux_sysctl_fs_file_nr", "legendFormat": "Allocated", "refId": "A" }, { - "expr": "linux_sysctl_fs_file\\-max", + "expr": "linux_sysctl_fs_file_max", "legendFormat": "Max", "refId": "B" } @@ -60,12 +60,12 @@ "id": 3, "targets": [ { - "expr": "linux_sysctl_fs_inode\\-nr", + "expr": "linux_sysctl_fs_inode_nr", "legendFormat": "Total Inodes", "refId": "A" }, { - "expr": "linux_sysctl_fs_inode\\-free\\-nr", + "expr": 
"linux_sysctl_fs_inode_free_nr", "legendFormat": "Free Inodes", "refId": "B" } @@ -83,12 +83,12 @@ "id": 4, "targets": [ { - "expr": "linux_sysctl_fs_aio\\-nr", + "expr": "linux_sysctl_fs_aio_nr", "legendFormat": "AIO Allocated", "refId": "A" }, { - "expr": "linux_sysctl_fs_aio\\-max\\-nr", + "expr": "linux_sysctl_fs_aio_max_nr", "legendFormat": "AIO Max", "refId": "B" } diff --git a/inputs/nginx/README.md b/inputs/nginx/README.md index 5d4f53ead..5c69e6fe1 100644 --- a/inputs/nginx/README.md +++ b/inputs/nginx/README.md @@ -93,7 +93,7 @@ server { } 浏览器访问https://nginx.domains.com出现: -Active connections: 5 +Active connections: 5 server accepts handled requests 90837 90837 79582 Reading: 0 Writing: 1 Waiting: 4 diff --git a/inputs/self_metrics/README.md b/inputs/self_metrics/README.md index e9e1b3b28..3877bb12e 100644 --- a/inputs/self_metrics/README.md +++ b/inputs/self_metrics/README.md @@ -18,16 +18,16 @@ Since it is a built-in plugin gathering its own state, the configuration is extr ## Metrics -All relevant metrics are prefixed with `categraf_` or Go's default `go_` / `process_`. Core self-monitoring metrics include: +All relevant metrics are prefixed with `categraf_`; the Go runtime and process metrics use the `categraf_go_` / `categraf_process_` prefixes. 
Core self-monitoring metrics include: - `categraf_info`: Categraf version information (value is 1, carrying a `version` tag) - `categraf_metrics_enqueue_sum`: Total number of metrics enqueued to the sending queue - `categraf_metrics_enqueue_failed_sum`: Total number of metrics that failed to enqueue - `categraf_current_queue_size`: Current number of pending metrics in the memory queue (if this value keeps rising, it means the pushing rate to the backend is slower than the scraping rate, or the backend is failing) -- `go_goroutines`: Current number of Goroutines -- `go_memstats_alloc_bytes`: Memory allocated by the Go runtime -- `process_cpu_seconds_total`: Total CPU time consumed by the Categraf process -- `process_resident_memory_bytes`: Resident Set Size (RSS) physical memory used by the Categraf process +- `categraf_go_goroutines`: Current number of Goroutines +- `categraf_go_memstats_alloc_bytes`: Memory allocated by the Go runtime +- `categraf_process_cpu_seconds_total`: Total CPU time consumed by the Categraf process +- `categraf_process_resident_memory_bytes`: Resident Set Size (RSS) physical memory used by the Categraf process These metrics are automatically tagged with `version` and other environmental tags. 
diff --git a/inputs/self_metrics/README_CN.md b/inputs/self_metrics/README_CN.md index 6a96ef5e3..e257377b7 100644 --- a/inputs/self_metrics/README_CN.md +++ b/inputs/self_metrics/README_CN.md @@ -18,16 +18,16 @@ ## 采集指标 -所有相关指标均以 `categraf_` 或 Go 默认的 `go_` / `process_` 为前缀。核心自监控指标包括: +所有相关指标均以 `categraf_` 为前缀,其中 Go 运行时与进程指标分别以 `categraf_go_` / `categraf_process_` 为前缀。核心自监控指标包括: - `categraf_info`: Categraf 版本信息,值为 1,带有 `version` 标签 - `categraf_metrics_enqueue_sum`: 指标入队总数 (推送到发送队列) - `categraf_metrics_enqueue_failed_sum`: 指标入队失败总数 - `categraf_current_queue_size`: 当前待发送指标在内存队列中的堆积量 (如果此值持续上升,说明发送到服务端的速率跟不上采集速率,或服务端出现故障) -- `go_goroutines`: 当前 Goroutine 的数量 -- `go_memstats_alloc_bytes`: Go 运行时分配的内存大小 -- `process_cpu_seconds_total`: Categraf 进程累计消耗的 CPU 时间 -- `process_resident_memory_bytes`: Categraf 进程占用的常驻物理内存大小 (RSS) +- `categraf_go_goroutines`: 当前 Goroutine 的数量 +- `categraf_go_memstats_alloc_bytes`: Go 运行时分配的内存大小 +- `categraf_process_cpu_seconds_total`: Categraf 进程累计消耗的 CPU 时间 +- `categraf_process_resident_memory_bytes`: Categraf 进程占用的常驻物理内存大小 (RSS) 这些指标都会自动打上 `version` 等标签。 diff --git a/inputs/self_metrics/dashboard.json b/inputs/self_metrics/dashboard.json index e7605324c..eecf099b9 100644 --- a/inputs/self_metrics/dashboard.json +++ b/inputs/self_metrics/dashboard.json @@ -55,7 +55,7 @@ "id": 3, "targets": [ { - "expr": "process_resident_memory_bytes{job=~'.*categraf.*'}", + "expr": "categraf_process_resident_memory_bytes", "legendFormat": "{{agent_hostname}}", "refId": "A" } @@ -73,7 +73,7 @@ "id": 4, "targets": [ { - "expr": "go_goroutines{job=~'.*categraf.*'}", + "expr": "categraf_go_goroutines", "legendFormat": "{{agent_hostname}}", "refId": "A" } diff --git a/inputs/xskyapi/README.md b/inputs/xskyapi/README.md index 022bd04ca..a0678060f 100644 --- a/inputs/xskyapi/README.md +++ b/inputs/xskyapi/README.md @@ -12,7 +12,7 @@ You can configure multiple XSKY Management Server (XMS) API endpoints and their [[instances]] # XSKY storage type -# 
dss_type = "xsky" +# dss_type
= "oss" # or gfs, eus # List of XSKY XMS API endpoints servers = ["http://10.10.10.10:8056"] @@ -23,20 +23,18 @@ xms_auth_tokens = ["xxxxxxxxxxxxx"] # Request timeout # response_timeout = "5s" -# (Optional) Specify JSON keys to be converted into labels instead of metrics -# tag_keys = ["pool_id", "volume_id"] ``` ## Metrics -By default, the plugin queries endpoints such as `/api/v1/clusters`, `/api/v1/pools`, `/api/v1/volumes`, `/api/v1/hosts`, and `/api/v1/disks`, mapping the returned status codes and counter values directly to metrics. +By default, the plugin queries endpoints such as `/v1/os-users`, `/v1/os-buckets`, `/v1/dfs-quotas`, `/v1/fs-folders`, and `/v1/block-volumes`, mapping the returned status codes and counter values directly to metrics. All metrics are prefixed with `xskyapi_`. Typical metrics include: -- `xskyapi_cluster_status`: Overall cluster health status. -- `xskyapi_pool_allocated_capacity`: Allocated capacity of storage pools. -- `xskyapi_volume_iops` / `xskyapi_volume_bandwidth`: IOPS and bandwidth performance data for volumes (exact names depend on API returns). -- `xskyapi_disk_status`: Disk presence and health status. +- `xskyapi_oss_bucket_used_size`: Used size of OSS buckets. +- `xskyapi_dfs_quota`: DFS quota metrics. +- `xskyapi_block_volume_used_size`: Used size of block volumes. +- `xskyapi_oss_user_quota`: OSS user quota metrics. 
## Dashboards diff --git a/inputs/xskyapi/dashboard.json b/inputs/xskyapi/dashboard.json index 7d221fd8d..b5c5f0807 100644 --- a/inputs/xskyapi/dashboard.json +++ b/inputs/xskyapi/dashboard.json @@ -19,8 +19,8 @@ "id": 1, "targets": [ { - "expr": "xskyapi_cluster_status", - "legendFormat": "{{server}}", + "expr": "xskyapi_oss_bucket_used_size", + "legendFormat": "{{name}} @ {{server}}", "refId": "A" } ] @@ -37,8 +37,8 @@ "id": 2, "targets": [ { - "expr": "xskyapi_pool_allocated_capacity", - "legendFormat": "{{pool_id}} @ {{server}}", + "expr": "xskyapi_dfs_quota", + "legendFormat": "{{name}} @ {{server}}", "refId": "A" } ] @@ -55,8 +55,8 @@ "id": 3, "targets": [ { - "expr": "xskyapi_disk_status", - "legendFormat": "{{disk_id}} @ {{server}}", + "expr": "xskyapi_block_volume_used_size", + "legendFormat": "{{name}} @ {{server}}", "refId": "A" } ] From c6779ffc45f14fa4e40efbd979636a3c40af7133 Mon Sep 17 00:00:00 2001 From: kongfei605 Date: Thu, 11 Jun 2026 16:44:25 +0800 Subject: [PATCH 5/8] dashboards(xskyapi): update panel titles to match exported metrics --- inputs/xskyapi/dashboard.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/inputs/xskyapi/dashboard.json b/inputs/xskyapi/dashboard.json index b5c5f0807..aa3110569 100644 --- a/inputs/xskyapi/dashboard.json +++ b/inputs/xskyapi/dashboard.json @@ -8,7 +8,7 @@ "schemaVersion": 30, "panels": [ { - "title": "Cluster Status (1=OK)", + "title": "OSS Bucket Used Size", "type": "timeseries", "gridPos": { "x": 0, @@ -26,7 +26,7 @@ ] }, { - "title": "Pool Allocated Capacity", + "title": "DFS Quota", "type": "timeseries", "gridPos": { "x": 12, @@ -44,7 +44,7 @@ ] }, { - "title": "Host Disk Status", + "title": "Block Volume Used Size", "type": "timeseries", "gridPos": { "x": 0, From 4b657f30c168f331dcece4b1badf621a58e03b71 Mon Sep 17 00:00:00 2001 From: kongfei605 Date: Thu, 11 Jun 2026 16:47:04 +0800 Subject: [PATCH 6/8] docs(xskyapi): fix review issues in chinese readme --- 
inputs/xskyapi/README_CN.md | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/inputs/xskyapi/README_CN.md b/inputs/xskyapi/README_CN.md index c61d0b6f3..d0a78d908 100644 --- a/inputs/xskyapi/README_CN.md +++ b/inputs/xskyapi/README_CN.md @@ -12,7 +12,7 @@ [[instances]] # XSKY 存储类型 -# dss_type = "xsky" +# dss_type = "oss" # or gfs, eus # XSKY 管理节点 (XMS) 的 API 地址列表 servers = ["http://10.10.10.10:8056"] @@ -22,21 +22,18 @@ xms_auth_tokens = ["xxxxxxxxxxxxx"] # 请求超时时间 # response_timeout = "5s" - -# (可选) 指定将哪些 JSON 字段转化为 Label (而非指标字段) -# tag_keys = ["pool_id", "volume_id"] ``` ## 采集指标 -插件默认会将从 `/api/v1/clusters`, `/api/v1/pools`, `/api/v1/volumes`, `/api/v1/hosts` 和 `/api/v1/disks` 等 API 获取到的状态码和计数值直接映射为指标。 +插件默认会将从 `/v1/os-users`, `/v1/os-buckets`, `/v1/dfs-quotas`, `/v1/fs-folders` 和 `/v1/block-volumes` 等 API 获取到的状态码和计数值直接映射为指标。 所有的指标统一带有 `xskyapi_` 前缀。 典型指标举例: -- `xskyapi_cluster_status`: 集群整体健康状态。 -- `xskyapi_pool_allocated_capacity`: 存储池已分配容量。 -- `xskyapi_volume_iops` / `xskyapi_volume_bandwidth`: 卷的 IOPS 与吞吐性能数据(具体字段名依赖 API 实际返回)。 -- `xskyapi_disk_status`: 硬盘的在位与健康状态。 +- `xskyapi_oss_bucket_used_size`: OSS Bucket 已使用容量。 +- `xskyapi_dfs_quota`: DFS Quota 指标。 +- `xskyapi_block_volume_used_size`: 块存储卷(Block Volume)已使用容量。 +- `xskyapi_oss_user_quota`: OSS 用户配额指标。 ## 监控大盘 From 528b4ed5eb4f56ec0fcb965ce2120a6febc02eb8 Mon Sep 17 00:00:00 2001 From: kongfei605 Date: Thu, 11 Jun 2026 16:53:16 +0800 Subject: [PATCH 7/8] docs,dashs: fix metrics in weblogic, appdynamics, processes and paths per review --- inputs/appdynamics/dashboard.json | 2 +- inputs/hadoop_hdfs/README_CN.md | 4 ++-- inputs/nfsclient/README.md | 2 +- inputs/processes/README.md | 1 - inputs/processes/README_CN.md | 1 - inputs/processes/dashboard.json | 2 +- inputs/weblogic/dashboard.json | 6 +++--- 7 files changed, 8 insertions(+), 10 deletions(-) diff --git a/inputs/appdynamics/dashboard.json b/inputs/appdynamics/dashboard.json index 948b9aa20..b9ae4afb1 100644 --- 
a/inputs/appdynamics/dashboard.json +++ b/inputs/appdynamics/dashboard.json @@ -19,7 +19,7 @@ "id": 1, "targets": [ { - "expr": "appdynamics_up", + "expr": "up", "legendFormat": "{{metric_path}}", "refId": "A" } diff --git a/inputs/hadoop_hdfs/README_CN.md b/inputs/hadoop_hdfs/README_CN.md index 72041cb42..f62bbd237 100644 --- a/inputs/hadoop_hdfs/README_CN.md +++ b/inputs/hadoop_hdfs/README_CN.md @@ -6,10 +6,10 @@ Categraf 监控 Hadoop HDFS 时,不需要专门的独立的二进制原生插 要配置 HDFS 的监控,请直接修改 `jolokia_agent` 的配置文件。我们在配置示例目录中已经准备好了一份适用于 Hadoop HDFS 的模板。 -请参考:[hadoop-hdfs.toml](../../conf/example.input.jolokia_agent/hadoop-hdfs.toml) +请参考:[hadoop-hdfs.toml](../../conf/input.jolokia_agent_misc/hadoop-hdfs.toml) 具体步骤: -1. 将上述参考配置复制到您的 Categraf `conf/input.jolokia_agent/` 目录中。 +1. 将上述参考配置复制到您的 Categraf `conf/input.jolokia_agent_misc/` 目录中。 2. 确保您的 Hadoop NameNode 或 DataNode 启用了 Jolokia Agent。 3. 修改配置文件中的 `urls`,指向真实的 Jolokia JMX HTTP Endpoint (例如: `http://localhost:8778/jolokia/`)。 diff --git a/inputs/nfsclient/README.md b/inputs/nfsclient/README.md index 4671bf11e..f6dc1e412 100644 --- a/inputs/nfsclient/README.md +++ b/inputs/nfsclient/README.md @@ -29,7 +29,7 @@ fullstat = false The plugin supports NFSv3 and NFSv4. All metrics are tagged with `mountpoint`, `server` (NFS server address), and `export` (exported path). Key metric categories include: -- **Bytes Statistics (`nfsclient_bytes_*)**: `read`, `write`, `direct_read`, `direct_write` +- **Bytes Statistics (`nfsclient_bytes_*`)**: `read`, `write`, `direct_read`, `direct_write` - **Event Statistics (`nfsclient_events_*)**: `inoderevalidates`, `dentryrevalidates`, `datainvalidates`, etc. - **Operation Statistics (`nfsclient_ops_*`)**: - `ops`: Total number of requests for the operation diff --git a/inputs/processes/README.md b/inputs/processes/README.md index 641dbb30b..4afcbcdd8 100644 --- a/inputs/processes/README.md +++ b/inputs/processes/README.md @@ -28,7 +28,6 @@ All metrics are prefixed with `processes_`. 
Key metrics include but are not limi - `processes_paging`: Number of paging processes - `processes_dead`: Number of dead processes - `processes_idle`: Number of idle processes -- `processes_threads`: Total number of threads in the system - `processes_total_threads`: Same as above, total number of threads ## Dashboards diff --git a/inputs/processes/README_CN.md b/inputs/processes/README_CN.md index 16d1950c5..9ccd31c7b 100644 --- a/inputs/processes/README_CN.md +++ b/inputs/processes/README_CN.md @@ -28,7 +28,6 @@ - `processes_paging`: 处于 paging 状态的进程数 - `processes_dead`: 处于 dead 状态的进程数 - `processes_idle`: 处于 idle 状态的进程数 -- `processes_threads`: 系统中总的线程数 - `processes_total_threads`: 同上,系统中总的线程数 ## 监控大盘 diff --git a/inputs/processes/dashboard.json b/inputs/processes/dashboard.json index edd76bc3e..ec41b2a42 100644 --- a/inputs/processes/dashboard.json +++ b/inputs/processes/dashboard.json @@ -37,7 +37,7 @@ "id": 2, "targets": [ { - "expr": "processes_threads", + "expr": "processes_total_threads", "legendFormat": "{{agent_hostname}}", "refId": "A" } diff --git a/inputs/weblogic/dashboard.json b/inputs/weblogic/dashboard.json index 160018711..d1d25116b 100644 --- a/inputs/weblogic/dashboard.json +++ b/inputs/weblogic/dashboard.json @@ -19,14 +19,14 @@ "id": 1, "targets": [ { - "expr": "java_lang_Memory_HeapMemoryUsage_used", + "expr": "weblogic_jvm_memory_HeapMemoryUsage_used", "legendFormat": "{{agent_hostname}}", "refId": "A" } ] }, { - "title": "WebLogic Thread Count", + "title": "WebLogic Execute Thread Total Count", "type": "timeseries", "gridPos": { "x": 12, @@ -37,7 +37,7 @@ "id": 2, "targets": [ { - "expr": "java_lang_Threading_ThreadCount", + "expr": "weblogic_ThreadPoolRuntime_ExecuteThreadTotalCount", "legendFormat": "{{agent_hostname}}", "refId": "A" } From ae36468be12c39326c2bd05b224e71d3ca7d06ce Mon Sep 17 00:00:00 2001 From: kongfei605 Date: Wed, 17 Jun 2026 09:39:19 +0800 Subject: [PATCH 8/8] chore(inputs): update plugin docs and dashboards --- 
inputs/activemq/dashboard.json | 4 +-- inputs/aliyun/README.md | 12 ++++---- inputs/amd_rocm_smi/README.md | 4 +-- inputs/amd_rocm_smi/README_CN.md | 2 +- inputs/amd_rocm_smi/dashboard.json | 4 +-- inputs/apache/README.md | 6 ++++ inputs/apache/README_CN.md | 6 ++++ inputs/apache/dashboard.json | 4 +-- inputs/appdynamics/dashboard.json | 4 +-- inputs/arp_packet/README.md | 2 +- inputs/arp_packet/README_CN.md | 2 +- inputs/bind/README.md | 9 +++--- inputs/bind/README_CN.md | 9 +++--- inputs/bind/dashboard.json | 38 ++++++++++++------------- inputs/cadvisor/README.md | 3 +- inputs/cadvisor/README_CN.md | 7 +++-- inputs/clickhouse/dashboard.json | 10 +++---- inputs/conntrack/README.md | 3 ++ inputs/conntrack/README_CN.md | 5 +++- inputs/conntrack/dashboard.json | 16 +++++++++-- inputs/cpu/README.md | 1 - inputs/cpu/README_CN.md | 3 +- inputs/dcgm/README.md | 6 ---- inputs/dcgm/README_CN.md | 6 ---- inputs/disk/README.md | 11 +++++-- inputs/disk/README_CN.md | 13 ++++++--- inputs/dns_query/README.md | 6 ++-- inputs/dns_query/README_CN.md | 6 ++-- inputs/docker/README.md | 2 +- inputs/docker/README_CN.md | 2 +- inputs/elasticsearch/README.md | 14 +++++++++ inputs/exec/dashboard.json | 6 +++- inputs/filecount/README_CN.md | 4 +-- inputs/gnmi/README_CN.md | 2 +- inputs/gnmi/dashboard.json | 6 +++- inputs/googlecloud/README.md | 6 ++-- inputs/googlecloud/README_CN.md | 6 ++-- inputs/googlecloud/dashboard.json | 6 +++- inputs/greenplum/README.md | 3 -- inputs/greenplum/README_CN.md | 3 -- inputs/hadoop_hdfs/README.md | 2 +- inputs/hadoop_hdfs/README_CN.md | 2 +- inputs/hadoop_hdfs/dashboard.json | 6 +++- inputs/http_response/README.md | 15 +++++----- inputs/huatuo/dashboard.json | 6 +++- inputs/ipvs/README.md | 35 +++++++++++------------ inputs/ipvs/README_CN.md | 35 +++++++++++------------ inputs/ipvs/dashboard.json | 18 ++++++------ inputs/jboss/README.md | 3 +- inputs/jboss/README_CN.md | 3 +- inputs/jboss/dashboard.json | 6 +++- inputs/jenkins/dashboard.json | 4 +-- 
inputs/jolokia/dashboard.json | 6 +++- inputs/jolokia_agent/dashboard.json | 6 +++- inputs/jolokia_proxy/dashboard.json | 6 +++- inputs/kafka/README.md | 2 +- inputs/kafka_connect/README.md | 4 +-- inputs/kafka_connect/README_CN.md | 4 +-- inputs/kafka_connect/dashboard.json | 6 +++- inputs/kernel/README.md | 1 - inputs/kernel/README_CN.md | 3 +- inputs/kernel_vmstat/README.md | 1 - inputs/kube_proxy/README.md | 11 +++++++ inputs/kube_proxy/README_CN.md | 11 +++++++ inputs/kubelet/README.md | 14 +++++++++ inputs/kubelet/README_CN.md | 14 +++++++++ inputs/linux_sysctl_fs/README.md | 1 - inputs/linux_sysctl_fs/README_CN.md | 1 - inputs/mem/README.md | 1 - inputs/mem/README_CN.md | 3 +- inputs/mysql/README.md | 4 +-- inputs/net/README.md | 1 - inputs/net/README_CN.md | 3 +- inputs/netstat/README.md | 1 - inputs/netstat/README_CN.md | 3 +- inputs/nfsclient/README.md | 18 ++++++------ inputs/nfsclient/README_CN.md | 18 ++++++------ inputs/nfsclient/dashboard.json | 20 ++++++------- inputs/node_exporter/README.md | 1 - inputs/node_exporter/README_CN.md | 1 - inputs/nvidia_smi/README.md | 1 - inputs/nvidia_smi/README_CN.md | 3 +- inputs/processes/README.md | 1 - inputs/processes/README_CN.md | 3 +- inputs/redis_sentinel/README.md | 14 ++++----- inputs/redis_sentinel/README_CN.md | 14 ++++----- inputs/redis_sentinel/dashboard.json | 14 ++++----- inputs/redis_sentinel/redis_sentinel.go | 2 +- inputs/self_metrics/README.md | 1 - inputs/self_metrics/README_CN.md | 1 - inputs/sockstat/README.md | 3 +- inputs/sockstat/README_CN.md | 3 +- inputs/systemd/README.md | 1 - inputs/weblogic/README.md | 2 +- inputs/weblogic/README_CN.md | 2 +- 95 files changed, 366 insertions(+), 261 deletions(-) diff --git a/inputs/activemq/dashboard.json b/inputs/activemq/dashboard.json index 087ab97df..a3a3e5521 100644 --- a/inputs/activemq/dashboard.json +++ b/inputs/activemq/dashboard.json @@ -66,7 +66,7 @@ "type": "timeseries", "gridPos": { "x": 12, - "y": 0, + "y": 16, "w": 12, "h": 8 }, @@ 
-139,4 +139,4 @@ "from": "now-1h", "to": "now" } -} \ No newline at end of file +} diff --git a/inputs/aliyun/README.md b/inputs/aliyun/README.md index 52fa9add6..c36c4d021 100644 --- a/inputs/aliyun/README.md +++ b/inputs/aliyun/README.md @@ -24,13 +24,13 @@ RAM 用户授权。RAM 用户调用云监控 API 前,需要所属的阿里云 4. 配置 -不同namespace附件label的权限点 - - rds: +不同 namespace 附加 label 所需权限点: + - rds: - `rds:DescribeDBInstances` `acs:rds:{#regionId}:{#accountId}:dbinstance/{#dbinstanceId}` [链接](https://next.api.aliyun.com/document/Rds/2014-08-15/DescribeDBInstances) - `rds:ListTagResources` `acs:rds:{#regionId}:{#accountId}:dbinstance/{#dbinstanceId}` [链接](https://next.api.aliyun.com/document/Rds/2014-08-15/ListTagResources) - - polardb: `acs:DescribeDBClusters` `acs:polardb:*:*:dbcluster` [链接](https://help.aliyun.com/document_detail/118034.html?spm=a2c4g.98094.0.0) - - kvstore: `DescribeInstances` `acs:kvstore:$regionid:$accountid:instance/$instanceid` [链接](https://help.aliyun.com/apsara/enterprise/v_3_18_0/kvstore/enterprise-developer-guide/api-authentication-rules.html) - - ecs: `DescribeInstances` `acs:ecs:$regionid:$accountid:instance/*` [链接](https://help.aliyun.com/document_detail/25497.html?spm=a2c4g.25506.0.0) + - polardb: `polardb:DescribeDBClusters` `acs:polardb:*:*:dbcluster/*` [链接](https://help.aliyun.com/document_detail/118034.html?spm=a2c4g.98094.0.0) + - kvstore: `kvstore:DescribeInstances` `acs:kvstore:$regionid:$accountid:instance/$instanceid` [链接](https://help.aliyun.com/apsara/enterprise/v_3_18_0/kvstore/enterprise-developer-guide/api-authentication-rules.html) + - ecs: `ecs:DescribeInstances` `acs:ecs:$regionid:$accountid:instance/*` [链接](https://help.aliyun.com/document_detail/25497.html?spm=a2c4g.25506.0.0) ```toml # # categraf采集周期,阿里云指标的粒度一般是60秒,建议设置不要少于60秒 @@ -70,4 +70,4 @@ timeout="5s" 5. 
Dashboard - dashboard_for_redis_kvstore_standard:适用于 [Redis/Tair 内存型(标准版)](https://cms.console.aliyun.com/metric-meta/acs_kvstore/kvstore_standard?spm=a2c4g.11186623.0.0.5ed876abviVWI8) -- dashboard_for_polardb_mysql:适用于 [云数据库POLARDB-MySQL(新版)](https://cms.console.aliyun.com/metric-meta/acs_polardb/polardb_mysql_cluster?spm=a2c4g.11186623.0.0.1f1d76abbgD9eJ) \ No newline at end of file +- dashboard_for_polardb_mysql:适用于 [云数据库POLARDB-MySQL(新版)](https://cms.console.aliyun.com/metric-meta/acs_polardb/polardb_mysql_cluster?spm=a2c4g.11186623.0.0.1f1d76abbgD9eJ) diff --git a/inputs/amd_rocm_smi/README.md b/inputs/amd_rocm_smi/README.md index ec699ae69..6170397e2 100644 --- a/inputs/amd_rocm_smi/README.md +++ b/inputs/amd_rocm_smi/README.md @@ -57,7 +57,7 @@ Check the full output by running `rocm-smi` binary manually. Linux: ```sh -rocm-smi rocm-smi -o -l -m -M -g -c -t -u -i -f -p -P -s -S -v --showreplaycount --showpids --showdriverversion --showmemvendor --showfwinfo --showproductname --showserial --showuniqueid --showbus --showpendingpages --showpagesinfo --showretiredpages --showunreservablepages --showmemuse --showvoltage --showtopo --showtopoweight --showtopohops --showtopotype --showtoponuma --showmeminfo all --json +rocm-smi -o -l -m -M -g -c -t -u -i -f -p -P -s -S -v --showreplaycount --showpids --showdriverversion --showmemvendor --showfwinfo --showproductname --showserial --showuniqueid --showbus --showpendingpages --showpagesinfo --showretiredpages --showunreservablepages --showmemuse --showvoltage --showtopo --showtopoweight --showtopohops --showtopotype --showtoponuma --showmeminfo all --json ``` Please include the output of this command if opening a GitHub issue, together @@ -81,4 +81,4 @@ appearing in the metrics upon updates. The `rocm-smi` JSON output is not perfectly homogeneous and is possibly changing in the future, hence parsing and unmarshaling can start failing upon updating ROCm. 
-Inspired by the current state of the art of the `nvidia-smi` plugin. \ No newline at end of file +Inspired by the current state of the art of the `nvidia-smi` plugin. diff --git a/inputs/amd_rocm_smi/README_CN.md b/inputs/amd_rocm_smi/README_CN.md index 6438d46bd..ab6b93e1b 100644 --- a/inputs/amd_rocm_smi/README_CN.md +++ b/inputs/amd_rocm_smi/README_CN.md @@ -47,7 +47,7 @@ Linux 环境下: ```sh -rocm-smi rocm-smi -o -l -m -M -g -c -t -u -i -f -p -P -s -S -v --showreplaycount --showpids --showdriverversion --showmemvendor --showfwinfo --showproductname --showserial --showuniqueid --showbus --showpendingpages --showpagesinfo --showretiredpages --showunreservablepages --showmemuse --showvoltage --showtopo --showtopoweight --showtopohops --showtopotype --showtoponuma --showmeminfo all --json +rocm-smi -o -l -m -M -g -c -t -u -i -f -p -P -s -S -v --showreplaycount --showpids --showdriverversion --showmemvendor --showfwinfo --showproductname --showserial --showuniqueid --showbus --showpendingpages --showpagesinfo --showretiredpages --showunreservablepages --showmemuse --showvoltage --showtopo --showtopoweight --showtopohops --showtopotype --showtoponuma --showmeminfo all --json ``` 如果在 GitHub 提交 issue,请附上此命令的输出结果以及您所使用的 ROCm 版本。 diff --git a/inputs/amd_rocm_smi/dashboard.json b/inputs/amd_rocm_smi/dashboard.json index 4963f1360..0e1af4103 100644 --- a/inputs/amd_rocm_smi/dashboard.json +++ b/inputs/amd_rocm_smi/dashboard.json @@ -48,7 +48,7 @@ "type": "timeseries", "gridPos": { "x": 16, - "y": 0, + "y": 16, "w": 8, "h": 8 }, @@ -157,4 +157,4 @@ "from": "now-1h", "to": "now" } -} \ No newline at end of file +} diff --git a/inputs/apache/README.md b/inputs/apache/README.md index 971dbe4cc..0e3e97370 100644 --- a/inputs/apache/README.md +++ b/inputs/apache/README.md @@ -17,6 +17,12 @@ scrape_uri = "http://localhost/server-status/?auto" # Optional: Skip TLS verification # insecure = false + +# Optional: Custom request headers +# custom_headers = {} + +# Optional: Log level, 
one of debug, info, warn, error +# log_level = "info" ``` ### Apache mod_status Setup diff --git a/inputs/apache/README_CN.md b/inputs/apache/README_CN.md index c903feba0..e0837f3d7 100644 --- a/inputs/apache/README_CN.md +++ b/inputs/apache/README_CN.md @@ -17,6 +17,12 @@ scrape_uri = "http://localhost/server-status/?auto" # 可选: 跳过 TLS 证书校验 # insecure = false + +# 可选: 自定义请求 Header +# custom_headers = {} + +# 可选: 日志级别,可选值为 debug、info、warn、error +# log_level = "info" ``` ### Apache mod_status 模块配置 diff --git a/inputs/apache/dashboard.json b/inputs/apache/dashboard.json index bb5101a96..44c1dd7af 100644 --- a/inputs/apache/dashboard.json +++ b/inputs/apache/dashboard.json @@ -48,7 +48,7 @@ "type": "timeseries", "gridPos": { "x": 16, - "y": 0, + "y": 8, "w": 8, "h": 8 }, @@ -103,4 +103,4 @@ "from": "now-1h", "to": "now" } -} \ No newline at end of file +} diff --git a/inputs/appdynamics/dashboard.json b/inputs/appdynamics/dashboard.json index b9ae4afb1..c62839ad9 100644 --- a/inputs/appdynamics/dashboard.json +++ b/inputs/appdynamics/dashboard.json @@ -55,7 +55,7 @@ "id": 3, "targets": [ { - "expr": "appdynamics_average_response_time__ms__current", + "expr": "appdynamics_average_response_time_ms_current", "legendFormat": "{{metric_path}}", "refId": "A" } @@ -85,4 +85,4 @@ "from": "now-1h", "to": "now" } -} \ No newline at end of file +} diff --git a/inputs/arp_packet/README.md b/inputs/arp_packet/README.md index 88c7ce0e6..d843ec4d8 100644 --- a/inputs/arp_packet/README.md +++ b/inputs/arp_packet/README.md @@ -34,7 +34,7 @@ Select the appropriate interface (e.g., `eth0`) and set it in the `eth_device` p ## Metrics - `arp_packet_request_num`: Total number of ARP requests sent from the monitored interface. -- `arp_packet_response_num`: Total number of ARP responses received on the monitored interface. +- `arp_packet_response_num`: Total number of ARP responses sent from the monitored interface. 
All metrics include the `sourceAddr` tag, which contains the bound local IPv4 address. diff --git a/inputs/arp_packet/README_CN.md b/inputs/arp_packet/README_CN.md index b155cfdf6..52f54dca7 100644 --- a/inputs/arp_packet/README_CN.md +++ b/inputs/arp_packet/README_CN.md @@ -34,7 +34,7 @@ ip addr | grep '^[0-9]' | awk -F':' '{print $2}' ## 采集指标 - `arp_packet_request_num`: 监听网卡上累计发出的 ARP 请求数 -- `arp_packet_response_num`: 监听网卡上累计收到的 ARP 响应数 +- `arp_packet_response_num`: 监听网卡上累计发出的 ARP 响应数 所有指标会附带标签 `sourceAddr`,表示绑定的本地 IPv4 地址。 diff --git a/inputs/bind/README.md b/inputs/bind/README.md index ad3244b35..a2cfc2255 100644 --- a/inputs/bind/README.md +++ b/inputs/bind/README.md @@ -33,7 +33,8 @@ gather_views = true ## Metrics -- `bind_server_*`: Global server metrics, such as total requests, queries, success, nxrrset, failure, recursion, etc. -- `bind_memory_context_*`: Internal memory usage by various BIND modules (requires `gather_memory_contexts = true`). -- `bind_view_*`: Per-view query metrics (requires `gather_views = true`). -- `bind_up`: Whether the statistics channel was reachable. +- `bind_counter_*`: Counters reported by BIND, with labels such as `type`, `url`, `source`, and `port`. +- `bind_memory_*`: BIND memory summary metrics, such as `bind_memory_total_use` and `bind_memory_in_use`. +- `bind_memory_context_*`: Internal memory usage by BIND modules, such as `bind_memory_context_total` and `bind_memory_context_in_use` (requires `gather_memory_contexts = true`). + +When `gather_views = true`, per-view counters are also exported as `bind_counter_*` with an additional `view` label. 
diff --git a/inputs/bind/README_CN.md b/inputs/bind/README_CN.md index 40d25b126..6b1bd0687 100644 --- a/inputs/bind/README_CN.md +++ b/inputs/bind/README_CN.md @@ -33,7 +33,8 @@ gather_views = true ## 采集指标 -- `bind_server_*`: BIND 服务器的全局请求数、查询数、成功/失败/拒绝的解析数等。 -- `bind_memory_context_*`: BIND 内部各模块的内存使用量(需开启 `gather_memory_contexts`)。 -- `bind_view_*`: 按 DNS View 统计的查询数据(需开启 `gather_views`)。 -- `bind_up`: 目标统计接口是否可达。 \ No newline at end of file +- `bind_counter_*`: BIND 返回的各类计数器,会附带 `type`、`url`、`source`、`port` 等标签。 +- `bind_memory_*`: BIND 内存汇总指标,例如 `bind_memory_total_use`、`bind_memory_in_use`。 +- `bind_memory_context_*`: BIND 内部各模块的内存使用量,例如 `bind_memory_context_total`、`bind_memory_context_in_use`(需开启 `gather_memory_contexts`)。 + +开启 `gather_views = true` 后,按 DNS View 统计的计数器也会以 `bind_counter_*` 上报,并额外附带 `view` 标签。 diff --git a/inputs/bind/dashboard.json b/inputs/bind/dashboard.json index 5980cd4e0..a96e87f1f 100644 --- a/inputs/bind/dashboard.json +++ b/inputs/bind/dashboard.json @@ -8,7 +8,7 @@ "schemaVersion": 30, "panels": [ { - "title": "BIND Status", + "title": "Memory Total Use", "type": "timeseries", "gridPos": { "x": 0, @@ -19,62 +19,62 @@ "id": 1, "targets": [ { - "expr": "bind_up", + "expr": "bind_memory_total_use", "legendFormat": "{{url}}", "refId": "A" } ] }, { - "title": "Queries Total", + "title": "Query Counters", "type": "timeseries", "gridPos": { "x": 8, - "y": 0, + "y": 4, "w": 8, "h": 8 }, "id": 2, "targets": [ { - "expr": "bind_server_queries", - "legendFormat": "Queries", + "expr": "sum by (__name__, url) ({__name__=~\"bind_counter_.*\", type=\"qtype\"})", + "legendFormat": "{{__name__}} {{url}}", "refId": "A" } ] }, { - "title": "Requests Total", + "title": "Request Counters", "type": "timeseries", "gridPos": { "x": 16, - "y": 0, + "y": 4, "w": 8, "h": 8 }, "id": 3, "targets": [ { - "expr": "bind_server_requests", - "legendFormat": "Requests", + "expr": "sum by (__name__, url) ({__name__=~\"bind_counter_.*\", type=\"opcode\"})", + 
"legendFormat": "{{__name__}} {{url}}", "refId": "A" } ] }, { - "title": "Responses Total", + "title": "Response Code Counters", "type": "timeseries", "gridPos": { "x": 0, - "y": 8, + "y": 4, "w": 8, "h": 8 }, "id": 4, "targets": [ { - "expr": "bind_server_responses", - "legendFormat": "Responses", + "expr": "sum by (__name__, url) ({__name__=~\"bind_counter_.*\", type=\"rcode\"})", + "legendFormat": "{{__name__}} {{url}}", "refId": "A" } ] @@ -84,14 +84,14 @@ "type": "timeseries", "gridPos": { "x": 0, - "y": 16, + "y": 12, "w": 12, "h": 8 }, "id": 5, "targets": [ { - "expr": "bind_memory_context_TotalUse", + "expr": "bind_memory_context_total", "legendFormat": "Total Use", "refId": "A" } @@ -102,14 +102,14 @@ "type": "timeseries", "gridPos": { "x": 12, - "y": 16, + "y": 12, "w": 12, "h": 8 }, "id": 6, "targets": [ { - "expr": "bind_memory_context_InUse", + "expr": "bind_memory_context_in_use", "legendFormat": "In Use", "refId": "A" } @@ -121,4 +121,4 @@ "from": "now-1h", "to": "now" } -} \ No newline at end of file +} diff --git a/inputs/cadvisor/README.md b/inputs/cadvisor/README.md index 7454103d7..14c2ab37a 100644 --- a/inputs/cadvisor/README.md +++ b/inputs/cadvisor/README.md @@ -33,6 +33,7 @@ bearer_token_file = "/path/to/token/file" ignore_label_keys = ["id","name", "container_label*"] # Label keys to explicitly choose. It is recommended to leave this empty to collect all labels. # This takes precedence over ignore_label_keys. +# When this is not ["*"], include "pod" and "namespace" if you need pod labels or annotations. 
#choose_label_keys = ["*"] timeout = "3s" @@ -73,6 +74,6 @@ The related variables are generated using the URL template fields: | `{{.Host}}` | 1.2.3.4:8080 | | `{{.Hostname}}` | 1.2.3.4 | | `{{.Port}}` | 8080 | -| `{{.Path}}` | search | +| `{{.Path}}` | /search | | `{{.Query}}` | q=keyword | | `{{.Fragment}}` | results | diff --git a/inputs/cadvisor/README_CN.md b/inputs/cadvisor/README_CN.md index dfe20e46c..57ec6074a 100644 --- a/inputs/cadvisor/README_CN.md +++ b/inputs/cadvisor/README_CN.md @@ -2,7 +2,7 @@ cadvisor 采集插件, 采集cadvisor 数据,如果是通过kubelet采集,可以附加pod的label和annotation -## Configuration +## 配置说明 ```toml # # collect interval @@ -30,6 +30,7 @@ bearer_token_file = "/path/to/token/file" # 需要忽略的label key ignore_label_keys = ["id","name", "container_label*"] # 只采集那些label key, 建议保持为空,采集所有的label。 优先级高于ignore_label_keys。 +# 放开 choose_label_keys 配置时,如果不使用 ["*"],需要包含 "pod" 和 "namespace",否则无法附加 pod 标签和 annotation。 #choose_label_keys = ["*"] timeout = "3s" @@ -97,6 +98,6 @@ func (ul *UrlLabel) GenerateLabel(u *url.URL) (string, string, error) { |{{.Host}} |1.2.3.4:8080| |{{.Hostname}}|1.2.3.4| |{{.Port}}|8080| -|{{.Path}}|search| +|{{.Path}}|/search| |{{.Query}}|q=keyword| -|{{.Fragment}}| results| \ No newline at end of file +|{{.Fragment}}|results| diff --git a/inputs/clickhouse/dashboard.json b/inputs/clickhouse/dashboard.json index e458a760e..b05bc80aa 100644 --- a/inputs/clickhouse/dashboard.json +++ b/inputs/clickhouse/dashboard.json @@ -19,7 +19,7 @@ "id": 1, "targets": [ { - "expr": "rate(clickhouse_events_InsertQuery[5m])", + "expr": "rate(clickhouse_events_insert_query[5m])", "legendFormat": "{{source}}", "refId": "A" } @@ -37,7 +37,7 @@ "id": 2, "targets": [ { - "expr": "rate(clickhouse_events_SelectQuery[5m])", + "expr": "rate(clickhouse_events_select_query[5m])", "legendFormat": "{{source}}", "refId": "A" } @@ -55,7 +55,7 @@ "id": 3, "targets": [ { - "expr": "clickhouse_metrics_MemoryTracking", + "expr": "clickhouse_metrics_memory_tracking", 
"legendFormat": "{{source}}", "refId": "A" } @@ -73,7 +73,7 @@ "id": 4, "targets": [ { - "expr": "clickhouse_asynchronous_metrics_MaxPartCountForPartition", + "expr": "clickhouse_asynchronous_metrics_max_part_count_for_partition", "legendFormat": "{{source}}", "refId": "A" } @@ -85,4 +85,4 @@ "from": "now-1h", "to": "now" } -} \ No newline at end of file +} diff --git a/inputs/conntrack/README.md b/inputs/conntrack/README.md index 62ea27090..61fdb3dcc 100644 --- a/inputs/conntrack/README.md +++ b/inputs/conntrack/README.md @@ -10,6 +10,8 @@ All metrics are recorded under the `conntrack` measurement: - `conntrack_ip_conntrack_count`: The current number of entries in the conntrack table. - `conntrack_ip_conntrack_max`: The maximum capacity of the conntrack table. +- `conntrack_nf_conntrack_count`: The current number of entries in the nf_conntrack table. +- `conntrack_nf_conntrack_max`: The maximum capacity of the nf_conntrack table. ## Alerting Recommendation @@ -17,4 +19,5 @@ You can configure an alerting rule in your monitoring system (like Prometheus or ```promql conntrack_ip_conntrack_count / conntrack_ip_conntrack_max > 0.8 +conntrack_nf_conntrack_count / conntrack_nf_conntrack_max > 0.8 ``` diff --git a/inputs/conntrack/README_CN.md b/inputs/conntrack/README_CN.md index 8de201c57..88f8827c1 100644 --- a/inputs/conntrack/README_CN.md +++ b/inputs/conntrack/README_CN.md @@ -10,6 +10,8 @@ - `conntrack_ip_conntrack_count`: 当前 conntrack 表中的连接条目数 (count)。 - `conntrack_ip_conntrack_max`: 当前 conntrack 表的最大容量限制 (size)。 +- `conntrack_nf_conntrack_count`: 当前 nf_conntrack 表中的连接条目数 (count)。 +- `conntrack_nf_conntrack_max`: 当前 nf_conntrack 表的最大容量限制 (size)。 ## 告警配置建议 @@ -17,4 +19,5 @@ ```promql conntrack_ip_conntrack_count / conntrack_ip_conntrack_max > 0.8 -``` \ No newline at end of file +conntrack_nf_conntrack_count / conntrack_nf_conntrack_max > 0.8 +``` diff --git a/inputs/conntrack/dashboard.json b/inputs/conntrack/dashboard.json index cf061ea21..a663404eb 100644 --- 
a/inputs/conntrack/dashboard.json +++ b/inputs/conntrack/dashboard.json @@ -20,8 +20,13 @@ "targets": [ { "expr": "conntrack_ip_conntrack_count / conntrack_ip_conntrack_max * 100", - "legendFormat": "Usage %", + "legendFormat": "ip_conntrack Usage %", "refId": "A" + }, + { + "expr": "conntrack_nf_conntrack_count / conntrack_nf_conntrack_max * 100", + "legendFormat": "nf_conntrack Usage %", + "refId": "B" } ] }, @@ -38,8 +43,13 @@ "targets": [ { "expr": "conntrack_ip_conntrack_count", - "legendFormat": "Count", + "legendFormat": "ip_conntrack Count", "refId": "A" + }, + { + "expr": "conntrack_nf_conntrack_count", + "legendFormat": "nf_conntrack Count", + "refId": "B" } ] } @@ -49,4 +59,4 @@ "from": "now-1h", "to": "now" } -} \ No newline at end of file +} diff --git a/inputs/cpu/README.md b/inputs/cpu/README.md index d6f967958..cb6f0aefb 100644 --- a/inputs/cpu/README.md +++ b/inputs/cpu/README.md @@ -7,7 +7,6 @@ By default, the plugin only collects global (total) CPU metrics. If you want to ## Configuration ```toml -[[instances]] # Whether to collect metrics for each individual CPU core collect_per_cpu = false ``` diff --git a/inputs/cpu/README_CN.md b/inputs/cpu/README_CN.md index 777cd9059..02f4b61fc 100644 --- a/inputs/cpu/README_CN.md +++ b/inputs/cpu/README_CN.md @@ -7,7 +7,6 @@ CPU 采集插件主要用于自动收集本机 CPU 的使用率、空闲率等 ## 配置说明 ```toml -[[instances]] # 是否采集每个独立 CPU 核心的指标 collect_per_cpu = false ``` @@ -25,4 +24,4 @@ collect_per_cpu = false ## 监控大盘 -建议将 OS 级别的监控 (如 CPU、Mem、Disk 等) 整合到统一的 System Dashboard 中。但为了方便独立查看,这里也提供了一份专门针对 CPU 的参考 Dashboard。 \ No newline at end of file +建议将 OS 级别的监控 (如 CPU、Mem、Disk 等) 整合到统一的 System Dashboard 中。但为了方便独立查看,这里也提供了一份专门针对 CPU 的参考 Dashboard。 diff --git a/inputs/dcgm/README.md b/inputs/dcgm/README.md index c534abffa..789782199 100644 --- a/inputs/dcgm/README.md +++ b/inputs/dcgm/README.md @@ -26,12 +26,6 @@ This plugin collects hardware monitoring metrics for NVIDIA GPUs by integrating # Optional: Connect to a remote hostengine # 
remote-hostengine-info = "localhost:5555" - # You can declare the collector CSV file inline directly in the config - # [instances.collector_files] - # "/etc/categraf/dcgm/default-counters.csv" = """ - # DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C) - # DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W). - # """ ``` ## Metrics diff --git a/inputs/dcgm/README_CN.md b/inputs/dcgm/README_CN.md index 4a5aad969..59f4d1c7e 100644 --- a/inputs/dcgm/README_CN.md +++ b/inputs/dcgm/README_CN.md @@ -26,12 +26,6 @@ # 可选:连接到远端的 hostengine # remote-hostengine-info = "localhost:5555" - # 直接在配置文件中内联声明 collector 文件内容 - # [instances.collector_files] - # "/etc/categraf/dcgm/default-counters.csv" = """ - # DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C) - # DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W). - # """ ``` ## 采集指标 diff --git a/inputs/disk/README.md b/inputs/disk/README.md index 1fb2ac078..8ef7c8ae6 100644 --- a/inputs/disk/README.md +++ b/inputs/disk/README.md @@ -8,9 +8,14 @@ The default configuration is already the recommended setting for most environmen ## Configuration ```toml -[[instances]] - # List of filesystem types to ignore - # ignore_fs = [...] +# Set mount_points to restrict collection to specific mount points. +# mount_points = ["/"] + +# List of filesystem types to ignore. +# ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs", "nsfs", "CDFS", "fuse.juicefs"] + +# List of mount point prefixes to ignore. +# ignore_mount_points = ["/boot", "/var/lib/kubelet/pods"] ``` ## Metrics diff --git a/inputs/disk/README_CN.md b/inputs/disk/README_CN.md index 26d4ed8e8..4143425e0 100644 --- a/inputs/disk/README_CN.md +++ b/inputs/disk/README_CN.md @@ -8,9 +8,14 @@ Disk 采集插件主要用于收集操作系统的磁盘分区使用情况。 ## 配置说明 ```toml -[[instances]] - # 是否仅忽略在 ignore_fs 列表中明确配置的文件系统,设为 false 表示忽略常见的虚拟文件系统 - # ignore_fs = [...] 
+# 设置 mount_points 后,仅采集指定挂载点。 +# mount_points = ["/"] + +# 按文件系统类型忽略挂载点。 +# ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs", "nsfs", "CDFS", "fuse.juicefs"] + +# 按挂载点路径前缀忽略。 +# ignore_mount_points = ["/boot", "/var/lib/kubelet/pods"] ``` ## 采集指标 @@ -29,4 +34,4 @@ Disk 采集插件主要用于收集操作系统的磁盘分区使用情况。 ## 监控大盘 -建议将 OS 级别的监控 (如 CPU、Mem、Disk 等) 整合到统一的 System Dashboard 中。但为了方便独立查看,这里也提供了一份专门针对 Disk 分区使用情况的参考 Dashboard。 \ No newline at end of file +建议将 OS 级别的监控 (如 CPU、Mem、Disk 等) 整合到统一的 System Dashboard 中。但为了方便独立查看,这里也提供了一份专门针对 Disk 分区使用情况的参考 Dashboard。 diff --git a/inputs/dns_query/README.md b/inputs/dns_query/README.md index 403214967..3e668514b 100644 --- a/inputs/dns_query/README.md +++ b/inputs/dns_query/README.md @@ -9,10 +9,10 @@ It is not necessary to enable this plugin on every machine. We recommend enablin ```toml [[instances]] - # Automatically use the DNS servers from the local machine's /etc/resolv.conf + # Automatically use DNS servers from /etc/resolv.conf when servers is empty auto_detect_local_dns_server = true - ## Manually specify external DNS servers to query + ## Manually specify DNS servers to query servers = ["223.5.5.5", "114.114.114.114", "119.29.29.29"] ## Network protocol to use, such as "udp" or "tcp" @@ -21,7 +21,7 @@ It is not necessary to enable this plugin on every machine. 
We recommend enablin ## List of domains or subdomains to query domains = ["www.huaweicloud.com", "www.baidu.com", "api.yourcompany.com"] - ## Query record type (A, AAAA, CNAME, MX, NS, PTR, TXT, SOA, SPF, SRV) + ## Query record type (A, AAAA, ANY, CNAME, MX, NS, PTR, TXT, SOA, SPF, SRV) record_type = "A" ## DNS server port diff --git a/inputs/dns_query/README_CN.md b/inputs/dns_query/README_CN.md index c0fb9edff..a5b891985 100644 --- a/inputs/dns_query/README_CN.md +++ b/inputs/dns_query/README_CN.md @@ -9,10 +9,10 @@ DNS Query 采集插件用于对 DNS 服务器的响应质量进行持续监测 ```toml [[instances]] - # 是否自动使用本机的 /etc/resolv.conf 中的 DNS 服务器进行查询 + # 当 servers 为空时,是否自动使用本机 /etc/resolv.conf 中的 DNS 服务器进行查询 auto_detect_local_dns_server = true - ## 手动指定要查询的外部 DNS 服务器 (当上一项为 false 时生效) + ## 手动指定要查询的 DNS 服务器 servers = ["223.5.5.5", "114.114.114.114", "119.29.29.29"] ## 指定查询协议,如 "udp" 或 "tcp" @@ -21,7 +21,7 @@ DNS Query 采集插件用于对 DNS 服务器的响应质量进行持续监测 ## 需要重点监测的域名列表 domains = ["www.huaweicloud.com", "www.baidu.com", "api.yourcompany.com"] - ## 查询记录的类型 (A, AAAA, CNAME, MX, NS, PTR, TXT, SOA, SPF, SRV) + ## 查询记录的类型 (A, AAAA, ANY, CNAME, MX, NS, PTR, TXT, SOA, SPF, SRV) record_type = "A" ## DNS 服务端口 diff --git a/inputs/docker/README.md b/inputs/docker/README.md index f8094c343..338b55cf9 100644 --- a/inputs/docker/README.md +++ b/inputs/docker/README.md @@ -65,4 +65,4 @@ The plugin collects comprehensive container resource usage. Key metrics include: - `docker_container_mem_limit`: Container Memory limit (Bytes) - `docker_container_net_rx_bytes`: Container network received bytes - `docker_container_net_tx_bytes`: Container network transmitted bytes -- `docker_container_status`: The running status of the container +- `docker_container_status_*`: Container state fields such as PID, exit code, restart count, and uptime. The current state is exposed as the `container_status` tag. 
diff --git a/inputs/docker/README_CN.md b/inputs/docker/README_CN.md index c0fac76b3..7f8ad0a95 100644 --- a/inputs/docker/README_CN.md +++ b/inputs/docker/README_CN.md @@ -66,4 +66,4 @@ volumes: - `docker_container_mem_limit`: 容器内存限制配额 (Bytes) - `docker_container_net_rx_bytes`: 容器网络接收字节数 (Bytes) - `docker_container_net_tx_bytes`: 容器网络发送字节数 (Bytes) -- `docker_container_status`: 容器运行状态 (通常以枚举值或 boolean 形式体现) \ No newline at end of file +- `docker_container_status_*`: 容器状态相关字段,如 PID、退出码、重启次数和运行时长。当前状态会通过 `container_status` 标签体现。 diff --git a/inputs/elasticsearch/README.md b/inputs/elasticsearch/README.md index 3e53bb86b..07adf98ef 100644 --- a/inputs/elasticsearch/README.md +++ b/inputs/elasticsearch/README.md @@ -483,3 +483,17 @@ ES 7.x 支持基于角色的访问控制(RBACs)。`elasticsearch` 插件需 | elasticsearch_slm_stats_snapshots_deleted_total | counter | 按策略删除的快照数 | | elasticsearch_slm_stats_snapshot_deletion_failures_total | counter | 按策略快照删除失败次数 | | elasticsearch_slm_stats_operation_mode | gauge | SLM操作模式(运行中,停止中,已停止) | + +#### `num_most_recent_indices` + +设置为大于 0 时,插件会对带日期或版本后缀的动态索引只采集最近 N 个索引的指标,可显著减少历史动态索引带来的指标量。该配置可与 `indices_include` 一起使用。 + +#### `dynamic_index_matcher_regexp` + +与 `num_most_recent_indices` 配合使用,用于指定动态索引后缀的匹配逻辑。默认值为: + +```toml +dynamic_index_matcher_regexp = ["(?P(?:\\d{4}|\\d{2})[.-]?(?:\\d{2})[.-]?(?:\\d{2})?[.-]?(?:\\d{2})?)$","[\\.-._]\\d+(\\.\\d+){0,2}$"] +``` + +默认规则支持匹配 `YYYY.MM.DD`、`YYYY-MM-DD`、`YYYYMMDD`、`YYYY-MM-DD-HH`、`YY.MM.DD`、`YY-MM-DD` 以及类似 `v1_001`、`v0.1`、`v5.2.3` 这类版本后缀,也可以按实际索引命名自行扩展。 diff --git a/inputs/exec/dashboard.json b/inputs/exec/dashboard.json index a94b8a8ce..130ea3741 100644 --- a/inputs/exec/dashboard.json +++ b/inputs/exec/dashboard.json @@ -17,6 +17,10 @@ "h": 8 }, "id": 1, + "options": { + "content": "Exec metrics are defined by the commands configured in exec.toml. 
Build panels from the metric names emitted by your scripts.", + "mode": "markdown" + }, "targets": [] } ], @@ -25,4 +29,4 @@ "from": "now-1h", "to": "now" } -} \ No newline at end of file +} diff --git a/inputs/filecount/README_CN.md b/inputs/filecount/README_CN.md index 358995372..8eea3bedb 100644 --- a/inputs/filecount/README_CN.md +++ b/inputs/filecount/README_CN.md @@ -44,5 +44,5 @@ mtime = "0s" - `filecount_count`: 匹配到的文件总数 - `filecount_size_bytes`: 匹配到的文件总大小 (Bytes) -- `filecount_oldest_file_timestamp`: 最早创建/修改的文件的 Unix 时间戳 (纳秒) -- `filecount_newest_file_timestamp`: 最新创建/修改的文件的 Unix 时间戳 (纳秒) +- `filecount_oldest_file_timestamp`: 最早修改的文件的 Unix 时间戳 (纳秒) +- `filecount_newest_file_timestamp`: 最新修改的文件的 Unix 时间戳 (纳秒) diff --git a/inputs/gnmi/README_CN.md b/inputs/gnmi/README_CN.md index 22a925cbb..2520778c1 100644 --- a/inputs/gnmi/README_CN.md +++ b/inputs/gnmi/README_CN.md @@ -24,7 +24,7 @@ redial = "10s" ## gRPC 的最大消息大小限制,默认 4MB - max_msg_size = "4194304" + max_msg_size = 4194304 ## TLS 认证配置 (如果设备启用了 TLS) # enable_tls = false diff --git a/inputs/gnmi/dashboard.json b/inputs/gnmi/dashboard.json index 2da88eed4..bccfbab3e 100644 --- a/inputs/gnmi/dashboard.json +++ b/inputs/gnmi/dashboard.json @@ -17,6 +17,10 @@ "h": 8 }, "id": 1, + "options": { + "content": "gNMI metrics depend on the subscribed YANG paths and the subscription name configured in gnmi.toml. 
Build panels from the emitted measurement and field names.", + "mode": "markdown" + }, "targets": [] } ], @@ -25,4 +29,4 @@ "from": "now-1h", "to": "now" } -} \ No newline at end of file +} diff --git a/inputs/googlecloud/README.md b/inputs/googlecloud/README.md index 8579b8e4b..276a6fb25 100644 --- a/inputs/googlecloud/README.md +++ b/inputs/googlecloud/README.md @@ -4,8 +4,10 @@ This plugin pulls cloud resource monitoring metrics from the Google Cloud Platfo ## Prerequisites -Before using this plugin, ensure that the provided GCP Service Account credentials have the following permission: -- `monitoring.read` (Monitoring Viewer) +Before using this plugin, ensure that the provided GCP Service Account credentials can read Cloud Monitoring time series. For OAuth scopes, use: +- `https://www.googleapis.com/auth/monitoring.read` + +For IAM, grant a read-only monitoring role such as Monitoring Viewer. ## Configuration diff --git a/inputs/googlecloud/README_CN.md b/inputs/googlecloud/README_CN.md index 14e8892e6..14ac16dfe 100644 --- a/inputs/googlecloud/README_CN.md +++ b/inputs/googlecloud/README_CN.md @@ -4,8 +4,10 @@ ## 前置条件 -使用该插件前,您需要确保提供的 GCP 服务账号 (Service Account) 凭证拥有以下权限: -- `monitoring.read` (监控查看者) +使用该插件前,您需要确保提供的 GCP 服务账号 (Service Account) 凭证可以读取 Cloud Monitoring 时序数据。OAuth scope 使用: +- `https://www.googleapis.com/auth/monitoring.read` + +IAM 侧请授予只读监控角色,例如 Monitoring Viewer。 ## 配置说明 diff --git a/inputs/googlecloud/dashboard.json b/inputs/googlecloud/dashboard.json index 8ecb33fb4..e0fc49402 100644 --- a/inputs/googlecloud/dashboard.json +++ b/inputs/googlecloud/dashboard.json @@ -17,6 +17,10 @@ "h": 8 }, "id": 1, + "options": { + "content": "Google Cloud Monitoring metrics depend on the configured filter and GCP services in use. 
Build panels from the metric names emitted by this plugin.", + "mode": "markdown" + }, "targets": [] } ], @@ -25,4 +29,4 @@ "from": "now-1h", "to": "now" } -} \ No newline at end of file +} diff --git a/inputs/greenplum/README.md b/inputs/greenplum/README.md index 9a8d68f06..70eb9b7f1 100644 --- a/inputs/greenplum/README.md +++ b/inputs/greenplum/README.md @@ -14,10 +14,7 @@ The plugin periodically executes the `gpstate -m` command in the background and # # Collect interval # interval = 15 -[[instances]] # There is no instance-specific configuration for this plugin. Just ensure gpstate is in the PATH. -# You can append labels to distinguish different clusters: -# labels = { cluster="gp-cluster-1" } ``` ## Metrics diff --git a/inputs/greenplum/README_CN.md b/inputs/greenplum/README_CN.md index 4a18e23bc..82ee8c1c9 100644 --- a/inputs/greenplum/README_CN.md +++ b/inputs/greenplum/README_CN.md @@ -14,10 +14,7 @@ Greenplum 采集插件用于监控 Greenplum 数据库集群的镜像节点 (Mir # # 采集周期 # interval = 15 -[[instances]] # 该插件没有实例级别的特殊配置。只需确保环境中有 gpstate 即可。 -# 可以加一些标签来区分不同集群 -# labels = { cluster="gp-cluster-1" } ``` ## 采集指标 diff --git a/inputs/hadoop_hdfs/README.md b/inputs/hadoop_hdfs/README.md index 72cd489b7..6048fcf20 100644 --- a/inputs/hadoop_hdfs/README.md +++ b/inputs/hadoop_hdfs/README.md @@ -15,4 +15,4 @@ Steps: ## Metrics and Dashboards -Because the actual metric collection is handled by the Jolokia Agent, the metrics collected depend entirely on the `metrics` blocks defined in your configuration file. In your Grafana or Nightingale dashboards, simply query metrics starting with `jolokia_` or whatever `name_prefix` you defined in the configuration. +Because the actual metric collection is handled by the Jolokia Agent, the metrics collected depend on the `[[instances.metric]]` blocks defined in your configuration file. The provided template uses `metrics_name_prefix`, so query metrics starting with `hadoop_hdfs_namenode_` or `hadoop_hdfs_datanode_`. 
diff --git a/inputs/hadoop_hdfs/README_CN.md b/inputs/hadoop_hdfs/README_CN.md index f62bbd237..7f2a11ad5 100644 --- a/inputs/hadoop_hdfs/README_CN.md +++ b/inputs/hadoop_hdfs/README_CN.md @@ -15,4 +15,4 @@ Categraf 监控 Hadoop HDFS 时,不需要专门的独立的二进制原生插 ## 采集指标与大盘 -由于实际上使用的是 Jolokia Agent,采集到的指标完全取决于配置文件中配置的 `metrics`。请在您的 Grafana 或夜莺监控大盘中直接使用 `jolokia_` 或者配置中指定的 `name_prefix` 作为前缀来查询指标。 +由于实际上使用的是 Jolokia Agent,采集到的指标取决于配置文件中的 `[[instances.metric]]`。当前模板使用 `metrics_name_prefix`,请在 Grafana 或夜莺监控大盘中查询 `hadoop_hdfs_namenode_` 或 `hadoop_hdfs_datanode_` 开头的指标。 diff --git a/inputs/hadoop_hdfs/dashboard.json b/inputs/hadoop_hdfs/dashboard.json index c41d3d994..9ba64e37f 100644 --- a/inputs/hadoop_hdfs/dashboard.json +++ b/inputs/hadoop_hdfs/dashboard.json @@ -17,6 +17,10 @@ "h": 8 }, "id": 1, + "options": { + "content": "Hadoop HDFS metrics are collected through jolokia_agent. The provided template emits metrics prefixed with hadoop_hdfs_namenode_ and hadoop_hdfs_datanode_.", + "mode": "markdown" + }, "targets": [] } ], @@ -25,4 +29,4 @@ "from": "now-1h", "to": "now" } -} \ No newline at end of file +} diff --git a/inputs/http_response/README.md b/inputs/http_response/README.md index 55084db79..c92f0bc51 100644 --- a/inputs/http_response/README.md +++ b/inputs/http_response/README.md @@ -45,11 +45,12 @@ method = "POST" ## 指标说明 -- `http_response_dns_request` DNS 解析耗时,单位毫秒 -- `http_response_tcp_connect` TCP 建连耗时,单位毫秒 -- `http_response_tls_handshake` TLS 握手耗时,单位毫秒 -- `http_response_first_byte` 首包响应耗时,单位毫秒 -- `http_response_total_cost` 请求总耗时,单位毫秒 +- `http_response_dns_time` DNS 解析耗时,单位毫秒(需开启 `trace`) +- `http_response_connect_time` TCP 建连耗时,单位毫秒(需开启 `trace`) +- `http_response_tls_time` TLS 握手耗时,单位毫秒(需开启 `trace`) +- `http_response_first_response_time` 首包响应耗时,单位毫秒(需开启 `trace`) +- `http_response_end_response_time` 首包之后到请求结束的耗时,单位毫秒 +- `http_response_response_time_ms` 请求总耗时,单位毫秒 - `http_response_response_time` 响应耗时,单位秒,保留为兼容旧指标 - `http_response_response_code` HTTP 响应码 - 
`http_response_result_code` 探测结果码 @@ -57,9 +58,9 @@ method = "POST" 说明: -- 使用 IP 直连或连接复用时(HTTP/HTTPS 均可能),部分阶段指标可能为 `-1` +- 未开启 `trace`、使用 IP 直连或连接复用时(HTTP/HTTPS 均可能),部分阶段指标可能不存在 - `http_response_cert_expire_timestamp` 仅在 HTTPS 目标且成功建立 TLS 连接时输出 ## 监控大盘和告警规则 -该 README 的同级目录下,提供了 dashboard.json 就是监控大盘的配置,alerts.json 是告警规则,可以导入夜莺使用。 \ No newline at end of file +该 README 的同级目录下,提供了 dashboard.json 就是监控大盘的配置,alerts.json 是告警规则,可以导入夜莺使用。 diff --git a/inputs/huatuo/dashboard.json b/inputs/huatuo/dashboard.json index be438bd2d..f5147f4dc 100644 --- a/inputs/huatuo/dashboard.json +++ b/inputs/huatuo/dashboard.json @@ -17,6 +17,10 @@ "h": 8 }, "id": 1, + "options": { + "content": "Huatuo eBPF metrics depend on the enabled probes and runtime environment. Build panels from the emitted huatuo metrics.", + "mode": "markdown" + }, "targets": [] } ], @@ -25,4 +29,4 @@ "from": "now-1h", "to": "now" } -} \ No newline at end of file +} diff --git a/inputs/ipvs/README.md b/inputs/ipvs/README.md index 045e3c944..857bcbe9f 100644 --- a/inputs/ipvs/README.md +++ b/inputs/ipvs/README.md @@ -12,7 +12,6 @@ In order for this plugin to communicate over netlink sockets, the Categraf proce ```toml # Collect virtual and real server stats from Linux IPVS -[[instances]] # No specific configuration is required. ``` @@ -20,7 +19,7 @@ In order for this plugin to communicate over netlink sockets, the Categraf proce Servers will contain tags identifying how they were configured, using either `address` + `port` + `protocol` *OR* `fwmark`. This corresponds to how you would normally configure a virtual server using `ipvsadm`. -### 1. 
ipvs_virtual_server +### Virtual server samples - **Tags:** - `sched` (the scheduler in use) - `netmask` (the mask used for determining affinity) @@ -29,14 +28,14 @@ Servers will contain tags identifying how they were configured, using either `ad - `port` - `protocol` - `fwmark` -- **Fields:** - - `connections` - - `pkts_in` / `pkts_out` - - `bytes_in` / `bytes_out` - - `pps_in` / `pps_out` - - `cps` - -### 2. ipvs_real_server +- **Metrics:** + - `ipvs_connections` + - `ipvs_pkts_in` / `ipvs_pkts_out` + - `ipvs_bytes_in` / `ipvs_bytes_out` + - `ipvs_pps_in` / `ipvs_pps_out` + - `ipvs_cps` + +### Real server samples - **Tags:** - `address` - `port` @@ -45,11 +44,11 @@ Servers will contain tags identifying how they were configured, using either `ad - `virtual_port` - `virtual_protocol` - `virtual_fwmark` -- **Fields:** - - `active_connections` - - `inactive_connections` - - `connections` - - `pkts_in` / `pkts_out` - - `bytes_in` / `bytes_out` - - `pps_in` / `pps_out` - - `cps` +- **Metrics:** + - `ipvs_active_connections` + - `ipvs_inactive_connections` + - `ipvs_connections` + - `ipvs_pkts_in` / `ipvs_pkts_out` + - `ipvs_bytes_in` / `ipvs_bytes_out` + - `ipvs_pps_in` / `ipvs_pps_out` + - `ipvs_cps` diff --git a/inputs/ipvs/README_CN.md b/inputs/ipvs/README_CN.md index 6410bdd51..2fc73f2d2 100644 --- a/inputs/ipvs/README_CN.md +++ b/inputs/ipvs/README_CN.md @@ -13,7 +13,6 @@ ```toml # 采集 Linux IPVS 的虚拟和真实服务器指标 -[[instances]] # 无需任何特殊配置,只需启用即可 ``` @@ -21,7 +20,7 @@ 采集的指标会自动打上标签,以标识虚拟服务器的配置方式(例如,使用 `address` + `port` + `protocol` 或者使用 `fwmark` 配置)。这与您平时使用 `ipvsadm` 配置虚拟服务器的方式一致。 -### 1. ipvs_virtual_server +### 虚拟服务器样本 表示虚拟服务器 (负载均衡前端)。 - **Tags:** - `sched`: 使用的调度算法 (如 rr, wrr) @@ -31,25 +30,25 @@ - `port`: 端口 - `protocol`: 协议 (tcp/udp) - `fwmark`: 防火墙标记 -- **Fields (指标):** - - `connections`: 总连接数 - - `pkts_in` / `pkts_out`: 收发数据包总数 - - `bytes_in` / `bytes_out`: 收发字节总数 - - `pps_in` / `pps_out`: 每秒收发数据包速率 - - `cps`: 每秒新建连接数 - -### 2. 
ipvs_real_server +- **Metrics (指标):** + - `ipvs_connections`: 总连接数 + - `ipvs_pkts_in` / `ipvs_pkts_out`: 收发数据包总数 + - `ipvs_bytes_in` / `ipvs_bytes_out`: 收发字节总数 + - `ipvs_pps_in` / `ipvs_pps_out`: 每秒收发数据包速率 + - `ipvs_cps`: 每秒新建连接数 + +### 真实服务器样本 表示真实服务器 (后端的真实节点)。 - **Tags:** - `address`: Real Server IP - `port`: Real Server 端口 - `address_family`: inet 或 inet6 - `virtual_address` / `virtual_port` / `virtual_protocol` / `virtual_fwmark`: 其所属的虚拟服务器的信息 -- **Fields (指标):** - - `active_connections`: 活跃连接数 - - `inactive_connections`: 非活跃连接数 - - `connections`: 总连接数 - - `pkts_in` / `pkts_out`: 收发数据包总数 - - `bytes_in` / `bytes_out`: 收发字节总数 - - `pps_in` / `pps_out`: 每秒收发数据包速率 - - `cps`: 每秒新建连接数 \ No newline at end of file +- **Metrics (指标):** + - `ipvs_active_connections`: 活跃连接数 + - `ipvs_inactive_connections`: 非活跃连接数 + - `ipvs_connections`: 总连接数 + - `ipvs_pkts_in` / `ipvs_pkts_out`: 收发数据包总数 + - `ipvs_bytes_in` / `ipvs_bytes_out`: 收发字节总数 + - `ipvs_pps_in` / `ipvs_pps_out`: 每秒收发数据包速率 + - `ipvs_cps`: 每秒新建连接数 diff --git a/inputs/ipvs/dashboard.json b/inputs/ipvs/dashboard.json index 8ca371191..13573a9d6 100644 --- a/inputs/ipvs/dashboard.json +++ b/inputs/ipvs/dashboard.json @@ -19,7 +19,7 @@ "id": 1, "targets": [ { - "expr": "ipvs_real_server_active_connections", + "expr": "ipvs_active_connections", "legendFormat": "RS: {{address}}:{{port}}", "refId": "A" } @@ -37,14 +37,14 @@ "id": 2, "targets": [ { - "expr": "ipvs_real_server_inactive_connections", + "expr": "ipvs_inactive_connections", "legendFormat": "RS: {{address}}:{{port}}", "refId": "A" } ] }, { - "title": "IPVS Virtual Server Connections", + "title": "IPVS Connections", "type": "timeseries", "gridPos": { "x": 0, @@ -55,14 +55,14 @@ "id": 3, "targets": [ { - "expr": "ipvs_virtual_server_connections", - "legendFormat": "VS: {{address}}:{{port}}", + "expr": "ipvs_connections", + "legendFormat": "{{address}}:{{port}}", "refId": "A" } ] }, { - "title": "IPVS Virtual Server CPS", + "title": "IPVS CPS", "type": "timeseries", 
"gridPos": { "x": 12, @@ -73,8 +73,8 @@ "id": 4, "targets": [ { - "expr": "ipvs_virtual_server_cps", - "legendFormat": "VS: {{address}}:{{port}}", + "expr": "ipvs_cps", + "legendFormat": "{{address}}:{{port}}", "refId": "A" } ] @@ -85,4 +85,4 @@ "from": "now-1h", "to": "now" } -} \ No newline at end of file +} diff --git a/inputs/jboss/README.md b/inputs/jboss/README.md index a7cba428b..4c268d7dd 100644 --- a/inputs/jboss/README.md +++ b/inputs/jboss/README.md @@ -15,5 +15,4 @@ Steps: ## Metrics and Dashboards -Because the actual metric collection is handled by the Jolokia Agent, the metrics collected depend entirely on the `metrics` blocks defined in your configuration file. Common metrics include JVM memory, thread counts, JBoss Web connections, and sessions. -In your Grafana or Nightingale dashboards, simply query metrics starting with `jolokia_` or whatever `name_prefix` you defined in the configuration. +Because the actual metric collection is handled by the Jolokia Agent, the metrics collected depend on the `[[instances.metric]]` blocks defined in your configuration file. The provided template uses `metrics_name_prefix = "jboss_"`, so query metrics starting with `jboss_`. 
diff --git a/inputs/jboss/README_CN.md b/inputs/jboss/README_CN.md index 93ba3fdad..89c43cd94 100644 --- a/inputs/jboss/README_CN.md +++ b/inputs/jboss/README_CN.md @@ -15,5 +15,4 @@ Categraf 监控 JBoss (WildFly) 时,不需要专门的独立原生插件。JBo ## 采集指标与大盘 -由于实际上使用的是 Jolokia Agent,采集到的指标完全取决于配置文件中配置的 `metrics`。常见的指标包括 JVM 内存、线程数、JBoss Web 连接数、会话数等。 -请在您的 Grafana 或夜莺监控大盘中直接使用 `jolokia_` 或者配置中指定的 `name_prefix` 作为前缀来查询指标。 +由于实际上使用的是 Jolokia Agent,采集到的指标取决于配置文件中的 `[[instances.metric]]`。当前模板使用 `metrics_name_prefix = "jboss_"`,请在 Grafana 或夜莺监控大盘中查询 `jboss_` 开头的指标。 diff --git a/inputs/jboss/dashboard.json b/inputs/jboss/dashboard.json index 3098b9b9e..bd3dcfbba 100644 --- a/inputs/jboss/dashboard.json +++ b/inputs/jboss/dashboard.json @@ -17,6 +17,10 @@ "h": 8 }, "id": 1, + "options": { + "content": "JBoss metrics are collected through jolokia_agent. The provided template emits metrics prefixed with jboss_.", + "mode": "markdown" + }, "targets": [] } ], @@ -25,4 +29,4 @@ "from": "now-1h", "to": "now" } -} \ No newline at end of file +} diff --git a/inputs/jenkins/dashboard.json b/inputs/jenkins/dashboard.json index 1752b1f42..e60610869 100644 --- a/inputs/jenkins/dashboard.json +++ b/inputs/jenkins/dashboard.json @@ -26,7 +26,7 @@ ] }, { - "title": "Jenkins Job Result (0=Success,1=Fail)", + "title": "Jenkins Job Result Code", "type": "timeseries", "gridPos": { "x": 12, @@ -121,4 +121,4 @@ "from": "now-1h", "to": "now" } -} \ No newline at end of file +} diff --git a/inputs/jolokia/dashboard.json b/inputs/jolokia/dashboard.json index ef71f966c..3047fb55e 100644 --- a/inputs/jolokia/dashboard.json +++ b/inputs/jolokia/dashboard.json @@ -17,6 +17,10 @@ "h": 8 }, "id": 1, + "options": { + "content": "This directory contains the shared Jolokia library used by jolokia_agent and jolokia_proxy. 
Query metrics from the concrete Jolokia input configuration you enable.", + "mode": "markdown" + }, "targets": [] } ], @@ -25,4 +29,4 @@ "from": "now-1h", "to": "now" } -} \ No newline at end of file +} diff --git a/inputs/jolokia_agent/dashboard.json b/inputs/jolokia_agent/dashboard.json index 0038c7e88..cd673bbad 100644 --- a/inputs/jolokia_agent/dashboard.json +++ b/inputs/jolokia_agent/dashboard.json @@ -17,6 +17,10 @@ "h": 8 }, "id": 1, + "options": { + "content": "Jolokia Agent metrics depend on the configured MBeans, paths, and metrics_name_prefix. Build panels from the emitted metric names in your jolokia_agent configuration.", + "mode": "markdown" + }, "targets": [] } ], @@ -25,4 +29,4 @@ "from": "now-1h", "to": "now" } -} \ No newline at end of file +} diff --git a/inputs/jolokia_proxy/dashboard.json b/inputs/jolokia_proxy/dashboard.json index 4634de6e1..70d5fb2fb 100644 --- a/inputs/jolokia_proxy/dashboard.json +++ b/inputs/jolokia_proxy/dashboard.json @@ -17,6 +17,10 @@ "h": 8 }, "id": 1, + "options": { + "content": "Jolokia Proxy metrics depend on the configured targets, MBeans, paths, and metrics_name_prefix. 
Build panels from the emitted metric names in your jolokia_proxy configuration.", + "mode": "markdown" + }, "targets": [] } ], @@ -25,4 +29,4 @@ "from": "now-1h", "to": "now" } -} \ No newline at end of file +} diff --git a/inputs/kafka/README.md b/inputs/kafka/README.md index 398e69775..705bc2819 100644 --- a/inputs/kafka/README.md +++ b/inputs/kafka/README.md @@ -1,6 +1,6 @@ # kafka -kafka 监控采集插件,由kafka-exporter(https://github.com/davidmparrott/kafka_exporter)封装而来。 +kafka 监控采集插件,由 [kafka-exporter](https://github.com/davidmparrott/kafka_exporter) 封装而来。 ## Configuration diff --git a/inputs/kafka_connect/README.md b/inputs/kafka_connect/README.md index 41c32020f..f01dbc236 100644 --- a/inputs/kafka_connect/README.md +++ b/inputs/kafka_connect/README.md @@ -15,5 +15,5 @@ Steps: ## Metrics and Dashboards -Because the actual metric collection is handled by the Jolokia Agent, the metrics collected depend entirely on the `metrics` blocks defined in your configuration file. Common metrics include Source/Sink Task status, commit latency, and throughput. -In your Grafana or Nightingale dashboards, simply query the mapped JMX metrics prefix defined in your configuration. +Because the actual metric collection is handled by the Jolokia Agent, the metrics collected depend entirely on the `[[instances.metric]]` blocks defined in your configuration file. Common metrics include Source/Sink Task status, commit latency, and throughput. +The provided template uses `metrics_name_prefix = "kafka_connect_"`, so query metrics starting with `kafka_connect_` in Grafana or Nightingale. 
diff --git a/inputs/kafka_connect/README_CN.md b/inputs/kafka_connect/README_CN.md index 74f9d3655..67ad301c8 100644 --- a/inputs/kafka_connect/README_CN.md +++ b/inputs/kafka_connect/README_CN.md @@ -15,5 +15,5 @@ Categraf 监控 Kafka Connect 时,不需要专门的独立原生插件。Kafka ## 采集指标与大盘 -由于实际上使用的是 Jolokia Agent,采集到的指标完全取决于配置文件中配置的 `metrics`。常见的指标包括 Source/Sink Task 的运行状态、提交延迟、吞吐量等。 -请在您的 Grafana 或夜莺监控大盘中直接使用对应的 JMX 映射前缀查询指标。 +由于实际上使用的是 Jolokia Agent,采集到的指标完全取决于配置文件中的 `[[instances.metric]]`。常见的指标包括 Source/Sink Task 的运行状态、提交延迟、吞吐量等。 +当前模板使用 `metrics_name_prefix = "kafka_connect_"`,请在 Grafana 或夜莺中查询 `kafka_connect_` 开头的指标。 diff --git a/inputs/kafka_connect/dashboard.json b/inputs/kafka_connect/dashboard.json index db1760d04..0eb1aa0bc 100644 --- a/inputs/kafka_connect/dashboard.json +++ b/inputs/kafka_connect/dashboard.json @@ -17,6 +17,10 @@ "h": 8 }, "id": 1, + "options": { + "content": "Kafka Connect metrics are collected through jolokia_agent. Configure kafka-connect.toml and build panels from metrics prefixed with kafka_connect_.", + "mode": "markdown" + }, "targets": [] } ], @@ -25,4 +29,4 @@ "from": "now-1h", "to": "now" } -} \ No newline at end of file +} diff --git a/inputs/kernel/README.md b/inputs/kernel/README.md index 31a98f9e7..361e1d6b9 100644 --- a/inputs/kernel/README.md +++ b/inputs/kernel/README.md @@ -9,7 +9,6 @@ The data is typically sourced from `/proc/stat` and `/proc/vmstat`. ```toml # Collect Linux Kernel metrics -[[instances]] # No specific configuration is required. 
``` diff --git a/inputs/kernel/README_CN.md b/inputs/kernel/README_CN.md index b4d373f17..313675ae5 100644 --- a/inputs/kernel/README_CN.md +++ b/inputs/kernel/README_CN.md @@ -9,7 +9,6 @@ ```toml # 采集 Linux 系统的 Kernel 指标 -[[instances]] # 无需任何特殊配置,只需启用即可 ``` @@ -24,4 +23,4 @@ ## 监控大盘 该插件采集的 Kernel 指标通常属于服务器基础监控的一部分,因此在实际应用中往往会与 CPU、内存等指标一起放在全局的 `System` 大盘中。 -为方便单独查看测试,这里也提供了一个简单的 Kernel 专属监控大盘。 \ No newline at end of file +为方便单独查看测试,这里也提供了一个简单的 Kernel 专属监控大盘。 diff --git a/inputs/kernel_vmstat/README.md b/inputs/kernel_vmstat/README.md index dd10099f2..833d3d740 100644 --- a/inputs/kernel_vmstat/README.md +++ b/inputs/kernel_vmstat/README.md @@ -8,7 +8,6 @@ Since `/proc/vmstat` contains a large number of metrics, we use a whitelist mech ```toml # Collect kernel vmstat metrics from /proc/vmstat -[[instances]] # No other settings are needed, the white_list below controls which fields are collected. [white_list] diff --git a/inputs/kube_proxy/README.md b/inputs/kube_proxy/README.md index 3e255d18e..f99d0162c 100644 --- a/inputs/kube_proxy/README.md +++ b/inputs/kube_proxy/README.md @@ -12,6 +12,17 @@ Steps: 1. Add a new `[[instances]]` block in your `conf/input.prometheus/prometheus.toml` for kube-proxy. 2. Ensure that Categraf can access the kube-proxy metrics endpoint (typically `127.0.0.1:10249/metrics` or `NodeIP:10249`). When running as a DaemonSet, this is usually accessed via the Node IP. 3. Modify the `urls` in the configuration to point to the correct address. +4. Set `url_label_key = "ident"` and `labels = { job = "kube-proxy" }`, because the dashboard filters metrics by `job` and groups instances by `ident`. 
+ +Example: + +```toml +[[instances]] +urls = ["http://127.0.0.1:10249/metrics"] +url_label_key = "ident" +url_label_value = "{{.Host}}" +labels = { job = "kube-proxy" } +``` ## Metrics and Dashboards diff --git a/inputs/kube_proxy/README_CN.md b/inputs/kube_proxy/README_CN.md index 3569b96cf..c4dc96786 100644 --- a/inputs/kube_proxy/README_CN.md +++ b/inputs/kube_proxy/README_CN.md @@ -12,6 +12,17 @@ 1. 在 `conf/input.prometheus/prometheus.toml` 中新增一个用于抓取 kube-proxy 的 `[[instances]]` 配置块。 2. 确保您的 Kubernetes 集群中,kube-proxy 的 metrics 接口 (通常是 `127.0.0.1:10249/metrics` 或者节点 IP 的 `10249` 端口) 可以被 Categraf 访问到。如果在 DaemonSet 模式下,通常通过 Node IP 访问。 3. 修改配置中的 `urls` 指向正确的地址。 +4. 设置 `url_label_key = "ident"` 和 `labels = { job = "kube-proxy" }`,因为本目录下的 Dashboard 使用 `job` 过滤指标,并使用 `ident` 区分实例。 + +示例: + +```toml +[[instances]] +urls = ["http://127.0.0.1:10249/metrics"] +url_label_key = "ident" +url_label_value = "{{.Host}}" +labels = { job = "kube-proxy" } +``` ## 采集指标与监控大盘 diff --git a/inputs/kubelet/README.md b/inputs/kubelet/README.md index 0787f136b..e1f0d26b1 100644 --- a/inputs/kubelet/README.md +++ b/inputs/kubelet/README.md @@ -12,6 +12,20 @@ Steps: 1. Add a new `[[instances]]` block in your `conf/input.prometheus/prometheus.toml` for Kubelet. 2. Ensure that Categraf (usually deployed as a DaemonSet on each Node) can access the Kubelet API on the current node. This often involves using the Node IP and a service account token. 3. Configure the correct authentication in your prometheus configuration according to your Kubernetes cluster's security setup (e.g., TLS settings, token file paths). +4. Set `url_label_key = "ident"` and `labels = { job = "kubelet" }`, because the dashboard filters metrics by `job` and groups nodes by `ident`. 
+ +Example: + +```toml +[[instances]] +urls = ["https://127.0.0.1:10250/metrics", "https://127.0.0.1:10250/metrics/cadvisor"] +url_label_key = "ident" +url_label_value = "{{.Host}}" +labels = { job = "kubelet" } +# bearer_token_file = "/run/secrets/kubernetes.io/serviceaccount/token" +# use_tls = true +# insecure_skip_verify = true +``` ## Metrics and Dashboards diff --git a/inputs/kubelet/README_CN.md b/inputs/kubelet/README_CN.md index 0b9dde830..8e4813fcd 100644 --- a/inputs/kubelet/README_CN.md +++ b/inputs/kubelet/README_CN.md @@ -12,6 +12,20 @@ 1. 在 `conf/input.prometheus/prometheus.toml` 中新增一个用于抓取 Kubelet 的 `[[instances]]` 配置块。 2. 确保 Categraf 作为 DaemonSet 部署在每个 Node 上时,可以访问到当前节点的 Kubelet API(通常通过挂载 Node 的 IP 和相应的认证 Token 获取)。 3. 根据您的 Kubernetes 集群的安全配置(如是否需要 TLS,Token 文件路径),在相应的配置块中配置正确的认证信息。 +4. 设置 `url_label_key = "ident"` 和 `labels = { job = "kubelet" }`,因为本目录下的 Dashboard 使用 `job` 过滤指标,并使用 `ident` 区分节点。 + +示例: + +```toml +[[instances]] +urls = ["https://127.0.0.1:10250/metrics", "https://127.0.0.1:10250/metrics/cadvisor"] +url_label_key = "ident" +url_label_value = "{{.Host}}" +labels = { job = "kubelet" } +# bearer_token_file = "/run/secrets/kubernetes.io/serviceaccount/token" +# use_tls = true +# insecure_skip_verify = true +``` ## 采集指标与监控大盘 diff --git a/inputs/linux_sysctl_fs/README.md b/inputs/linux_sysctl_fs/README.md index 68372ae5e..9d3b01e47 100644 --- a/inputs/linux_sysctl_fs/README.md +++ b/inputs/linux_sysctl_fs/README.md @@ -9,7 +9,6 @@ It is highly recommended for monitoring system-wide file descriptor limits (file ```toml # Collect Linux system file descriptor and inode status limits -[[instances]] # This plugin requires no special configuration. Just enable it. 
``` diff --git a/inputs/linux_sysctl_fs/README_CN.md b/inputs/linux_sysctl_fs/README_CN.md index e464ccef8..fd8fb94c6 100644 --- a/inputs/linux_sysctl_fs/README_CN.md +++ b/inputs/linux_sysctl_fs/README_CN.md @@ -9,7 +9,6 @@ ```toml # 采集 Linux 系统文件句柄与 Inode 等限制状态 -[[instances]] # 该插件无需任何特殊配置,启用即可。 ``` diff --git a/inputs/mem/README.md b/inputs/mem/README.md index ee68d3b7b..7111e25ca 100644 --- a/inputs/mem/README.md +++ b/inputs/mem/README.md @@ -8,7 +8,6 @@ This plugin collects host-level memory metrics, including total memory, availabl ```toml # Collect host physical memory metrics -[[instances]] # Usually requires no specific configuration. Just leave it enabled. ``` diff --git a/inputs/mem/README_CN.md b/inputs/mem/README_CN.md index 7b06e6170..5aa0600ba 100644 --- a/inputs/mem/README_CN.md +++ b/inputs/mem/README_CN.md @@ -8,7 +8,6 @@ ```toml # 采集主机物理内存指标 -[[instances]] # 通常无需任何特殊配置,保持默认启用即可。 ``` @@ -29,4 +28,4 @@ ## 监控大盘 该插件采集的指标是服务器最基础的监控数据之一。通常,OS 的内存监控大盘会与 CPU、磁盘等指标统一放置在全局的 **System (主机系统)** 大盘下面。 -为方便单独查看,本目录也提供了一个仅包含内存维度的基础 Dashboard。 \ No newline at end of file +为方便单独查看,本目录也提供了一个仅包含内存维度的基础 Dashboard。 diff --git a/inputs/mysql/README.md b/inputs/mysql/README.md index a622df7cb..0a439508f 100644 --- a/inputs/mysql/README.md +++ b/inputs/mysql/README.md @@ -187,7 +187,7 @@ labels = { instance = "local-mysql.sock" } | `metric_fields` | []string | 空 | 作为数值指标导出的列名列表 | | `label_fields` | []string | 空 | 作为标签导出的列名列表 | | `field_to_append` | string | 空 | 将某一列的值追加到指标名中,适合动态分组 | -| `timeout` | duration | 继承 `timeout_seconds`,再退化到 `3s` | 单条自定义 SQL 的超时 | +| `timeout` | duration | `0` | 单条自定义 SQL 的超时;当前实现不会继承 `timeout_seconds`,建议显式配置,例如 `3s` | | `request` | string | 空 | 实际执行的 SQL | 自定义 SQL 的使用规则: @@ -657,7 +657,7 @@ parameters = "tls=custom" - `metric_fields` / `label_fields` / `field_to_append` 没与 SQL 结果里的小写列名或小写别名保持一致 - `metric_fields` 对应列不是数值 -- 自定义 SQL 超时;当前默认会继承实例的 `timeout_seconds`,默认值是 3 秒 +- 自定义 SQL 超时;当前实现不会继承实例的 `timeout_seconds`,请在每条自定义 SQL 中显式配置 `timeout` 
## 其他说明 diff --git a/inputs/net/README.md b/inputs/net/README.md index 588f69703..285bf3f7a 100644 --- a/inputs/net/README.md +++ b/inputs/net/README.md @@ -10,7 +10,6 @@ In most cases, you can leave the default configuration as is; the plugin will au ```toml # Collect network interface metrics -[[instances]] # interfaces = ["eth0", "enp*"] # ignore_interfaces = ["lo", "docker*", "veth*"] ``` diff --git a/inputs/net/README_CN.md b/inputs/net/README_CN.md index 59d2d3d26..319facc7f 100644 --- a/inputs/net/README_CN.md +++ b/inputs/net/README_CN.md @@ -10,7 +10,6 @@ ```toml # 采集网络接口指标 -[[instances]] # interfaces = ["eth0", "enp*"] # ignore_interfaces = ["lo", "docker*", "veth*"] ``` @@ -29,4 +28,4 @@ ## 监控大盘 这些指标是主机最核心的基础监控数据之一。通常,OS 的网络流量监控会与其他硬件指标统一放置在 **System (主机系统)** 大盘中。 -为方便查看单独维度的网络状态,本目录下也提供了一个仅包含网卡维度的基础 Dashboard。 \ No newline at end of file +为方便查看单独维度的网络状态,本目录下也提供了一个仅包含网卡维度的基础 Dashboard。 diff --git a/inputs/netstat/README.md b/inputs/netstat/README.md index 33693a0bb..0c48e43b7 100644 --- a/inputs/netstat/README.md +++ b/inputs/netstat/README.md @@ -8,7 +8,6 @@ This plugin monitors network connection states. It primarily collects statistics ```toml # Collect network TCP connection state statistics -[[instances]] # Usually requires no specific configuration. Just leave it enabled. 
``` diff --git a/inputs/netstat/README_CN.md b/inputs/netstat/README_CN.md index edcd2e2f5..0bab1f03b 100644 --- a/inputs/netstat/README_CN.md +++ b/inputs/netstat/README_CN.md @@ -8,7 +8,6 @@ ```toml # 采集网络 TCP 连接状态统计 -[[instances]] # 通常无需任何特殊配置,保持默认启用即可。 ``` @@ -33,4 +32,4 @@ ## 监控大盘 这些指标是主机最核心的基础监控数据之一。通常,OS 的网络连接监控大盘会与 CPU、磁盘等指标统一放置在 **System (主机系统)** 大盘下面。 -为方便单独查看,本目录也提供了一个仅包含 TCP/UDP 连接状态维度的基础 Dashboard。 \ No newline at end of file +为方便单独查看,本目录也提供了一个仅包含 TCP/UDP 连接状态维度的基础 Dashboard。 diff --git a/inputs/nfsclient/README.md b/inputs/nfsclient/README.md index f6dc1e412..7f3d23d41 100644 --- a/inputs/nfsclient/README.md +++ b/inputs/nfsclient/README.md @@ -11,7 +11,6 @@ It gathers metrics such as read/write bytes, request counts, and latency for var # Collect NFS client metrics # interval = 60 -[[instances]] # Whether to collect full statistics for all NFS operations (defaults to collecting only key operations) fullstat = false @@ -29,20 +28,21 @@ fullstat = false The plugin supports NFSv3 and NFSv4. All metrics are tagged with `mountpoint`, `server` (NFS server address), and `export` (exported path). Key metric categories include: -- **Bytes Statistics (`nfsclient_bytes_*`)**: `read`, `write`, `direct_read`, `direct_write` -- **Event Statistics (`nfsclient_events_*)**: `inoderevalidates`, `dentryrevalidates`, `datainvalidates`, etc. -- **Operation Statistics (`nfsclient_ops_*`)**: +- **Default READ/WRITE statistics (`nfsclient_nfsstat_*`)**: `ops`, `retrans`, `bytes`, `rtt`, `exe`, and `rtt_per_op`, distinguished by the `nfsstat_operation` label. +- **Full bytes statistics (`nfsclient_nfs_bytes_*`)**: `normalreadbytes`, `normalwritebytes`, `directreadbytes`, `directwritebytes`, etc. These require `fullstat = true`. +- **Full event statistics (`nfsclient_nfs_events_*`)**: `inoderevalidates`, `dentryrevalidates`, `datainvalidates`, etc. These require `fullstat = true`. 
+- **Full operation statistics (`nfsclient_nfs_ops_*`)**: - `ops`: Total number of requests for the operation - `trans`: Number of RPC requests transmitted - `timeouts`: Number of timeouts - `bytes_sent` / `bytes_recv`: Bytes sent and received for the operation - - `queue_time_ms`: Time spent waiting in the queue (in milliseconds) - - `response_time_ms`: Time spent waiting for the server to respond (in milliseconds) - - `total_time_ms`: Total execution time (in milliseconds) + - `queue_time`: Time spent waiting in the queue + - `response_time`: Time spent waiting for the server to respond + - `total_time`: Total execution time - `errors`: Number of operational errors -*Note: Each NFS operation (such as READ, WRITE, GETATTR) generates a corresponding set of `nfsclient_ops_*` metrics, distinguished by the `operation` label.* +*Note: Each NFS operation (such as READ, WRITE, GETATTR) generates a corresponding set of `nfsclient_nfs_ops_*` metrics when `fullstat = true`, distinguished by the `operation` label.* ## Dashboards -A companion Dashboard (`dashboard.json`) is provided in this directory. It can be used to monitor the read/write throughput, latency (Response Time / Queue Time), and timeout errors for each mount point. +A companion Dashboard (`dashboard.json`) is provided in this directory. It uses the default `nfsclient_nfsstat_*` metrics to monitor read/write throughput, latency, operations, and retransmits for each mount point. 
diff --git a/inputs/nfsclient/README_CN.md b/inputs/nfsclient/README_CN.md index 8d7cf66c3..eed7d09e5 100644 --- a/inputs/nfsclient/README_CN.md +++ b/inputs/nfsclient/README_CN.md @@ -11,7 +11,6 @@ # 采集 NFS 客户端指标 # interval = 60 -[[instances]] # 是否采集全量的 NFS 操作指标(默认只采集常用的关键操作) fullstat = false @@ -29,20 +28,21 @@ fullstat = false 该插件支持 NFSv3 和 NFSv4,所有输出指标都会附带 `mountpoint`、`server` (NFS 服务端地址) 和 `export` (挂载的路径) 标签。 主要指标分类如下: -- **字节统计 (`nfsclient_bytes_*)**: `read`, `write`, `direct_read`, `direct_write` -- **事件统计 (`nfsclient_events_*)**: `inoderevalidates`, `dentryrevalidates`, `datainvalidates` 等 -- **操作统计 (`nfsclient_ops_*`)**: +- **默认 READ/WRITE 统计 (`nfsclient_nfsstat_*`)**: `ops`, `retrans`, `bytes`, `rtt`, `exe`, `rtt_per_op`,通过 `nfsstat_operation` 标签区分 READ 和 WRITE。 +- **全量字节统计 (`nfsclient_nfs_bytes_*`)**: `normalreadbytes`, `normalwritebytes`, `directreadbytes`, `directwritebytes` 等,需要开启 `fullstat = true`。 +- **全量事件统计 (`nfsclient_nfs_events_*`)**: `inoderevalidates`, `dentryrevalidates`, `datainvalidates` 等,需要开启 `fullstat = true`。 +- **全量操作统计 (`nfsclient_nfs_ops_*`)**: - `ops`: 操作的总请求次数 - `trans`: 发送的 RPC 请求次数 - `timeouts`: 超时次数 - `bytes_sent` / `bytes_recv`: 该操作发送和接收的字节数 - - `queue_time_ms`: 在队列中等待的时间 (单位:毫秒) - - `response_time_ms`: 服务端响应时间 (单位:毫秒) - - `total_time_ms`: 总耗时 (单位:毫秒) + - `queue_time`: 在队列中等待的时间 + - `response_time`: 服务端响应时间 + - `total_time`: 总耗时 - `errors`: 操作错误数 -*注意:每种 NFS 操作(如 READ, WRITE, GETATTR)都会生成对应的一组 `nfsclient_ops_*` 指标,并通过 `operation` 标签进行区分。* +*注意:开启 `fullstat = true` 后,每种 NFS 操作(如 READ, WRITE, GETATTR)都会生成对应的一组 `nfsclient_nfs_ops_*` 指标,并通过 `operation` 标签进行区分。* ## 监控大盘 -本目录下提供了一个配套的 Dashboard (`dashboard.json`),可用于监控各挂载点的读写吞吐量、读写延迟(Response Time / Queue Time)以及超时错误等情况。 +本目录下提供了一个配套的 Dashboard (`dashboard.json`),默认使用 `nfsclient_nfsstat_*` 指标监控各挂载点的读写吞吐量、延迟、操作数以及重传情况。 diff --git a/inputs/nfsclient/dashboard.json b/inputs/nfsclient/dashboard.json index d956b39e0..6a0dc6a5d 100644 --- a/inputs/nfsclient/dashboard.json +++ 
b/inputs/nfsclient/dashboard.json @@ -19,12 +19,12 @@ "id": 1, "targets": [ { - "expr": "rate(nfsclient_bytes_read[5m])", + "expr": "rate(nfsclient_nfsstat_bytes{nfsstat_operation=\"READ\"}[5m])", "legendFormat": "Read: {{mountpoint}}", "refId": "A" }, { - "expr": "rate(nfsclient_bytes_write[5m])", + "expr": "rate(nfsclient_nfsstat_bytes{nfsstat_operation=\"WRITE\"}[5m])", "legendFormat": "Write: {{mountpoint}}", "refId": "B" } @@ -42,8 +42,8 @@ "id": 2, "targets": [ { - "expr": "rate(nfsclient_ops_response_time_ms[5m]) / rate(nfsclient_ops_ops[5m])", - "legendFormat": "{{mountpoint}} ({{operation}})", + "expr": "rate(nfsclient_nfsstat_rtt[5m]) / rate(nfsclient_nfsstat_ops[5m])", + "legendFormat": "{{mountpoint}} ({{nfsstat_operation}})", "refId": "A" } ] @@ -60,14 +60,14 @@ "id": 3, "targets": [ { - "expr": "rate(nfsclient_ops_ops[5m])", - "legendFormat": "{{mountpoint}} ({{operation}})", + "expr": "rate(nfsclient_nfsstat_ops[5m])", + "legendFormat": "{{mountpoint}} ({{nfsstat_operation}})", "refId": "A" } ] }, { - "title": "NFS Client Timeouts/s", + "title": "NFS Client Retransmits/s", "type": "timeseries", "gridPos": { "x": 12, @@ -78,8 +78,8 @@ "id": 4, "targets": [ { - "expr": "rate(nfsclient_ops_timeouts[5m])", - "legendFormat": "{{mountpoint}} ({{operation}})", + "expr": "rate(nfsclient_nfsstat_retrans[5m])", + "legendFormat": "{{mountpoint}} ({{nfsstat_operation}})", "refId": "A" } ] @@ -90,4 +90,4 @@ "from": "now-1h", "to": "now" } -} \ No newline at end of file +} diff --git a/inputs/node_exporter/README.md b/inputs/node_exporter/README.md index 111703644..613ed5639 100644 --- a/inputs/node_exporter/README.md +++ b/inputs/node_exporter/README.md @@ -11,7 +11,6 @@ Compared to Categraf's native plugins (like `cpu`, `mem`, `disk`), this plugin p # Collect Node Exporter compatible metrics # interval = 15 -[[instances]] # Typically, you just need to enable this plugin. 
# If you need to toggle specific collectors, you can pass arguments to categraf's startup command line. # Example: --collector.textfile.directory=/var/lib/node_exporter/textfile_collector diff --git a/inputs/node_exporter/README_CN.md b/inputs/node_exporter/README_CN.md index 457b4df9e..2645439c2 100644 --- a/inputs/node_exporter/README_CN.md +++ b/inputs/node_exporter/README_CN.md @@ -11,7 +11,6 @@ # 采集 Node Exporter 兼容指标 # interval = 15 -[[instances]] # 通常只需启用该插件即可。 # 如果有特别的 collector 开启/关闭需求,您可以在 categraf 的命令行启动参数中传入 # 例如:--collector.textfile.directory=/var/lib/node_exporter/textfile_collector diff --git a/inputs/nvidia_smi/README.md b/inputs/nvidia_smi/README.md index 400d7c24f..44f43ed5f 100644 --- a/inputs/nvidia_smi/README.md +++ b/inputs/nvidia_smi/README.md @@ -12,7 +12,6 @@ The configuration file is located at `conf/input.nvidia_smi/nvidia_smi.toml` # Collect NVIDIA GPU status # interval = 15 -[[instances]] # The following option is critical. To collect nvidia-smi information, uncomment it and provide the absolute path to the nvidia-smi command. # This instructs Categraf to execute the local nvidia-smi command to get the GPU status. 
# nvidia_smi_command = "/usr/bin/nvidia-smi" diff --git a/inputs/nvidia_smi/README_CN.md b/inputs/nvidia_smi/README_CN.md index 2600d28f1..0899a5191 100644 --- a/inputs/nvidia_smi/README_CN.md +++ b/inputs/nvidia_smi/README_CN.md @@ -12,7 +12,6 @@ # 采集 NVIDIA GPU 状态 # interval = 15 -[[instances]] # 下面的配置是最核心的配置。如果要采集 nvidia-smi 的信息,请取消注释并给出 nvidia-smi 命令的绝对路径。 # 相当于让 Categraf 执行本机的 nvidia-smi 命令,获取本机 GPU 的状态信息 # nvidia_smi_command = "/usr/bin/nvidia-smi" @@ -40,4 +39,4 @@ query_field_names = "AUTO" ## 监控大盘 -本目录下提供了一个配套的基础 Dashboard (`dashboard.json`),帮助您快速建立 GPU 的利用率、显存使用情况、温度与功耗的监控可视化体系。 \ No newline at end of file +本目录下提供了一个配套的基础 Dashboard (`dashboard.json`),帮助您快速建立 GPU 的利用率、显存使用情况、温度与功耗的监控可视化体系。 diff --git a/inputs/processes/README.md b/inputs/processes/README.md index 4afcbcdd8..3549c2ce2 100644 --- a/inputs/processes/README.md +++ b/inputs/processes/README.md @@ -12,7 +12,6 @@ In most cases, no specific configuration is required; just leave it enabled. ```toml # Collect OS process state distributions -[[instances]] # No specific configuration required ``` diff --git a/inputs/processes/README_CN.md b/inputs/processes/README_CN.md index 9ccd31c7b..99cfae668 100644 --- a/inputs/processes/README_CN.md +++ b/inputs/processes/README_CN.md @@ -12,7 +12,6 @@ ```toml # 采集系统进程状态分布 -[[instances]] # 无特别配置项 ``` @@ -33,4 +32,4 @@ ## 监控大盘 这些指标是主机基础监控的一部分。通常,OS 的进程监控会与其他硬件指标统一放置在 **System (主机系统)** 大盘中。 -本目录下也为您提供了一个仅包含进程状态分布的基础 Dashboard。 \ No newline at end of file +本目录下也为您提供了一个仅包含进程状态分布的基础 Dashboard。 diff --git a/inputs/redis_sentinel/README.md b/inputs/redis_sentinel/README.md index 64ae2c87d..a20477b97 100644 --- a/inputs/redis_sentinel/README.md +++ b/inputs/redis_sentinel/README.md @@ -12,7 +12,7 @@ You can configure single or multiple Sentinel nodes within an `instance`. 
If you # interval = 15 [[instances]] -# List of Sentinel node addresses, formatted as "tcp://host:port" or "host:port" +# List of Sentinel node addresses, formatted as "tcp://host:port" or "unix:///path/to/socket" servers = ["tcp://localhost:26379"] # (Optional) Sentinel password @@ -27,14 +27,14 @@ servers = ["tcp://localhost:26379"] All metrics are prefixed with `redis_sentinel_`. Depending on the data collected, they are mainly divided into two categories: ### Basic Sentinel Metrics (`redis_sentinel_*`) -E.g., `redis_sentinel_uptime_in_seconds`, `redis_sentinel_connected_clients`, `redis_sentinel_mem_used`, etc., which reflect the Sentinel process's liveness and basic resource usage. +E.g., `redis_sentinel_uptime_ns`, `redis_sentinel_clients`, `redis_sentinel_sentinel_masters`, etc., which reflect the Sentinel process's liveness and basic resource usage. The source Sentinel is identified by `source` and `port` labels for TCP endpoints, or the `socket` label for Unix sockets. -### Master / Slave Status Metrics +### Master / Replica Status Metrics These metrics carry labels such as `master` (the master's name) to reflect the cluster topology as seen by Sentinel: -- `redis_sentinel_master_slaves`: Number of Slaves attached to the current Master -- `redis_sentinel_master_sentinels`: Number of Sentinel nodes monitoring this Master -- `redis_sentinel_master_status`: Master status (typically "ok" maps to 1, others map to 0 or specific error codes) -- `redis_sentinel_master_failover_state`: Current state value of the failover process +- `redis_sentinel_masters_num_slaves`: Number of replicas attached to the current master +- `redis_sentinel_masters_num_other_sentinels`: Number of other Sentinel nodes monitoring this master +- `redis_sentinel_masters_has_quorum`: Whether Sentinel reports quorum for the master +- `redis_sentinel_replicas_slave_repl_offset`: Replica replication offset ## Dashboards diff --git a/inputs/redis_sentinel/README_CN.md 
b/inputs/redis_sentinel/README_CN.md index f10572a41..1b7df1124 100644 --- a/inputs/redis_sentinel/README_CN.md +++ b/inputs/redis_sentinel/README_CN.md @@ -12,7 +12,7 @@ # interval = 15 [[instances]] -# Sentinel 节点地址列表,格式为 "tcp://host:port" 或 "host:port" +# Sentinel 节点地址列表,格式为 "tcp://host:port" 或 "unix:///path/to/socket" servers = ["tcp://localhost:26379"] # (可选) Sentinel 密码 @@ -27,14 +27,14 @@ servers = ["tcp://localhost:26379"] 所有的指标均以 `redis_sentinel_` 作为前缀。根据采集内容不同,主要包含两类数据: ### Sentinel 自身基础指标 (`redis_sentinel_*`) -例如 `redis_sentinel_uptime_in_seconds`, `redis_sentinel_connected_clients`, `redis_sentinel_mem_used` 等,用于反映 Sentinel 进程的存活与基础资源开销。 +例如 `redis_sentinel_uptime_ns`, `redis_sentinel_clients`, `redis_sentinel_sentinel_masters` 等,用于反映 Sentinel 进程的存活与基础资源开销。TCP 地址会带有 `source` 和 `port` 标签,Unix Socket 地址会带有 `socket` 标签。 -### Master / Slave 状态指标 +### Master / Replica 状态指标 这些指标携带 `master` (名字) 等标签,用于反映 Sentinel 眼中的集群拓扑: -- `redis_sentinel_master_slaves`: 当前 Master 下挂载的 Slave 数量 -- `redis_sentinel_master_sentinels`: 监控该 Master 的 Sentinel 节点数 -- `redis_sentinel_master_status`: Master 状态 (通常 "ok" 映射为 1,其他映射为 0 或具体错误码) -- `redis_sentinel_master_failover_state`: 故障转移(Failover)的当前状态值 +- `redis_sentinel_masters_num_slaves`: 当前 Master 下挂载的副本数量 +- `redis_sentinel_masters_num_other_sentinels`: 监控该 Master 的其他 Sentinel 节点数 +- `redis_sentinel_masters_has_quorum`: Sentinel 是否认为该 Master 满足 quorum +- `redis_sentinel_replicas_slave_repl_offset`: 副本复制偏移量 ## 监控大盘 diff --git a/inputs/redis_sentinel/dashboard.json b/inputs/redis_sentinel/dashboard.json index bb03b6395..0cb401dd0 100644 --- a/inputs/redis_sentinel/dashboard.json +++ b/inputs/redis_sentinel/dashboard.json @@ -19,8 +19,8 @@ "id": 1, "targets": [ { - "expr": "redis_sentinel_uptime_in_seconds / 86400", - "legendFormat": "{{server}}", + "expr": "redis_sentinel_uptime_ns / 86400 / 1000000000", + "legendFormat": "{{source}}:{{port}}", "refId": "A" } ] @@ -37,8 +37,8 @@ "id": 2, "targets": [ { - "expr": 
"redis_sentinel_connected_clients", - "legendFormat": "{{server}}", + "expr": "redis_sentinel_clients", + "legendFormat": "{{source}}:{{port}}", "refId": "A" } ] @@ -55,7 +55,7 @@ "id": 3, "targets": [ { - "expr": "redis_sentinel_master_sentinels", + "expr": "redis_sentinel_masters_num_other_sentinels", "legendFormat": "Master: {{master}}", "refId": "A" } @@ -73,7 +73,7 @@ "id": 4, "targets": [ { - "expr": "redis_sentinel_master_slaves", + "expr": "redis_sentinel_masters_num_slaves", "legendFormat": "Master: {{master}}", "refId": "A" } @@ -85,4 +85,4 @@ "from": "now-1h", "to": "now" } -} \ No newline at end of file +} diff --git a/inputs/redis_sentinel/redis_sentinel.go b/inputs/redis_sentinel/redis_sentinel.go index d2988d6fa..8c2c09fbd 100644 --- a/inputs/redis_sentinel/redis_sentinel.go +++ b/inputs/redis_sentinel/redis_sentinel.go @@ -467,4 +467,4 @@ func prepareFieldValues(fields map[string]string, typeMap map[string]configField } return preparedFields, nil -} \ No newline at end of file +} diff --git a/inputs/self_metrics/README.md b/inputs/self_metrics/README.md index 3877bb12e..81f1ddd5d 100644 --- a/inputs/self_metrics/README.md +++ b/inputs/self_metrics/README.md @@ -12,7 +12,6 @@ Since it is a built-in plugin gathering its own state, the configuration is extr # Collect Categraf's own metrics # interval = 15 -# [[instances]] # No specific configuration required ``` diff --git a/inputs/self_metrics/README_CN.md b/inputs/self_metrics/README_CN.md index e257377b7..eee732604 100644 --- a/inputs/self_metrics/README_CN.md +++ b/inputs/self_metrics/README_CN.md @@ -12,7 +12,6 @@ # 采集 Categraf 自身指标 # interval = 15 -# [[instances]] # 无特殊配置项 ``` diff --git a/inputs/sockstat/README.md b/inputs/sockstat/README.md index 72e6c27a0..056cb69fe 100644 --- a/inputs/sockstat/README.md +++ b/inputs/sockstat/README.md @@ -14,7 +14,6 @@ Generally, no special configuration is needed; just enable the plugin. 
# Collect Linux sockstat metrics # interval = 15 -[[instances]] # No specific configuration parameters required ``` @@ -39,4 +38,4 @@ These fields provide a snapshot of the socket usage on the system. This is extre ## Dashboards These metrics are part of basic host monitoring and are typically integrated into global **System** or **Network** dashboards. -A dedicated basic Dashboard focusing exclusively on the sockstat socket state distribution and memory usage is also provided in this directory. \ No newline at end of file +A dedicated basic Dashboard focusing exclusively on the sockstat socket state distribution and memory usage is also provided in this directory. diff --git a/inputs/sockstat/README_CN.md b/inputs/sockstat/README_CN.md index 95bf817e1..96645b253 100644 --- a/inputs/sockstat/README_CN.md +++ b/inputs/sockstat/README_CN.md @@ -14,7 +14,6 @@ # 采集 Linux sockstat 状态 # interval = 15 -[[instances]] # 无需任何特定配置参数 ``` @@ -39,4 +38,4 @@ ## 监控大盘 这些指标是主机基础监控的一部分,通常会被整合在 **System (主机系统)** 或 **Network (网络)** 全局大盘中。 -本目录下也为您提供了一个仅针对 sockstat Socket 状态分布与内存占用的专属基础 Dashboard。 \ No newline at end of file +本目录下也为您提供了一个仅针对 sockstat Socket 状态分布与内存占用的专属基础 Dashboard。 diff --git a/inputs/systemd/README.md b/inputs/systemd/README.md index a6ea86733..ba4e729f9 100644 --- a/inputs/systemd/README.md +++ b/inputs/systemd/README.md @@ -13,7 +13,6 @@ You can enable and configure the systemd plugin in your Categraf configuration f # Collect systemd unit metrics # interval = 15 -[[instances]] # Regex: Used to match the unit names to be collected. Default is all (".+"). # unit_include = ".+" diff --git a/inputs/weblogic/README.md b/inputs/weblogic/README.md index 8add866e7..1a82d2b6d 100644 --- a/inputs/weblogic/README.md +++ b/inputs/weblogic/README.md @@ -12,4 +12,4 @@ For specific configurations and pre-defined WebLogic JMX metrics collection item ## Dashboards Since the data is collected via `jolokia_agent`, all metrics and tagging systems will follow the Jolokia standards. 
-A placeholder `dashboard.json` is provided in this directory. For actual JVM monitoring dashboards, it is recommended to use the generic Dashboards associated with `jolokia` or `jvm`. +A basic `dashboard.json` is provided in this directory for the bundled WebLogic Jolokia template. For broader JVM monitoring, you can also use the generic dashboards associated with `jolokia` or `jvm`. diff --git a/inputs/weblogic/README_CN.md b/inputs/weblogic/README_CN.md index 859ab31cb..28bd0d495 100644 --- a/inputs/weblogic/README_CN.md +++ b/inputs/weblogic/README_CN.md @@ -12,4 +12,4 @@ WebLogic 当前可以使用 `jolokia_agent` 插件来监控,通过 HTTP 请求 ## 监控大盘 既然数据是通过 `jolokia_agent` 采集的,所有的指标和标签体系将遵循 Jolokia 规范。 -本目录下提供了一个占位用的 `dashboard.json`。如果你想查看真正的 JVM 监控大盘,建议直接使用 `jolokia` 或 `jvm` 相关的通用 Dashboard。 +本目录下提供了一个适配当前 WebLogic Jolokia 示例配置的基础 `dashboard.json`。如果你还需要更完整的 JVM 监控,也可以配合使用 `jolokia` 或 `jvm` 相关的通用 Dashboard。