-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdocker-compose.yml
More file actions
83 lines (78 loc) · 3.71 KB
/
docker-compose.yml
File metadata and controls
83 lines (78 loc) · 3.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# Compose stack for the GPU monitor: a single service built from the local
# directory, plus a named volume mounted over the container's /var/log.
#
# Each setting below is passed through from the host environment using
# ${VAR:-default}. A variable that is unset on the host and has no default
# reaches the container as an empty string.
services:
  gpu-monitor:
    build: .
    restart: unless-stopped
    # Requires the NVIDIA Container Toolkit on the host.
    runtime: nvidia
    environment:
      # Notification channels -- configure one or more.
      SLACK_WEBHOOK_URL: "${SLACK_WEBHOOK_URL:-}"
      DISCORD_WEBHOOK_URL: "${DISCORD_WEBHOOK_URL:-}"
      TELEGRAM_BOT_TOKEN: "${TELEGRAM_BOT_TOKEN:-}"
      TELEGRAM_CHAT_ID: "${TELEGRAM_CHAT_ID:-}"
      EMAIL_SMTP_HOST: "${EMAIL_SMTP_HOST:-}"
      EMAIL_SMTP_PORT: "${EMAIL_SMTP_PORT:-587}"
      EMAIL_USER: "${EMAIL_USER:-}"
      EMAIL_PASS: "${EMAIL_PASS:-}"
      EMAIL_TO: "${EMAIL_TO:-}"
      TWILIO_ACCOUNT_SID: "${TWILIO_ACCOUNT_SID:-}"
      TWILIO_AUTH_TOKEN: "${TWILIO_AUTH_TOKEN:-}"
      TWILIO_FROM: "${TWILIO_FROM:-}"
      TWILIO_TO: "${TWILIO_TO:-}"
      WECOM_WEBHOOK_URL: "${WECOM_WEBHOOK_URL:-}"
      FEISHU_WEBHOOK_URL: "${FEISHU_WEBHOOK_URL:-}"
      DINGTALK_WEBHOOK_URL: "${DINGTALK_WEBHOOK_URL:-}"
      BARK_URL: "${BARK_URL:-}"
      NTFY_URL: "${NTFY_URL:-}"
      NTFY_TOKEN: "${NTFY_TOKEN:-}"
      GOTIFY_URL: "${GOTIFY_URL:-}"
      GOTIFY_TOKEN: "${GOTIFY_TOKEN:-}"
      PUSHOVER_TOKEN: "${PUSHOVER_TOKEN:-}"
      PUSHOVER_USER: "${PUSHOVER_USER:-}"
      ROCKETCHAT_WEBHOOK_URL: "${ROCKETCHAT_WEBHOOK_URL:-}"
      MATTERMOST_WEBHOOK_URL: "${MATTERMOST_WEBHOOK_URL:-}"
      TEAMS_WEBHOOK_URL: "${TEAMS_WEBHOOK_URL:-}"
      GOOGLE_CHAT_WEBHOOK_URL: "${GOOGLE_CHAT_WEBHOOK_URL:-}"
      ZULIP_SITE: "${ZULIP_SITE:-}"
      ZULIP_EMAIL: "${ZULIP_EMAIL:-}"
      ZULIP_API_KEY: "${ZULIP_API_KEY:-}"
      OPENCLAW_WEBHOOK_URL: "${OPENCLAW_WEBHOOK_URL:-}"
      OPENCLAW_WEBHOOK_SECRET: "${OPENCLAW_WEBHOOK_SECRET:-}"
      PAGERDUTY_INTEGRATION_KEY: "${PAGERDUTY_INTEGRATION_KEY:-}"
      APPRISE_URLS: "${APPRISE_URLS:-}"

      # Alerting thresholds.
      # NOTE(review): units are not stated in this file -- confirm them
      # against the application before changing a default.
      CHECK_INTERVAL: "${CHECK_INTERVAL:-60}"
      IDLE_THRESHOLD: "${IDLE_THRESHOLD:-10}"
      IDLE_MINUTES: "${IDLE_MINUTES:-5}"
      ALERT_COOLDOWN: "${ALERT_COOLDOWN:-30}"
      GPU_TEMP_WARN: "${GPU_TEMP_WARN:-85}"
      GPU_TEMP_CRIT: "${GPU_TEMP_CRIT:-92}"
      MEMLEAK_THRESHOLD: "${MEMLEAK_THRESHOLD:-30}"
      MEMLEAK_MINUTES: "${MEMLEAK_MINUTES:-10}"
      ALERT_WEBHOOK_URL: "${ALERT_WEBHOOK_URL:-}"

      # Metrics backends.
      # WEB_PORT is also used by the port mapping and the healthcheck below.
      WEB_PORT: "${WEB_PORT:-8080}"
      INFLUXDB_URL: "${INFLUXDB_URL:-}"
      INFLUXDB_TOKEN: "${INFLUXDB_TOKEN:-}"
      INFLUXDB_BUCKET: "${INFLUXDB_BUCKET:-gpu_metrics}"
      INFLUXDB_ORG: "${INFLUXDB_ORG:-}"
      DATADOG_STATSD_HOST: "${DATADOG_STATSD_HOST:-}"
      DATADOG_STATSD_PORT: "${DATADOG_STATSD_PORT:-8125}"
      OTEL_EXPORTER_OTLP_ENDPOINT: "${OTEL_EXPORTER_OTLP_ENDPOINT:-}"
      OTEL_SERVICE_NAME: "${OTEL_SERVICE_NAME:-gpu-monitor}"
      OTEL_EXPORTER_OTLP_HEADERS: "${OTEL_EXPORTER_OTLP_HEADERS:-}"

      # GitHub Pages dashboard.
      GITHUB_PAGES_TOKEN: "${GITHUB_PAGES_TOKEN:-}"
      GITHUB_PAGES_REPO: "${GITHUB_PAGES_REPO:-}"

      # Logging: this path sits inside the named volume mounted below.
      LOG_FILE: /var/log/gpu-monitor.log
    volumes:
      - gpu-monitor-logs:/var/log
    ports:
      # Host and container sides use the same port number.
      - "${WEB_PORT:-8080}:${WEB_PORT:-8080}"
    healthcheck:
      # Requests /metrics on localhost from inside the container; an exception
      # from urlopen makes python3 exit non-zero, which fails the check.
      # Compose substitutes the port when it reads this file.
      test:
        - CMD
        - python3
        - -c
        - >-
          import urllib.request;
          urllib.request.urlopen('http://localhost:${WEB_PORT:-8080}/metrics', timeout=5)
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 30s

volumes:
  # Named volume backing /var/log in the gpu-monitor service.
  gpu-monitor-logs: {}