diff --git a/changes/11845.feature.md b/changes/11845.feature.md new file mode 100644 index 00000000000..7e66398c388 --- /dev/null +++ b/changes/11845.feature.md @@ -0,0 +1 @@ +Expand and rename the default Prometheus query presets with explicit aggregation variants (sum/avg/max/min) for container and vLLM metrics diff --git a/fixtures/manager/example-prometheus-query-presets.json b/fixtures/manager/example-prometheus-query-presets.json index e8c2369e7e2..a1360631856 100644 --- a/fixtures/manager/example-prometheus-query-presets.json +++ b/fixtures/manager/example-prometheus-query-presets.json @@ -2,8 +2,8 @@ "prometheus_query_presets": [ { "id": "ff5df9a1-e92d-4636-af73-eb491d5aeaca", - "name": "container_gauge", - "description": "Current container utilization as a raw gauge value", + "name": "Per-Kernel Resource Metric — Instant Value (sum)", + "description": "Instant value of the per-kernel resource utilization gauge, summed across the selected grouping", "rank": 100, "category_name": "container", "metric_name": "backendai_container_utilization", @@ -30,14 +30,72 @@ } }, { - "id": "a1863e52-a678-45f7-979c-127af5658417", - "name": "container_rate", - "description": "Container utilization rate of change normalized over 5 minutes", - "rank": 200, + "id": "106513b9-0a8b-4b07-9820-4ce541441509", + "name": "Per-Kernel Resource Metric — Instant Value (avg)", + "description": "Instant value of the per-kernel resource utilization gauge, averaged across the selected grouping", + "rank": 110, "category_name": "container", "metric_name": "backendai_container_utilization", - "query_template": "sum by ({group_by})(rate(backendai_container_utilization{{{labels}}}[{window}])) / 5.0", - "time_window": "5m", + "query_template": "avg by ({group_by})(backendai_container_utilization{{{labels}}})", + "time_window": null, + "options": { + "filter_labels": [ + "container_metric_name", + "kernel_id", + "session_id", + "agent_id", + "user_id", + "project_id", + "value_type" + ], + "group_labels": [ + "kernel_id", + "session_id", + "agent_id", + "user_id", + "project_id", + "value_type" + ] + } + }, + { + "id": "0d8023df-807b-4fe1-953a-6bfb3870a9e4", + "name": "Per-Kernel Resource Metric — Instant Value (max)", + "description": "Instant value of the per-kernel resource utilization gauge, maximum across the selected grouping", + "rank": 120, + "category_name": "container", + "metric_name": "backendai_container_utilization", + "query_template": "max by ({group_by})(backendai_container_utilization{{{labels}}})", + "time_window": null, + "options": { + "filter_labels": [ + "container_metric_name", + "kernel_id", + "session_id", + "agent_id", + "user_id", + "project_id", + "value_type" + ], + "group_labels": [ + "kernel_id", + "session_id", + "agent_id", + "user_id", + "project_id", + "value_type" + ] + } + }, + { + "id": "8d97af1c-67d5-4ef6-af30-3cfe4689dc99", + "name": "Per-Kernel Resource Metric — Instant Value (min)", + "description": "Instant value of the per-kernel resource utilization gauge, minimum across the selected grouping", + "rank": 130, + "category_name": "container", + "metric_name": "backendai_container_utilization", + "query_template": "min by ({group_by})(backendai_container_utilization{{{labels}}})", + "time_window": null, "options": { "filter_labels": [ "container_metric_name", @@ -60,9 +118,9 @@ }, { "id": "d5d83b5e-3463-4b5f-beda-1afbe4761e3c", - "name": "container_diff", - "description": "Container utilization rate of change over 5 minutes (unnormalized)", - "rank": 300, + "name": "Per-Kernel Resource Metric — 5-Minute Rate (sum)", + "description": "Per-second rate of the per-kernel resource utilization gauge over a 5-minute window, summed across the selected grouping", + "rank": 200, "category_name": "container", "metric_name": "backendai_container_utilization", "query_template": "sum by ({group_by})(rate(backendai_container_utilization{{{labels}}}[{window}]))", @@ -87,11 +145,98 @@ ] } }, + { + "id": "234c0e84-fe6f-46ec-87b9-d177cb1c9b85", + "name": "Per-Kernel Resource Metric — 5-Minute Rate (avg)", + "description": "Per-second rate of the per-kernel resource utilization gauge over a 5-minute window, averaged across the selected grouping", + "rank": 210, + "category_name": "container", + "metric_name": "backendai_container_utilization", + "query_template": "avg by ({group_by})(rate(backendai_container_utilization{{{labels}}}[{window}]))", + "time_window": "5m", + "options": { + "filter_labels": [ + "container_metric_name", + "kernel_id", + "session_id", + "agent_id", + "user_id", + "project_id", + "value_type" + ], + "group_labels": [ + "kernel_id", + "session_id", + "agent_id", + "user_id", + "project_id", + "value_type" + ] + } + }, + { + "id": "40b3b678-ab2a-4dcc-9045-120bd208a19f", + "name": "Per-Kernel Resource Metric — 5-Minute Rate (max)", + "description": "Per-second rate of the per-kernel resource utilization gauge over a 5-minute window, maximum across the selected grouping", + "rank": 220, + "category_name": "container", + "metric_name": "backendai_container_utilization", + "query_template": "max by ({group_by})(rate(backendai_container_utilization{{{labels}}}[{window}]))", + "time_window": "5m", + "options": { + "filter_labels": [ + "container_metric_name", + "kernel_id", + "session_id", + "agent_id", + "user_id", + "project_id", + "value_type" + ], + "group_labels": [ + "kernel_id", + "session_id", + "agent_id", + "user_id", + "project_id", + "value_type" + ] + } + }, + { + "id": "537ffea7-dc84-457e-b663-52e21be34085", + "name": "Per-Kernel Resource Metric — 5-Minute Rate (min)", + "description": "Per-second rate of the per-kernel resource utilization gauge over a 5-minute window, minimum across the selected grouping", + "rank": 230, + "category_name": "container", + "metric_name": "backendai_container_utilization", + "query_template": "min by ({group_by})(rate(backendai_container_utilization{{{labels}}}[{window}]))", + "time_window": "5m", + "options": { + "filter_labels": [ + "container_metric_name", + "kernel_id", + "session_id", + "agent_id", + "user_id", + "project_id", + "value_type" + ], + "group_labels": [ + "kernel_id", + "session_id", + "agent_id", + "user_id", + "project_id", + "value_type" + ] + } + }, { "id": "2f0634e3-a976-4eeb-ba01-1e1829965453", - "name": "vllm_requests_running", - "description": "Number of requests currently being processed by vLLM", - "rank": 400, + "name": "vLLM Inflight Requests (sum)", + "description": "Number of requests currently being processed by vLLM, summed across the selected grouping", + "rank": 300, "category_name": "vllm-inference", "metric_name": "vllm:num_requests_running", "query_template": "sum by ({group_by})(vllm:num_requests_running{{{labels}}})", @@ -105,11 +250,47 @@ ] } }, + { + "id": "52ca4304-4f61-414c-a69f-66a70b25a637", + "name": "vLLM Inflight Requests (avg)", + "description": "Number of requests currently being processed by vLLM, averaged across the selected grouping", + "rank": 310, + "category_name": "vllm-inference", + "metric_name": "vllm:num_requests_running", + "query_template": "avg by ({group_by})(vllm:num_requests_running{{{labels}}})", + "time_window": null, + "options": { + "filter_labels": [ + "deployment_id" + ], + "group_labels": [ + "deployment_id" + ] + } + }, + { + "id": "0520d769-0240-46db-a8ac-e557f7739d56", + "name": "vLLM Inflight Requests (max)", + "description": "Number of requests currently being processed by vLLM, maximum across the selected grouping", + "rank": 320, + "category_name": "vllm-inference", + "metric_name": "vllm:num_requests_running", + "query_template": "max by ({group_by})(vllm:num_requests_running{{{labels}}})", + "time_window": null, + "options": { + "filter_labels": [ + "deployment_id" + ], + "group_labels": [ + "deployment_id" + ] + } + }, { "id": "2a0939c6-b634-4f1e-ac8c-8738d7bbc244", - "name": "vllm_requests_waiting", - "description": "Number of requests waiting in the vLLM queue", - "rank": 500, + "name": "vLLM Queued Requests (sum)", + "description": "Number of requests waiting in the vLLM queue, summed across the selected grouping", + "rank": 400, "category_name": "vllm-inference", "metric_name": "vllm:num_requests_waiting", "query_template": "sum by ({group_by})(vllm:num_requests_waiting{{{labels}}})", @@ -123,11 +304,47 @@ ] } }, + { + "id": "598d0148-c3d5-4282-b31c-ea05e7fca98c", + "name": "vLLM Queued Requests (avg)", + "description": "Number of requests waiting in the vLLM queue, averaged across the selected grouping", + "rank": 410, + "category_name": "vllm-inference", + "metric_name": "vllm:num_requests_waiting", + "query_template": "avg by ({group_by})(vllm:num_requests_waiting{{{labels}}})", + "time_window": null, + "options": { + "filter_labels": [ + "deployment_id" + ], + "group_labels": [ + "deployment_id" + ] + } + }, + { + "id": "baf09246-9bbd-4368-91fd-76c6f97233d4", + "name": "vLLM Queued Requests (max)", + "description": "Number of requests waiting in the vLLM queue, maximum across the selected grouping", + "rank": 420, + "category_name": "vllm-inference", + "metric_name": "vllm:num_requests_waiting", + "query_template": "max by ({group_by})(vllm:num_requests_waiting{{{labels}}})", + "time_window": null, + "options": { + "filter_labels": [ + "deployment_id" + ], + "group_labels": [ + "deployment_id" + ] + } + }, { "id": "00c6467d-58da-4285-9166-fa36404f2012", - "name": "vllm_gpu_cache_usage", - "description": "Average GPU KV cache usage percentage across vLLM replicas", - "rank": 600, + "name": "vLLM GPU KV Cache Usage Ratio (avg)", + "description": "GPU KV cache usage ratio (0.0-1.0) reported by vLLM, averaged across the selected grouping", + "rank": 500, "category_name": "vllm-inference", "metric_name": "vllm:gpu_cache_usage_perc", "query_template": "avg by ({group_by})(vllm:gpu_cache_usage_perc{{{labels}}})", @@ -141,11 +358,47 @@ ] } }, + { + "id": "4bed8110-4718-4300-a39d-d64a25e6aa6e", + "name": "vLLM GPU KV Cache Usage Ratio (max)", + "description": "GPU KV cache usage ratio (0.0-1.0) reported by vLLM, maximum across the selected grouping", + "rank": 510, + "category_name": "vllm-inference", + "metric_name": "vllm:gpu_cache_usage_perc", + "query_template": "max by ({group_by})(vllm:gpu_cache_usage_perc{{{labels}}})", + "time_window": null, + "options": { + "filter_labels": [ + "deployment_id" + ], + "group_labels": [ + "deployment_id" + ] + } + }, + { + "id": "aab38882-1c45-40ae-bd71-d60e20a76849", + "name": "vLLM GPU KV Cache Usage Ratio (min)", + "description": "GPU KV cache usage ratio (0.0-1.0) reported by vLLM, minimum across the selected grouping", + "rank": 520, + "category_name": "vllm-inference", + "metric_name": "vllm:gpu_cache_usage_perc", + "query_template": "min by ({group_by})(vllm:gpu_cache_usage_perc{{{labels}}})", + "time_window": null, + "options": { + "filter_labels": [ + "deployment_id" + ], + "group_labels": [ + "deployment_id" + ] + } + }, { "id": "73f67248-497b-4794-a71a-562fcebc248a", - "name": "vllm_request_throughput", - "description": "Request success rate per second over 5 minutes", - "rank": 700, + "name": "vLLM Successful Requests per Second — 5-Minute Rate (sum)", + "description": "Per-second rate of vLLM successful request completions over a 5-minute window, summed across the selected grouping", + "rank": 600, "category_name": "vllm-inference", "metric_name": "vllm:request_success_total", "query_template": "sum by ({group_by})(rate(vllm:request_success_total{{{labels}}}[{window}]))", @@ -159,11 +412,47 @@ ] } }, + { + "id": "4b4cf413-c00a-464d-8694-ab0de1eb0d6d", + "name": "vLLM Successful Requests per Second — 5-Minute Rate (avg)", + "description": "Per-second rate of vLLM successful request completions over a 5-minute window, averaged across the selected grouping", + "rank": 610, + "category_name": "vllm-inference", + "metric_name": "vllm:request_success_total", + "query_template": "avg by ({group_by})(rate(vllm:request_success_total{{{labels}}}[{window}]))", + "time_window": "5m", + "options": { + "filter_labels": [ + "deployment_id" + ], + "group_labels": [ + "deployment_id" + ] + } + }, + { + "id": "0a4762f4-1eb9-46ff-96f2-ffaadcfc4957", + "name": "vLLM Successful Requests per Second — 5-Minute Rate (max)", + "description": "Per-second rate of vLLM successful request completions over a 5-minute window, maximum across the selected grouping", + "rank": 620, + "category_name": "vllm-inference", + "metric_name": "vllm:request_success_total", + "query_template": "max by ({group_by})(rate(vllm:request_success_total{{{labels}}}[{window}]))", + "time_window": "5m", + "options": { + "filter_labels": [ + "deployment_id" + ], + "group_labels": [ + "deployment_id" + ] + } + }, { "id": "119da10b-23df-4579-8c8b-d6bff18ee1b8", - "name": "vllm_avg_latency", - "description": "Average end-to-end request latency over 5 minutes", - "rank": 800, + "name": "vLLM End-to-End Request Latency — 5-Minute Average (avg)", + "description": "End-to-end request latency averaged over a 5-minute window, averaged across the selected grouping", + "rank": 700, "category_name": "vllm-inference", "metric_name": "vllm:e2e_request_latency_seconds", "query_template": "avg by ({group_by})(rate(vllm:e2e_request_latency_seconds_sum{{{labels}}}[{window}]) / rate(vllm:e2e_request_latency_seconds_count{{{labels}}}[{window}]))", @@ -176,6 +465,24 @@ "deployment_id" ] } + }, + { + "id": "c658562f-6740-4347-ad1a-42bb76a848c9", + "name": "vLLM End-to-End Request Latency — 5-Minute Average (max)", + "description": "End-to-end request latency averaged over a 5-minute window, maximum across the selected grouping", + "rank": 710, + "category_name": "vllm-inference", + "metric_name": "vllm:e2e_request_latency_seconds", + "query_template": "max by ({group_by})(rate(vllm:e2e_request_latency_seconds_sum{{{labels}}}[{window}]) / rate(vllm:e2e_request_latency_seconds_count{{{labels}}}[{window}]))", + "time_window": "5m", + "options": { + "filter_labels": [ + "deployment_id" + ], + "group_labels": [ + "deployment_id" + ] + } } ] } diff --git a/src/ai/backend/manager/models/alembic/versions/7af18070fdef_expand_prometheus_query_presets.py b/src/ai/backend/manager/models/alembic/versions/7af18070fdef_expand_prometheus_query_presets.py new file mode 100644 index 00000000000..8795dcb5887 --- /dev/null +++ b/src/ai/backend/manager/models/alembic/versions/7af18070fdef_expand_prometheus_query_presets.py @@ -0,0 +1,393 @@ +"""expand_prometheus_query_presets + +Revision ID: 7af18070fdef +Revises: 1a2b3c4d5e6f +Create Date: 2026-05-27 00:00:00.000000 + +# Part of: 26.3.0 (main) +""" + +import json +import textwrap +import uuid +from typing import Any, cast + +import sqlalchemy as sa +from alembic import op + +revision = "7af18070fdef" +down_revision = "1a2b3c4d5e6f" +branch_labels = None +depends_on = None + +# Matched by name because the previous seed migration generated UUIDs at insert +# time (uuid_generate_v4), so the row id differs per environment. +CONTAINER_RENAMES: list[dict[str, Any]] = [ + { + "old_name": "container_gauge", + "new_name": "Per-Kernel Resource Metric — Instant Value (sum)", + "description": "Instant value of the per-kernel resource utilization gauge, summed across the selected grouping", + "rank": 100, + }, + { + "old_name": "container_diff", + "new_name": "Per-Kernel Resource Metric — 5-Minute Rate (sum)", + "description": "Per-second rate of the per-kernel resource utilization gauge over a 5-minute window, summed across the selected grouping", + "rank": 200, + }, +] + +CONTAINER_OPTIONS = json.dumps({ + "filter_labels": [ + "container_metric_name", + "kernel_id", + "session_id", + "agent_id", + "user_id", + "project_id", + "value_type", + ], + "group_labels": [ + "kernel_id", + "session_id", + "agent_id", + "user_id", + "project_id", + "value_type", + ], +}) + +CONTAINER_INSERTIONS: list[dict[str, Any]] = [ + { + "id": "106513b9-0a8b-4b07-9820-4ce541441509", + "name": "Per-Kernel Resource Metric — Instant Value (avg)", + "description": "Instant value of the per-kernel resource utilization gauge, averaged across the selected grouping", + "rank": 110, + "metric_name": "backendai_container_utilization", + "query_template": "avg by ({group_by})(backendai_container_utilization{{{labels}}})", + "time_window": None, + }, + { + "id": "0d8023df-807b-4fe1-953a-6bfb3870a9e4", + "name": "Per-Kernel Resource Metric — Instant Value (max)", + "description": "Instant value of the per-kernel resource utilization gauge, maximum across the selected grouping", + "rank": 120, + "metric_name": "backendai_container_utilization", + "query_template": "max by ({group_by})(backendai_container_utilization{{{labels}}})", + "time_window": None, + }, + { + "id": "8d97af1c-67d5-4ef6-af30-3cfe4689dc99", + "name": "Per-Kernel Resource Metric — Instant Value (min)", + "description": "Instant value of the per-kernel resource utilization gauge, minimum across the selected grouping", + "rank": 130, + "metric_name": "backendai_container_utilization", + "query_template": "min by ({group_by})(backendai_container_utilization{{{labels}}})", + "time_window": None, + }, + { + "id": "234c0e84-fe6f-46ec-87b9-d177cb1c9b85", + "name": "Per-Kernel Resource Metric — 5-Minute Rate (avg)", + "description": "Per-second rate of the per-kernel resource utilization gauge over a 5-minute window, averaged across the selected grouping", + "rank": 210, + "metric_name": "backendai_container_utilization", + "query_template": "avg by ({group_by})(rate(backendai_container_utilization{{{labels}}}[{window}]))", + "time_window": "5m", + }, + { + "id": "40b3b678-ab2a-4dcc-9045-120bd208a19f", + "name": "Per-Kernel Resource Metric — 5-Minute Rate (max)", + "description": "Per-second rate of the per-kernel resource utilization gauge over a 5-minute window, maximum across the selected grouping", + "rank": 220, + "metric_name": "backendai_container_utilization", + "query_template": "max by ({group_by})(rate(backendai_container_utilization{{{labels}}}[{window}]))", + "time_window": "5m", + }, + { + "id": "537ffea7-dc84-457e-b663-52e21be34085", + "name": "Per-Kernel Resource Metric — 5-Minute Rate (min)", + "description": "Per-second rate of the per-kernel resource utilization gauge over a 5-minute window, minimum across the selected grouping", + "rank": 230, + "metric_name": "backendai_container_utilization", + "query_template": "min by ({group_by})(rate(backendai_container_utilization{{{labels}}}[{window}]))", + "time_window": "5m", + }, +] + +# vLLM presets were never seeded into production DBs by a previous migration, +# but the example fixture seeds 5 of these ids under legacy names. All 14 rows +# are upserted by id, so both production and fixture-seeded DBs converge. +VLLM_INSERTIONS: list[dict[str, Any]] = [ + { + "id": "2f0634e3-a976-4eeb-ba01-1e1829965453", + "name": "vLLM Inflight Requests (sum)", + "description": "Number of requests currently being processed by vLLM, summed across the selected grouping", + "rank": 300, + "metric_name": "vllm:num_requests_running", + "query_template": "sum by ({group_by})(vllm:num_requests_running{{{labels}}})", + "time_window": None, + }, + { + "id": "52ca4304-4f61-414c-a69f-66a70b25a637", + "name": "vLLM Inflight Requests (avg)", + "description": "Number of requests currently being processed by vLLM, averaged across the selected grouping", + "rank": 310, + "metric_name": "vllm:num_requests_running", + "query_template": "avg by ({group_by})(vllm:num_requests_running{{{labels}}})", + "time_window": None, + }, + { + "id": "0520d769-0240-46db-a8ac-e557f7739d56", + "name": "vLLM Inflight Requests (max)", + "description": "Number of requests currently being processed by vLLM, maximum across the selected grouping", + "rank": 320, + "metric_name": "vllm:num_requests_running", + "query_template": "max by ({group_by})(vllm:num_requests_running{{{labels}}})", + "time_window": None, + }, + { + "id": "2a0939c6-b634-4f1e-ac8c-8738d7bbc244", + "name": "vLLM Queued Requests (sum)", + "description": "Number of requests waiting in the vLLM queue, summed across the selected grouping", + "rank": 400, + "metric_name": "vllm:num_requests_waiting", + "query_template": "sum by ({group_by})(vllm:num_requests_waiting{{{labels}}})", + "time_window": None, + }, + { + "id": "598d0148-c3d5-4282-b31c-ea05e7fca98c", + "name": "vLLM Queued Requests (avg)", + "description": "Number of requests waiting in the vLLM queue, averaged across the selected grouping", + "rank": 410, + "metric_name": "vllm:num_requests_waiting", + "query_template": "avg by ({group_by})(vllm:num_requests_waiting{{{labels}}})", + "time_window": None, + }, + { + "id": "baf09246-9bbd-4368-91fd-76c6f97233d4", + "name": "vLLM Queued Requests (max)", + "description": "Number of requests waiting in the vLLM queue, maximum across the selected grouping", + "rank": 420, + "metric_name": "vllm:num_requests_waiting", + "query_template": "max by ({group_by})(vllm:num_requests_waiting{{{labels}}})", + "time_window": None, + }, + { + "id": "00c6467d-58da-4285-9166-fa36404f2012", + "name": "vLLM GPU KV Cache Usage Ratio (avg)", + "description": "GPU KV cache usage ratio (0.0-1.0) reported by vLLM, averaged across the selected grouping", + "rank": 500, + "metric_name": "vllm:gpu_cache_usage_perc", + "query_template": "avg by ({group_by})(vllm:gpu_cache_usage_perc{{{labels}}})", + "time_window": None, + }, + { + "id": "4bed8110-4718-4300-a39d-d64a25e6aa6e", + "name": "vLLM GPU KV Cache Usage Ratio (max)", + "description": "GPU KV cache usage ratio (0.0-1.0) reported by vLLM, maximum across the selected grouping", + "rank": 510, + "metric_name": "vllm:gpu_cache_usage_perc", + "query_template": "max by ({group_by})(vllm:gpu_cache_usage_perc{{{labels}}})", + "time_window": None, + }, + { + "id": "aab38882-1c45-40ae-bd71-d60e20a76849", + "name": "vLLM GPU KV Cache Usage Ratio (min)", + "description": "GPU KV cache usage ratio (0.0-1.0) reported by vLLM, minimum across the selected grouping", + "rank": 520, + "metric_name": "vllm:gpu_cache_usage_perc", + "query_template": "min by ({group_by})(vllm:gpu_cache_usage_perc{{{labels}}})", + "time_window": None, + }, + { + "id": "73f67248-497b-4794-a71a-562fcebc248a", + "name": "vLLM Successful Requests per Second — 5-Minute Rate (sum)", + "description": "Per-second rate of vLLM successful request completions over a 5-minute window, summed across the selected grouping", + "rank": 600, + "metric_name": "vllm:request_success_total", + "query_template": "sum by ({group_by})(rate(vllm:request_success_total{{{labels}}}[{window}]))", + "time_window": "5m", + }, + { + "id": "4b4cf413-c00a-464d-8694-ab0de1eb0d6d", + "name": "vLLM Successful Requests per Second — 5-Minute Rate (avg)", + "description": "Per-second rate of vLLM successful request completions over a 5-minute window, averaged across the selected grouping", + "rank": 610, + "metric_name": "vllm:request_success_total", + "query_template": "avg by ({group_by})(rate(vllm:request_success_total{{{labels}}}[{window}]))", + "time_window": "5m", + }, + { + "id": "0a4762f4-1eb9-46ff-96f2-ffaadcfc4957", + "name": "vLLM Successful Requests per Second — 5-Minute Rate (max)", + "description": "Per-second rate of vLLM successful request completions over a 5-minute window, maximum across the selected grouping", + "rank": 620, + "metric_name": "vllm:request_success_total", + "query_template": "max by ({group_by})(rate(vllm:request_success_total{{{labels}}}[{window}]))", + "time_window": "5m", + }, + { + "id": "119da10b-23df-4579-8c8b-d6bff18ee1b8", + "name": "vLLM End-to-End Request Latency — 5-Minute Average (avg)", + "description": "End-to-end request latency averaged over a 5-minute window, averaged across the selected grouping", + "rank": 700, + "metric_name": "vllm:e2e_request_latency_seconds", + "query_template": "avg by ({group_by})(rate(vllm:e2e_request_latency_seconds_sum{{{labels}}}[{window}]) / rate(vllm:e2e_request_latency_seconds_count{{{labels}}}[{window}]))", + "time_window": "5m", + }, + { + "id": "c658562f-6740-4347-ad1a-42bb76a848c9", + "name": "vLLM End-to-End Request Latency — 5-Minute Average (max)", + "description": "End-to-end request latency averaged over a 5-minute window, maximum across the selected grouping", + "rank": 710, + "metric_name": "vllm:e2e_request_latency_seconds", + "query_template": "max by ({group_by})(rate(vllm:e2e_request_latency_seconds_sum{{{labels}}}[{window}]) / rate(vllm:e2e_request_latency_seconds_count{{{labels}}}[{window}]))", + "time_window": "5m", + }, +] + +VLLM_OPTIONS = json.dumps({ + "filter_labels": ["deployment_id"], + "group_labels": ["deployment_id"], +}) + + +def _seed_category(conn: sa.Connection, name: str, description: str) -> uuid.UUID: + # Insert only if the category is missing (name is unique), letting the DB + # generate the id, then return whichever id is now in place — the existing + # one if it was already seeded, otherwise the freshly generated one. + conn.execute( + sa.text( + textwrap.dedent("""\ + INSERT INTO prometheus_query_preset_categories (name, description) + SELECT :name, :description + WHERE NOT EXISTS ( + SELECT 1 FROM prometheus_query_preset_categories + WHERE name = CAST(:name AS varchar) + ) + """) + ), + parameters={ + "name": name, + "description": description, + }, + ) + return cast( + uuid.UUID, + conn.execute( + sa.text("SELECT id FROM prometheus_query_preset_categories WHERE name = :name"), + parameters={"name": name}, + ).scalar_one(), + ) + + +def _upsert_presets( + conn: sa.Connection, + presets: list[dict[str, Any]], + category_id: uuid.UUID, + options: str, +) -> None: + # Upsert by id (the table's only unique key): environments seeded from the + # example fixture already hold these ids under their old names, so a plain + # insert would hit the primary key. ON CONFLICT converges those rows to the + # new definitions, while fresh production DBs simply insert. + for preset in presets: + conn.execute( + sa.text( + textwrap.dedent("""\ + INSERT INTO prometheus_query_presets + (id, name, description, rank, category_id, + metric_name, query_template, time_window, options) + VALUES (CAST(:id AS uuid), :name, :description, :rank, + CAST(:category_id AS uuid), + :metric_name, :query_template, :time_window, + CAST(:options AS jsonb)) + ON CONFLICT (id) DO UPDATE SET + name = EXCLUDED.name, + description = EXCLUDED.description, + rank = EXCLUDED.rank, + category_id = EXCLUDED.category_id, + metric_name = EXCLUDED.metric_name, + query_template = EXCLUDED.query_template, + time_window = EXCLUDED.time_window, + options = EXCLUDED.options + """) + ), + parameters={ + **preset, + "category_id": str(category_id), + "options": options, + }, + ) + + +def _rename_presets( + conn: sa.Connection, renames: list[dict[str, Any]], category_id: uuid.UUID +) -> None: + # Rename existing seeded presets only if the row is still in its original + # state (name unchanged), preserving any user customization. + for rename in renames: + conn.execute( + sa.text( + textwrap.dedent("""\ + UPDATE prometheus_query_presets + SET name = :new_name, + description = :description, + rank = :rank, + category_id = CAST(:category_id AS uuid) + WHERE name = CAST(:old_name AS varchar) + """) + ), + parameters={ + **rename, + "category_id": str(category_id), + }, + ) + + +def _delete_presets(conn: sa.Connection, names: list[str]) -> None: + for name in names: + conn.execute( + sa.text("DELETE FROM prometheus_query_presets WHERE name = CAST(:name AS varchar)"), + parameters={"name": name}, + ) + + +def upgrade() -> None: + conn = op.get_bind() + + container_category_id = _seed_category( + conn, + name="container", + description="Container-level utilization metrics collected by Backend.AI agents", + ) + vllm_category_id = _seed_category( + conn, + name="vllm-inference", + description="vLLM inference runtime metrics scraped from model serving endpoints", + ) + + _rename_presets(conn, renames=CONTAINER_RENAMES, category_id=container_category_id) + + # Drop container_rate: its sum(rate)/5.0 normalization doesn't compose with + # the new avg/min/max variants. + _delete_presets(conn, names=["container_rate"]) + + _upsert_presets( + conn, + presets=CONTAINER_INSERTIONS, + category_id=container_category_id, + options=CONTAINER_OPTIONS, + ) + _upsert_presets( + conn, + presets=VLLM_INSERTIONS, + category_id=vllm_category_id, + options=VLLM_OPTIONS, + ) + + +def downgrade() -> None: + # Data-only migration: seeded rows are not removed on downgrade + # to avoid deleting user-modified presets that share the same identifiers. + pass