diff --git a/ocs_ci/helpers/ocs_metrics_exporter_helpers.py b/ocs_ci/helpers/ocs_metrics_exporter_helpers.py
new file mode 100644
index 000000000000..dadbef9b721c
--- /dev/null
+++ b/ocs_ci/helpers/ocs_metrics_exporter_helpers.py
@@ -0,0 +1,302 @@
+# -*- coding: utf8 -*-
+"""
+Helpers for RHSTOR-7964 / ocs-metrics-exporter validation (internal and provider paths).
+
+The metrics scrape follows the manual QE procedure: for TLS metrics on port 8443,
+create a token with ``oc create token prometheus-k8s -n openshift-monitoring`` and run
+``curl -sk -H 'Authorization: Bearer …' https://localhost:8443/metrics`` from inside the pod.
+"""
+
+import logging
+import re
+import shlex
+
+from ocs_ci.framework import config
+from ocs_ci.ocs import constants
+from ocs_ci.ocs.exceptions import CommandFailed
+from ocs_ci.ocs.resources.pod import Pod, get_pods_having_label
+from ocs_ci.utility.utils import exec_cmd
+
+
+logger = logging.getLogger(__name__)
+
+
+PROMETHEUS_K8S_SA = "prometheus-k8s"
+OPENSHIFT_MONITORING_NS = "openshift-monitoring"
+
+
+def get_ocs_metrics_exporter_pod(namespace=None):
+    """
+    Return the single running ocs-metrics-exporter Pod object, or None if not found.
+
+    Args:
+        namespace (str): Storage namespace; defaults from config.
+
+    Returns:
+        Pod or None
+    """
+    namespace = namespace or config.ENV_DATA["cluster_namespace"]
+    pods = get_pods_having_label(constants.OCS_METRICS_EXPORTER, namespace=namespace)
+    running = [
+        p for p in pods if p.get("status", {}).get("phase") == constants.STATUS_RUNNING
+    ]
+    if not running:
+        return None
+    return Pod(**running[0])
+
+
+def resolve_metrics_endpoint(pod_obj):
+    """
+    Resolve /metrics URL and curl options from pod container ports.
+
+    Prefers HTTPS on 8443 (RHSTOR-7964 / kube TLS stack) over plain HTTP metrics.
+
+    Args:
+        pod_obj (Pod): ocs-metrics-exporter pod
+
+    Returns:
+        dict: keys ``url`` (str), ``tls_skip_verify`` (bool), ``bearer_auth`` (bool)
+    """
+    https_port = None
+    http_port = None
+    for container in pod_obj.pod_data.get("spec", {}).get("containers", []):
+        for port_def in container.get("ports") or []:
+            name = (port_def.get("name") or "").lower()
+            container_port = port_def.get("containerPort")
+            if not container_port:
+                continue
+            if container_port == 8443 or "https" in name:
+                https_port = container_port
+            elif "metric" in name or name in ("http", "probe"):
+                http_port = container_port
+
+    if https_port:
+        return {
+            "url": f"https://127.0.0.1:{https_port}/metrics",
+            "tls_skip_verify": True,
+            "bearer_auth": True,
+        }
+    port = http_port or 8080
+    return {
+        "url": f"http://127.0.0.1:{port}/metrics",
+        "tls_skip_verify": False,
+        "bearer_auth": False,
+    }
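+
+
+# Illustrative sketch only (values assumed, not taken from a live cluster): the dict
+# returned above is what scrape_metrics_text_sample() turns into a curl invocation.
+# For the TLS listener on 8443 the mapping is roughly (timeouts omitted):
+#   {"url": "https://127.0.0.1:8443/metrics", "tls_skip_verify": True, "bearer_auth": True}
+#   -> curl -sS -f -k -H "Authorization: Bearer <token>" https://127.0.0.1:8443/metrics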
+ """ + base_cmd = f"oc create token {PROMETHEUS_K8S_SA} -n {OPENSHIFT_MONITORING_NS}" + last_exc = None + for suffix in (" --duration=15m", ""): + cmd = base_cmd + suffix + try: + completed = exec_cmd(cmd, secrets=[]) + token = (completed.stdout or "").strip() + if token: + return token + except CommandFailed as exc: + last_exc = exc + continue + msg = ( + "failed to create prometheus-k8s token in openshift-monitoring " + "(tried with and without --duration); check OCP version and RBAC" + ) + if last_exc: + raise CommandFailed(msg) from last_exc + raise CommandFailed(msg) + + +def scrape_metrics_text_sample(pod_obj, bearer_token=None, max_bytes=8192): + """ + Curl /metrics from inside the exporter pod (loopback), matching manual QE. + + For HTTPS (e.g. 8443), uses ``curl -sk`` and ``Authorization: Bearer`` from + ``prometheus-k8s`` unless ``bearer_token`` is passed in. + + Args: + pod_obj (Pod): exporter pod + bearer_token (str): optional pre-created token; if None and bearer auth is + required, ``create_prometheus_k8s_bearer_token()`` is used. + max_bytes (int): limit response size for logging and assertions + + Returns: + str: beginning of Prometheus text exposition + """ + endpoint = resolve_metrics_endpoint(pod_obj) + url = endpoint["url"] + secrets = [] + parts = [ + "curl", + "-sS", + "--connect-timeout", + "5", + "--max-time", + "15", + "-f", + ] + if endpoint["tls_skip_verify"]: + parts.append("-k") + if endpoint["bearer_auth"]: + token = bearer_token or create_prometheus_k8s_bearer_token() + secrets.append(token) + parts.extend(["-H", f"Authorization: Bearer {token}"]) + parts.append(url) + inner = " ".join(shlex.quote(p) for p in parts) + f" | head -c {max_bytes}" + cmd = f"sh -c {shlex.quote(inner)}" + return pod_obj.exec_cmd_on_pod( + cmd, out_yaml_format=False, secrets=secrets if secrets else None + ) + + +def scrape_full_metrics_text(pod_obj, bearer_token=None, max_bytes=65536): + """ + Curl the full /metrics body (up to ``max_bytes``) from inside the exporter pod. + + Unlike ``scrape_metrics_text_sample`` this fetches a larger payload suitable for + metric-level assertions. + + Args: + pod_obj (Pod): ocs-metrics-exporter pod + bearer_token (str): optional pre-created bearer token + max_bytes (int): cap response size + + Returns: + str: Prometheus text exposition body + """ + return scrape_metrics_text_sample( + pod_obj, bearer_token=bearer_token, max_bytes=max_bytes + ) + + +def assert_prometheus_exposition_text(text): + """ + Assert the payload looks like Prometheus text exposition (not HTML/JSON error page). + + Args: + text (str): body from /metrics + + Raises: + AssertionError: if body does not match minimal Prometheus text format heuristics. + """ + assert text and text.strip(), "metrics endpoint returned an empty body" + stripped = text.lstrip() + first_line = stripped.split("\n", 1)[0] + prom_comment = first_line.startswith("# HELP") or first_line.startswith("# TYPE") + prom_metric = bool(re.match(r"^[a-zA-Z_:][a-zA-Z0-9_:]*(?:\{|\s)", first_line)) + assert prom_comment or prom_metric, ( + "expected Prometheus text format from /metrics (line starting with " + f"'# HELP', '# TYPE', or metric_name); got first line: {first_line[:200]!r}" + ) + + +def skip_if_no_provider_with_consumers(): + """ + Skip the calling test if not running on a provider with onboarded consumers. + + Raises: + pytest.skip: if single-cluster or no consumer/client in multicluster config. 
+ """ + import pytest + + if not getattr(config, "multicluster", False): + pytest.skip("single-cluster deployment; requires provider+consumer") + if not (config.is_consumer_exist() or config.hci_client_exist()): + pytest.skip("no consumer/client clusters in multicluster config") + + +def parse_metric_families(metrics_text): + """ + Parse raw Prometheus text exposition into a dict of metric name -> list of samples. + + Each sample is a dict with keys ``labels`` (dict) and ``value`` (str). + + Args: + metrics_text (str): raw Prometheus text from /metrics + + Returns: + dict: {metric_name: [{"labels": {...}, "value": str}, ...]} + """ + families = {} + sample_re = re.compile( + r"^([a-zA-Z_:][a-zA-Z0-9_:]*)(?:\{([^}]*)\})?\s+(\S+)(?:\s+\S+)?$" + ) + for line in metrics_text.splitlines(): + line = line.strip() + if not line or line.startswith("#"): + continue + m = sample_re.match(line) + if not m: + continue + name = m.group(1) + labels_str = m.group(2) or "" + value = m.group(3) + labels = {} + if labels_str: + for pair in re.findall(r'(\w+)="([^"]*)"', labels_str): + labels[pair[0]] = pair[1] + families.setdefault(name, []).append({"labels": labels, "value": value}) + return families + + +def assert_metric_present(families, metric_name): + """ + Assert that a named metric has at least one sample in the parsed families. + + Args: + families (dict): output of ``parse_metric_families`` + metric_name (str): e.g. ``ocs_rbd_pv_metadata`` + + Raises: + AssertionError: if metric is missing + """ + assert ( + metric_name in families and families[metric_name] + ), f"metric {metric_name!r} not found in /metrics output" + + +def assert_metric_has_consumer_name(families, metric_name): + """ + Assert every sample of ``metric_name`` carries a ``consumer_name`` label. + + Args: + families (dict): output of ``parse_metric_families`` + metric_name (str): metric to check + + Raises: + AssertionError + """ + assert_metric_present(families, metric_name) + for sample in families[metric_name]: + assert ( + "consumer_name" in sample["labels"] + ), f"metric {metric_name!r} sample missing consumer_name label: {sample!r}" + + +def get_consumer_names_from_metrics(families, metric_name): + """ + Extract the set of distinct ``consumer_name`` label values from a metric. + + Args: + families (dict): output of ``parse_metric_families`` + metric_name (str): metric to inspect + + Returns: + set: unique consumer_name values + """ + assert_metric_present(families, metric_name) + return { + s["labels"]["consumer_name"] + for s in families[metric_name] + if "consumer_name" in s["labels"] + } diff --git a/tests/functional/monitoring/prometheus/metrics/test_ocs_exporter_provider.py b/tests/functional/monitoring/prometheus/metrics/test_ocs_exporter_provider.py new file mode 100644 index 000000000000..dbb876124e19 --- /dev/null +++ b/tests/functional/monitoring/prometheus/metrics/test_ocs_exporter_provider.py @@ -0,0 +1,100 @@ +# -*- coding: utf8 -*- +""" +Provider-mode metric label validation with consumer_name. +per-client metrics carry consumer_name label +on the provider cluster when remote consumers are onboarded. 
+""" + +import logging + +import pytest + +from ocs_ci.framework import config +from ocs_ci.framework.pytest_customization.marks import ( + blue_squad, + runs_on_provider, + skipif_external_mode, + skipif_hci_client, + skipif_mcg_only, + skipif_ms_consumer, +) +from ocs_ci.framework.testlib import skipif_managed_service, tier1 +from ocs_ci.helpers import ocs_metrics_exporter_helpers as ome_helpers + + +logger = logging.getLogger(__name__) + + +def _get_provider_exporter_families(): + """ + Scrape /metrics from ocs-metrics-exporter on provider, return parsed families. + """ + namespace = config.ENV_DATA["cluster_namespace"] + pod = ome_helpers.get_ocs_metrics_exporter_pod(namespace) + if pod is None: + pytest.skip("ocs-metrics-exporter not deployed on provider") + body = ome_helpers.scrape_full_metrics_text(pod) + ome_helpers.assert_prometheus_exposition_text(body) + return ome_helpers.parse_metric_families(body) + + +@blue_squad +@tier1 +@skipif_managed_service +@skipif_external_mode +@skipif_mcg_only +@skipif_ms_consumer +@skipif_hci_client +@runs_on_provider +@pytest.mark.parametrize( + "metric_name", + [ + pytest.param("ocs_rbd_pv_metadata", marks=pytest.mark.polarion_id("ocs-tm014")), + pytest.param( + "ocs_rbd_children_count", marks=pytest.mark.polarion_id("ocs-tm015") + ), + pytest.param( + "ocs_rbd_mirror_image_state", marks=pytest.mark.polarion_id("ocs-tm016") + ), + pytest.param( + "ocs_pool_mirroring_image_health", + marks=pytest.mark.polarion_id("ocs-tm017"), + ), + pytest.param( + "ocs_pool_mirroring_status", marks=pytest.mark.polarion_id("ocs-tm017") + ), + pytest.param( + "ocs_rbd_client_blocklisted", marks=pytest.mark.polarion_id("ocs-tm023") + ), + ], +) +def test_consumer_name_label_on_provider_metric(metric_name): + """ + TC-7.1 through TC-7.5 consolidated: Verify per-client metrics carry consumer_name + label on the provider cluster. + + Each metric is expected to have consumer_name populated for remote client workloads. + Metrics that require mirroring or specific setup are skipped if not present. + + Polarion: + ocs-tm014, ocs-tm015, ocs-tm016, ocs-tm017, ocs-tm023 + """ + ome_helpers.skip_if_no_provider_with_consumers() + families = _get_provider_exporter_families() + if metric_name not in families: + pytest.skip( + f"{metric_name} not present in /metrics " + "(may need mirroring, clones, or blocklist setup)" + ) + ome_helpers.assert_metric_has_consumer_name(families, metric_name) + + consumer_names = ome_helpers.get_consumer_names_from_metrics(families, metric_name) + logger.info( + "metric %s: consumer_name values found: %s", + metric_name, + consumer_names, + ) + assert len(consumer_names) >= 1, ( + f"expected at least one consumer_name value for {metric_name}; " + f"got {consumer_names}" + )