diff --git a/.secrets.baseline b/.secrets.baseline index 8e0612d5f0f0..f81c0506555d 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -645,7 +645,7 @@ "hashed_secret": "a12337323b638ab044b1166bff4b1a1f83162819", "is_secret": false, "is_verified": false, - "line_number": 829, + "line_number": 832, "type": "Secret Keyword", "verified_result": null }, @@ -653,7 +653,7 @@ "hashed_secret": "fb947972c92f052c0a08866d182be0075a2b601b", "is_secret": false, "is_verified": false, - "line_number": 838, + "line_number": 841, "type": "Secret Keyword", "verified_result": null }, @@ -661,7 +661,7 @@ "hashed_secret": "03e227627ab8681281fdb8aa3d799b03f782d672", "is_secret": false, "is_verified": false, - "line_number": 2031, + "line_number": 2034, "type": "Secret Keyword", "verified_result": null }, @@ -669,7 +669,7 @@ "hashed_secret": "ef5f3d909f23bd0aa02b4253f98350384f709c86", "is_secret": false, "is_verified": false, - "line_number": 2138, + "line_number": 2141, "type": "Secret Keyword", "verified_result": null }, @@ -677,7 +677,7 @@ "hashed_secret": "cb1ae2b504c4615841d8144267a131231d2bd677", "is_secret": false, "is_verified": false, - "line_number": 2139, + "line_number": 2142, "type": "Secret Keyword", "verified_result": null }, @@ -685,7 +685,7 @@ "hashed_secret": "1a1e70e87dd0452c42f33ce9bf74aa28134dba6b", "is_secret": false, "is_verified": false, - "line_number": 2140, + "line_number": 2143, "type": "Secret Keyword", "verified_result": null }, @@ -693,7 +693,7 @@ "hashed_secret": "7b1ba2f04f2f1604dc4e3caffcadf9fcbce7df5b", "is_secret": false, "is_verified": false, - "line_number": 2141, + "line_number": 2144, "type": "Secret Keyword", "verified_result": null }, @@ -701,7 +701,7 @@ "hashed_secret": "0fa3b21ced80146d752888f2b60ec80e0d4b8925", "is_secret": false, "is_verified": false, - "line_number": 2146, + "line_number": 2149, "type": "Secret Keyword", "verified_result": null }, @@ -709,7 +709,7 @@ "hashed_secret": "f084f2068494b8d1cd06811dd97d02c3d85f40ee", "is_secret": false, "is_verified": false, - "line_number": 2161, + "line_number": 2164, "type": "Secret Keyword", "verified_result": null }, @@ -717,7 +717,7 @@ "hashed_secret": "adfa401a3b0a733d8f00519ac8c6b3893a2e7e8e", "is_secret": false, "is_verified": false, - "line_number": 2162, + "line_number": 2165, "type": "Secret Keyword", "verified_result": null }, @@ -725,7 +725,7 @@ "hashed_secret": "898e46bbadc12f87120548bd445eb4210c8407c8", "is_secret": false, "is_verified": false, - "line_number": 2170, + "line_number": 2173, "type": "Secret Keyword", "verified_result": null }, @@ -733,7 +733,7 @@ "hashed_secret": "f57ccec6b8f7b12b635ab53d26c3bf7300247341", "is_secret": false, "is_verified": false, - "line_number": 2171, + "line_number": 2174, "type": "Secret Keyword", "verified_result": null }, @@ -741,7 +741,7 @@ "hashed_secret": "77b044ea736f8cbe568d1954424186d901f89db9", "is_secret": false, "is_verified": false, - "line_number": 2172, + "line_number": 2175, "type": "Secret Keyword", "verified_result": null }, @@ -749,7 +749,7 @@ "hashed_secret": "d64368f12ca17c69568c6a132f17d44d56e60660", "is_secret": false, "is_verified": false, - "line_number": 2173, + "line_number": 2176, "type": "Secret Keyword", "verified_result": null }, @@ -757,7 +757,7 @@ "hashed_secret": "8f9ca35156c02cb6ba58c5b51230b9bedc38de4f", "is_secret": false, "is_verified": false, - "line_number": 2174, + "line_number": 2177, "type": "Secret Keyword", "verified_result": null }, @@ -765,7 +765,7 @@ "hashed_secret": "9ec53cfd9929c70c3f87c210b6a7b77fb6d79d43", "is_secret": false, "is_verified": false, - "line_number": 2767, + "line_number": 2773, "type": "Secret Keyword", "verified_result": null }, @@ -773,7 +773,7 @@ "hashed_secret": "ee977806d7286510da8b9a7492ba58e2484c0ecc", "is_secret": false, "is_verified": false, - "line_number": 2924, + "line_number": 2930, "type": "Secret Keyword", "verified_result": null }, @@ -781,7 +781,7 @@ "hashed_secret": "adc1f5c8707f7d7aba3aabe13c15e5ef1151872e", "is_secret": false, "is_verified": false, - "line_number": 2925, + "line_number": 2931, "type": "Secret Keyword", "verified_result": null }, @@ -789,7 +789,7 @@ "hashed_secret": "ee46262b2df945e46ea310b925ad087465dbd3f2", "is_secret": false, "is_verified": false, - "line_number": 3646, + "line_number": 3652, "type": "Secret Keyword", "verified_result": null }, @@ -797,7 +797,7 @@ "hashed_secret": "f678cad4ab874d71b559a069d5e34a95fe38a480", "is_secret": false, "is_verified": false, - "line_number": 3647, + "line_number": 3653, "type": "Secret Keyword", "verified_result": null } diff --git a/ocs_ci/deployment/helpers/external_cluster_helpers.py b/ocs_ci/deployment/helpers/external_cluster_helpers.py index 0a002ce72242..3583790c72f5 100644 --- a/ocs_ci/deployment/helpers/external_cluster_helpers.py +++ b/ocs_ci/deployment/helpers/external_cluster_helpers.py @@ -41,6 +41,7 @@ encode, decode, download_file, + exec_cmd, wait_for_machineconfigpool_status, create_config_ini_file, ) @@ -120,14 +121,11 @@ def __init__(self, host, user, password=None, ssh_key=None): "No SSH Auth to connect to external RHCS cluster provided! " "Either password or SSH key is missing in EXTERNAL_MODE['login'] section!" ) - # adding jump host configuration to connect to external RHCS cluster on ibmcloud via jump host - self.jump_host = None - if config.ENV_DATA.get("platform") == constants.IBMCLOUD_PLATFORM: - self.jump_host = config.DEPLOYMENT.get("ssh_jump_host", None) - if self.jump_host and not self.jump_host.get("private_key"): - self.jump_host["private_key"] = os.path.expanduser( - config.DEPLOYMENT["ssh_key_private"] - ) + self.jump_host = config.DEPLOYMENT.get("ssh_jump_host") + if self.jump_host and not self.jump_host.get("private_key"): + ssh_key_private = config.DEPLOYMENT.get("ssh_key_private") + if ssh_key_private: + self.jump_host["private_key"] = os.path.expanduser(ssh_key_private) self.rhcs_conn = Connection( host=self.host, @@ -169,30 +167,33 @@ def exec_external_ceph_cmd( raise exception_class(f"{error_msg}: {err}") return retcode, out, err - def get_external_cluster_details(self): + def build_exporter_base_params(self, include_rgw=True): """ - Gets the external RHCS cluster details and updates to config.EXTERNAL_MODE - - Raises: - ExternalClusterExporterRunFailed: If exporter script failed to run on external RHCS cluster + Build the base parameter string for the exporter script. - """ - # get rgw endpoint port - rgw_endpoint_port = self.get_rgw_endpoint_api_port() + Reads cluster config to construct the flags needed by + create-external-cluster-resources.py. This method has no side effects + (does not modify config, delete users, or create namespaces). - # get rgw endpoint - rgw_endpoint = get_rgw_endpoint() - rgw_endpoint_with_port = f"{rgw_endpoint}:{rgw_endpoint_port}" + Args: + include_rgw (bool): If True (default), include --rgw-endpoint. + Set to False for clusters without RGW deployed. - # get ceph filesystem - ceph_fs_name = config.ENV_DATA.get("cephfs_name") or self.get_ceph_fs() + Returns: + str: Parameter string for run_exporter_script(). + """ rbd_name = config.ENV_DATA.get("rbd_name") or defaults.RBD_NAME cluster_name = config.ENV_DATA.get("cluster_name") or defaults.RHCS_CLUSTER_NAME - params = ( - f"--rbd-data-pool-name {rbd_name} --rgw-endpoint {rgw_endpoint_with_port}" - ) + params = f"--rbd-data-pool-name {rbd_name}" + + if include_rgw: + rgw_endpoint_port = self.get_rgw_endpoint_api_port() + rgw_endpoint = get_rgw_endpoint() + params += f" --rgw-endpoint {rgw_endpoint}:{rgw_endpoint_port}" + + ceph_fs_name = config.ENV_DATA.get("cephfs_name") or self.get_ceph_fs() if config.ENV_DATA["restricted-auth-permission"]: if config.ENV_DATA["use_k8s_cluster_name"]: @@ -207,8 +208,6 @@ def get_external_cluster_details(self): f"{params} --restricted-auth-permission true --cluster-name {cluster_name} " f"--alias-rbd-data-pool-name {alias_rbd_name}" ) - config.ENV_DATA["restricted-auth-permission"] = True - config.ENV_DATA["alias_rbd_name"] = alias_rbd_name if config.ENV_DATA.get("rgw-realm"): rgw_realm = config.ENV_DATA["rgw-realm"] @@ -219,6 +218,42 @@ def get_external_cluster_details(self): f"--rgw-zone-name {rgw_zone}" ) + if config.EXTERNAL_MODE.get("run_as_user"): + ceph_user = config.EXTERNAL_MODE["run_as_user"] + params = f"{params} --run-as-user {ceph_user}" + + if config.EXTERNAL_MODE.get("use_rbd_namespace"): + rbd_namespace = config.EXTERNAL_MODE.get("rbd_namespace") + if rbd_namespace: + params = f"{params} --rados-namespace {rbd_namespace}" + if "restricted-auth-permission" not in params: + params += " --restricted-auth-permission true" + if "cluster-name" not in params: + params += f" --k8s-cluster-name {cluster_name}" + + return params + + def get_external_cluster_details(self): + """ + Gets the external RHCS cluster details and updates to config.EXTERNAL_MODE + + Raises: + ExternalClusterExporterRunFailed: If exporter script failed to run on external RHCS cluster + + """ + rbd_name = config.ENV_DATA.get("rbd_name") or defaults.RBD_NAME + cluster_name = config.ENV_DATA.get("cluster_name") or defaults.RHCS_CLUSTER_NAME + + # Side effects that must run before building params so that + # build_exporter_base_params() sees restricted-auth and alias flags + if "." in rbd_name or "_" in rbd_name: + config.ENV_DATA["restricted-auth-permission"] = True + config.ENV_DATA["alias_rbd_name"] = rbd_name.replace(".", "-").replace( + "_", "-" + ) + + params = self.build_exporter_base_params() + # remove user 'rgw-admin-ops-user' if it exists since user creation is handled by # external python script with necessary caps if config.MULTICLUSTER.get( @@ -230,22 +265,21 @@ def get_external_cluster_details(self): else: self.remove_rgw_user() - if config.EXTERNAL_MODE.get("run_as_user"): - ceph_user = config.EXTERNAL_MODE["run_as_user"] - params = f"{params} --run-as-user {ceph_user}" - if config.EXTERNAL_MODE.get("use_rbd_namespace"): rbd_namespace = config.EXTERNAL_MODE.get("rbd_namespace") if not rbd_namespace: rbd_namespace = self.create_rbd_namespace(rbd=rbd_name) config.EXTERNAL_MODE["rbd_namespace"] = rbd_namespace - - params = f"{params} --rados-namespace {rbd_namespace}" - if "restricted-auth-permission" not in params: - params += " --restricted-auth-permission true" + # Append params that build_exporter_base_params skipped + # (namespace didn't exist in config yet when it ran) + params += f" --rados-namespace {rbd_namespace}" + if "restricted-auth-permission" not in params: + params += " --restricted-auth-permission true" + if "cluster-name" not in params: + params += f" --k8s-cluster-name {cluster_name}" + + if not config.ENV_DATA.get("restricted-auth-permission"): config.ENV_DATA["restricted-auth-permission"] = True - if "cluster-name" not in params: - params += f" --k8s-cluster-name {cluster_name}" out = self.run_exporter_script(params=params) @@ -269,16 +303,17 @@ def upload_exporter_script(self): if ocs_version <= version.VERSION_4_18: use_configmap = False script_path = generate_exporter_script(use_configmap=use_configmap) + remote_script_path = f"/tmp/{os.path.basename(script_path)}" upload_file( self.host, script_path, - script_path, + remote_script_path, self.user, self.password, self.ssh_key, ssh_connection=self.rhcs_conn if self.jump_host else None, ) - return script_path + return remote_script_path def upload_rgw_cert_ca(self): """ @@ -1021,6 +1056,148 @@ def cleanup_zone_crush_rules(self, rule_names: list[str]) -> None: logger.info("Cleanup of CRUSH rules completed") + def create_topology_pools( + self, + pool_names: list[str], + pool_size: int = 3, + pg_num: int = 32, + ) -> list[str]: + """ + Create replicated RBD pools for topology-aware provisioning. + + Unlike create_replica_one_pools(), this creates standard replicated pools + (size >= 2) without per-zone CRUSH rules — the default replicated_rule + distributes replicas across hosts automatically. + + For each pool executes: + - ceph osd pool create replicated + - ceph osd pool set size + - ceph osd pool application enable rbd + + Args: + pool_names (list[str]): List of pool names to create. + pool_size (int): Replication factor (default 3). + pg_num (int): Placement groups per pool (default 32). + + Returns: + list[str]: List of created pool names. + + Raises: + ExternalClusterPoolCreationFailed: If pool creation fails. + + """ + if not pool_names: + raise ValueError("pool_names cannot be empty") + + _, out, _ = self.exec_external_ceph_cmd( + cmd="ceph osd pool ls", + error_msg="Failed to list existing pools", + exception_class=ExternalClusterPoolCreationFailed, + ) + existing_pools = out.strip().split("\n") if out.strip() else [] + + created_pools = [] + for pool_name in pool_names: + if pool_name in existing_pools: + logger.info(f"Pool {pool_name} already exists, skipping creation") + created_pools.append(pool_name) + continue + + logger.info( + f"Creating topology pool: {pool_name} (size={pool_size}, pg_num={pg_num})" + ) + + self.exec_external_ceph_cmd( + cmd=f"ceph osd pool create {pool_name} {pg_num} {pg_num} replicated", + error_msg=f"Failed to create pool {pool_name}", + exception_class=ExternalClusterPoolCreationFailed, + ) + + self.exec_external_ceph_cmd( + cmd=f"ceph osd pool set {pool_name} size {pool_size}", + error_msg=f"Failed to set size {pool_size} for pool {pool_name}", + exception_class=ExternalClusterPoolCreationFailed, + ) + + self.exec_external_ceph_cmd( + cmd=f"ceph osd pool application enable {pool_name} rbd", + error_msg=f"Failed to enable rbd for pool {pool_name}", + exception_class=ExternalClusterPoolCreationFailed, + ) + + logger.info(f"Created topology pool: {pool_name}") + created_pools.append(pool_name) + + return created_pools + + +def save_external_cluster_secret(): + """ + Save the current external cluster secret data for later restoration. + + Returns: + str: The base64-encoded external_cluster_details value. + + """ + ns = config.ENV_DATA["cluster_namespace"] + secret_ocp = OCP(kind="Secret", namespace=ns) + secret_data = secret_ocp.get(resource_name="rook-ceph-external-cluster-details") + return secret_data["data"]["external_cluster_details"] + + +def patch_external_cluster_secret(exporter_json_output): + """ + Patch the rook-ceph-external-cluster-details secret with new exporter output. + + Args: + exporter_json_output (str): Raw JSON output from the exporter script. + + """ + ns = config.ENV_DATA["cluster_namespace"] + with tempfile.NamedTemporaryFile( + mode="w", prefix="external-cluster-details-", suffix=".json", delete=False + ) as fd: + fd.write(exporter_json_output) + tmp_path = fd.name + + try: + cmd = ( + f"oc set data secret/rook-ceph-external-cluster-details -n {ns} " + f"--from-file=external_cluster_details={tmp_path}" + ) + exec_cmd(cmd) + logger.info("Patched rook-ceph-external-cluster-details secret") + finally: + os.unlink(tmp_path) + + +def restore_external_cluster_secret(original_b64_value): + """ + Restore the external cluster secret to its original value. + + Args: + original_b64_value (str): The original base64-encoded value + from save_external_cluster_secret(). + + """ + ns = config.ENV_DATA["cluster_namespace"] + secret_ocp = OCP(kind="Secret", namespace=ns) + params = json.dumps( + [ + { + "op": "replace", + "path": "/data/external_cluster_details", + "value": original_b64_value, + } + ] + ) + secret_ocp.patch( + resource_name="rook-ceph-external-cluster-details", + params=params, + format_type="json", + ) + logger.info("Restored rook-ceph-external-cluster-details secret") + def get_exporter_script_from_configmap(): """ diff --git a/ocs_ci/ocs/constants.py b/ocs_ci/ocs/constants.py index aff7a213613f..5ed94975fbd8 100644 --- a/ocs_ci/ocs/constants.py +++ b/ocs_ci/ocs/constants.py @@ -271,6 +271,9 @@ EXTERNAL_RGW_SC_NAME = "ocs-external-storagecluster-ceph-rgw" CEPH_CLUSTER_NAME = "ocs-storagecluster-cephcluster" REPLICA1_STORAGECLASS = "ocs-storagecluster-ceph-non-resilient-rbd" +DEFAULT_EXTERNAL_MODE_STORAGECLASS_NON_RESILIENT_RBD = ( + "ocs-external-storagecluster-ceph-non-resilient-rbd" +) ENDPOINTS = "Endpoints" WEBHOOK = "ValidatingWebhookConfiguration" ROOK_CEPH_WEBHOOK = "rook-ceph-webhook" diff --git a/tests/functional/external_mode/test_topology_awareness.py b/tests/functional/external_mode/test_topology_awareness.py new file mode 100644 index 000000000000..8875cdc5216e --- /dev/null +++ b/tests/functional/external_mode/test_topology_awareness.py @@ -0,0 +1,602 @@ +""" +Test topology-aware provisioning in ODF external mode (GA). + +Validates that PVs are created in the correct Ceph pool based on the node's +failure domain (hostname or zone) when using the non-resilient StorageClass +with topologyConstrainedPools. + +Jira: RHSTOR-5525 +""" + +import json +import logging +import time + +import pytest + +from ocs_ci.framework import config +from ocs_ci.framework.pytest_customization.marks import external_mode_required +from ocs_ci.framework.testlib import ( + ManageTest, + brown_squad, + ignore_leftovers, + polarion_id, + tier2, +) +from ocs_ci.helpers.helpers import ( + create_pvc, + create_pod, + create_resource, + is_volume_present_in_backend, + verify_volume_deleted_in_backend, + wait_for_resource_state, +) +from ocs_ci.deployment.helpers.external_cluster_helpers import ( + get_external_cluster_instance, + patch_external_cluster_secret, + restore_external_cluster_secret, + save_external_cluster_secret, +) +from ocs_ci.ocs import constants +from ocs_ci.ocs.exceptions import CommandFailed +from ocs_ci.ocs.node import get_worker_nodes, get_node_objs +from ocs_ci.ocs.ocp import OCP +from ocs_ci.ocs.resources.pod import ( + delete_pods, + get_ocs_operator_pod, + get_operator_pods, + get_pod_node, +) +from ocs_ci.utility.utils import TimeoutSampler + +log = logging.getLogger(__name__) + + +def _get_worker_hostnames(): + """ + Get hostnames for all worker nodes. + + Returns: + list[str]: Worker node hostnames (kubernetes.io/hostname label values). + + """ + worker_names = get_worker_nodes() + workers = get_node_objs(worker_names) + return [w.data["metadata"]["labels"][constants.HOSTNAME_LABEL] for w in workers] + + +def _build_topology_config(): + """ + Build topology configuration from config overrides or auto-detection. + + Returns: + dict: Keys: pool_names, failure_domain_label, failure_domain_values, + pool_size, pg_num. + + """ + topo_cfg = config.EXTERNAL_MODE.get("topology", {}) + hostnames = _get_worker_hostnames() + + if len(hostnames) < 3: + pytest.skip( + f"Need at least 3 worker nodes for topology test, found {len(hostnames)}" + ) + + pool_names = topo_cfg.get( + "pool_names", + [f"topology-pool-{i + 1}" for i in range(3)], + ) + fd_label = topo_cfg.get("failure_domain_label", "host") + fd_values = topo_cfg.get("failure_domain_values", hostnames[:3]) + pool_size = topo_cfg.get("pool_size", 3) + pg_num = topo_cfg.get("pg_num", 32) + + assert len(pool_names) == len(fd_values), ( + f"Topology config mismatch: pool_names={len(pool_names)} " + f"!= failure_domain_values={len(fd_values)}" + ) + + return { + "pool_names": pool_names, + "failure_domain_label": fd_label, + "failure_domain_values": fd_values, + "pool_size": pool_size, + "pg_num": pg_num, + } + + +def _build_pool_to_node_map(pool_names, fd_values): + """ + Build a mapping from pool name to failure domain value (hostname/zone). + + Args: + pool_names (list[str]): Pool names. + fd_values (list[str]): Failure domain values (same order as pools). + + Returns: + dict[str, str]: {pool_name: failure_domain_value} + + """ + return dict(zip(pool_names, fd_values)) + + +def _get_pv_pool(pvc_obj): + """ + Get the Ceph pool name from a bound PVC's backing PV. + + Args: + pvc_obj: PVC object (must be Bound). + + Returns: + str: Pool name from PV's CSI volumeAttributes. + + """ + pv_data = pvc_obj.backed_pv_obj.get() + return pv_data["spec"]["csi"]["volumeAttributes"]["pool"] + + +def _restart_operators_and_wait(): + """ + Restart OCS and Rook-Ceph operators, then wait for StorageCluster Ready. + + """ + ns = config.ENV_DATA["cluster_namespace"] + + log.info("Restarting OCS operator") + ocs_pod = get_ocs_operator_pod(namespace=ns) + delete_pods([ocs_pod]) + + log.info("Restarting Rook-Ceph operator") + rook_pods = get_operator_pods(namespace=ns) + delete_pods(rook_pods) + + log.info("Waiting for StorageCluster to reach Ready state") + sc_ocp = OCP( + kind="StorageCluster", + namespace=ns, + ) + for sample in TimeoutSampler( + timeout=300, + sleep=15, + func=sc_ocp.get, + ): + items = sample.get("items", []) + if items and items[0].get("status", {}).get("phase") == constants.STATUS_READY: + log.info("StorageCluster is Ready") + break + + +def _has_rgw_endpoint(): + """ + Check if the external cluster has RGW deployed. + + Returns: + bool: True if any node has the 'rgw' role. + + """ + node_roles = config.EXTERNAL_MODE.get("external_cluster_node_roles", {}) + return any("rgw" in node.get("role", []) for node in node_roles.values()) + + +def _build_exporter_topology_params(topo_config, ext_cluster): + """ + Build the exporter script parameters string for topology-aware setup. + + Starts with the full base params (CephFS, auth, and optionally RGW) to + ensure the exporter output is complete, then appends topology-specific flags. + + Args: + topo_config (dict): Topology configuration from _build_topology_config(). + ext_cluster (ExternalCluster): ExternalCluster instance. + + Returns: + str: Parameter string for ExternalCluster.run_exporter_script(). + + """ + include_rgw = _has_rgw_endpoint() + params = ext_cluster.build_exporter_base_params(include_rgw=include_rgw) + + pools_csv = ",".join(topo_config["pool_names"]) + fd_values_csv = ",".join(topo_config["failure_domain_values"]) + fd_label = topo_config["failure_domain_label"] + + params += ( + f" --topology-pools {pools_csv}" + f" --topology-failure-domain-label {fd_label}" + f" --topology-failure-domain-values {fd_values_csv}" + ) + return params + + +@brown_squad +@tier2 +@ignore_leftovers +@external_mode_required +class TestTopologyAwarenessExternal(ManageTest): + """ + Test topology-aware provisioning in external mode (GA). + + Validates that PVs are created in the correct Ceph pool based on + the node's failure domain when using the non-resilient StorageClass. + """ + + @pytest.fixture(autouse=True, scope="class") + def topology_setup(self, request): + """ + Set up topology-aware provisioning on the external cluster. + + Steps: + 1. Save original external cluster secret + 2. Auto-detect or load topology config + 3. Create topology pools on external Ceph + 4. Run exporter with topology flags + 5. Patch secret + restart operators + 6. Wait for topology SC to be auto-created + + Cleanup (addfinalizer): + 1. Restore original secret + 2. Restart operators + 3. Delete topology pools + """ + topo_config = _build_topology_config() + pool_names = topo_config["pool_names"] + fd_values = topo_config["failure_domain_values"] + + log.info( + f"Topology config: pools={pool_names}, " + f"fd_label={topo_config['failure_domain_label']}, " + f"fd_values={fd_values}" + ) + + # Save original secret for cleanup + original_secret = save_external_cluster_secret() + + # Register finalizer BEFORE any mutating operations so cleanup + # runs even if setup fails partway through (e.g., SSH timeout + # during pool creation leaves orphaned pools on external Ceph). + def finalizer(): + log.info("Topology test cleanup: restoring original configuration") + + restore_external_cluster_secret(original_secret) + _restart_operators_and_wait() + + sc_name = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_NON_RESILIENT_RBD + sc_ocp = OCP(kind=constants.STORAGECLASS) + try: + sc_ocp.delete(resource_name=sc_name) + log.info(f"Deleted topology StorageClass {sc_name}") + except CommandFailed: + log.info(f"StorageClass {sc_name} not found, skipping deletion") + + ext_cluster_cleanup = get_external_cluster_instance() + ext_cluster_cleanup.cleanup_replica_one_pools(pool_names) + log.info("Topology cleanup completed") + + request.addfinalizer(finalizer) + + # Store config on the class for test methods + request.cls.topo_config = topo_config + request.cls.pool_names = pool_names + request.cls.pool_to_node = _build_pool_to_node_map(pool_names, fd_values) + request.cls.node_to_pool = {v: k for k, v in request.cls.pool_to_node.items()} + + # Create topology pools on external Ceph + ext_cluster = get_external_cluster_instance() + created_pools = ext_cluster.create_topology_pools( + pool_names=pool_names, + pool_size=topo_config["pool_size"], + pg_num=topo_config["pg_num"], + ) + log.info(f"Created topology pools: {created_pools}") + + # Run exporter with topology flags (includes full base params) + topology_params = _build_exporter_topology_params(topo_config, ext_cluster) + log.info(f"Running exporter with topology params: {topology_params}") + exporter_output = ext_cluster.run_exporter_script(params=topology_params) + log.info("Exporter script completed successfully") + + # Patch secret and restart operators + patch_external_cluster_secret(exporter_output) + _restart_operators_and_wait() + + # Wait for topology SC to be auto-created by the operator. + # StorageCluster reaches Ready before the SC is reconciled. + sc_name = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_NON_RESILIENT_RBD + sc_ocp = OCP(kind=constants.STORAGECLASS) + log.info(f"Waiting for StorageClass {sc_name} to be created") + sc_created = False + for sample in TimeoutSampler( + timeout=120, + sleep=10, + func=sc_ocp.get, + resource_name=sc_name, + dont_raise=True, + ): + if sample: + log.info(f"StorageClass {sc_name} created by operator") + sc_created = True + break + assert sc_created, f"StorageClass {sc_name} was not created within timeout" + + @polarion_id("OCS-7930") + def test_topology_sc_auto_created(self): + """ + Verify that ODF auto-creates the non-resilient StorageClass with + correct topology parameters after the external cluster secret is + updated with topology configuration. + """ + sc_name = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_NON_RESILIENT_RBD + sc_ocp = OCP(kind=constants.STORAGECLASS) + sc_data = sc_ocp.get(resource_name=sc_name) + + # volumeBindingMode must be WaitForFirstConsumer + assert ( + sc_data["volumeBindingMode"] == "WaitForFirstConsumer" + ), f"Expected WaitForFirstConsumer, got {sc_data['volumeBindingMode']}" + + # topologyConstrainedPools must list all pools + params = sc_data.get("parameters", {}) + topo_pools_raw = params.get("topologyConstrainedPools") + assert ( + topo_pools_raw + ), f"topologyConstrainedPools not found in SC {sc_name} parameters" + topo_pools = json.loads(topo_pools_raw) + + pool_names_in_sc = [p["poolName"] for p in topo_pools] + for pool in self.pool_names: + assert ( + pool in pool_names_in_sc + ), f"Pool {pool} not found in topologyConstrainedPools: {pool_names_in_sc}" + + # topologyFailureDomainLabel should match config + expected_fd_label = self.topo_config["failure_domain_label"] + assert params.get("topologyFailureDomainLabel") == expected_fd_label, ( + f"Expected topologyFailureDomainLabel={expected_fd_label}, " + f"got {params.get('topologyFailureDomainLabel')}" + ) + log.info(f"StorageClass {sc_name} validated successfully") + + @polarion_id("OCS-7931") + def test_pvc_pending_without_pod(self, project_factory): + """ + Verify that PVC with topology SC stays Pending when no pod + consumes it (WaitForFirstConsumer behavior). + """ + ns = project_factory().namespace + pvc = create_pvc( + namespace=ns, + sc_name=constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_NON_RESILIENT_RBD, + size="1Gi", + access_mode=constants.ACCESS_MODE_RWO, + ) + + # PVC should stay Pending (WaitForFirstConsumer) + time.sleep(10) + pvc.reload() + assert ( + pvc.status == constants.STATUS_PENDING + ), f"Expected PVC to be Pending, got {pvc.status}" + log.info("PVC correctly stays Pending without a consuming pod") + + @polarion_id("OCS-7932") + def test_single_pod_topology_placement(self, project_factory): + """ + Verify that a single pod's PV is created in the correct topology + pool based on the scheduled node's failure domain. + """ + ns = project_factory().namespace + pvc = create_pvc( + namespace=ns, + sc_name=constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_NON_RESILIENT_RBD, + size="1Gi", + access_mode=constants.ACCESS_MODE_RWO, + ) + + pod_obj = create_pod( + interface_type=constants.CEPHBLOCKPOOL, + pvc_name=pvc.name, + namespace=ns, + ) + wait_for_resource_state(pod_obj, constants.STATUS_RUNNING, timeout=300) + pod_obj.reload() + + # Get the node where the pod is running + node_obj = get_pod_node(pod_obj) + node_hostname = node_obj.data["metadata"]["labels"][constants.HOSTNAME_LABEL] + log.info(f"Pod scheduled on node: {node_hostname}") + + # Get pool from PV + pvc.reload() + wait_for_resource_state(pvc, constants.STATUS_BOUND, timeout=60) + pv_pool = _get_pv_pool(pvc) + log.info(f"PV created in pool: {pv_pool}") + + # Verify pool matches the node's expected mapping + expected_pool = self.node_to_pool.get(node_hostname) + assert ( + expected_pool is not None + ), f"Node {node_hostname} not found in topology mapping: {self.node_to_pool}" + assert pv_pool == expected_pool, ( + f"PV pool {pv_pool} does not match expected pool {expected_pool} " + f"for node {node_hostname}" + ) + log.info(f"Topology placement verified: node={node_hostname} -> pool={pv_pool}") + + # Write data and verify RBD image exists in the correct pool + pod_obj.exec_cmd_on_pod( + command="dd if=/dev/urandom of=/var/lib/www/html/testfile bs=1M count=50" + ) + image_uuid = pvc.image_uuid + assert is_volume_present_in_backend( + interface=constants.CEPHBLOCKPOOL, + image_uuid=image_uuid, + pool_name=pv_pool, + ), f"RBD image {image_uuid} not found in pool {pv_pool}" + log.info(f"Data verified in pool {pv_pool}, image uuid {image_uuid}") + + @polarion_id("OCS-7933") + def test_statefulset_spreads_across_pools(self, project_factory): + """ + Verify that a 3-replica StatefulSet with topologySpreadConstraints + places each PV in the correct pool based on each pod's node. + """ + proj = project_factory() + ns = proj.namespace + sc_name = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_NON_RESILIENT_RBD + replica_count = len(self.pool_names) + + sts_name = "topo-sts" + sts_yaml = { + "apiVersion": "apps/v1", + "kind": "StatefulSet", + "metadata": {"name": sts_name, "namespace": ns}, + "spec": { + "serviceName": sts_name, + "replicas": replica_count, + "selector": {"matchLabels": {"app": sts_name}}, + "template": { + "metadata": {"labels": {"app": sts_name}}, + "spec": { + "topologySpreadConstraints": [ + { + "maxSkew": 1, + "topologyKey": constants.HOSTNAME_LABEL, + "whenUnsatisfiable": "DoNotSchedule", + "labelSelector": {"matchLabels": {"app": sts_name}}, + } + ], + "containers": [ + { + "name": "test", + "image": "busybox", + "command": ["sh", "-c", "sleep 3600"], + "volumeMounts": [ + {"name": "data", "mountPath": "/data"} + ], + } + ], + }, + }, + "volumeClaimTemplates": [ + { + "metadata": {"name": "data"}, + "spec": { + "accessModes": [constants.ACCESS_MODE_RWO], + "storageClassName": sc_name, + "resources": {"requests": {"storage": "1Gi"}}, + }, + } + ], + }, + } + + create_resource(**sts_yaml) + log.info(f"Created StatefulSet {sts_name} with {replica_count} replicas") + + # Wait for all pods to be Running + pod_ocp = OCP(kind=constants.POD, namespace=ns) + for i in range(replica_count): + pod_name = f"{sts_name}-{i}" + log.info(f"Waiting for pod {pod_name}") + pod_ocp.wait_for_resource( + condition=constants.STATUS_RUNNING, + resource_name=pod_name, + timeout=300, + ) + + # Verify each pod's PV is in the correct pool + pools_used = set() + pvc_ocp = OCP(kind=constants.PVC, namespace=ns) + for i in range(replica_count): + pod_name = f"{sts_name}-{i}" + pvc_name = f"data-{sts_name}-{i}" + + # Get pod's node + pod_data = pod_ocp.get(resource_name=pod_name) + node_name = pod_data["spec"]["nodeName"] + node_objs = get_node_objs([node_name]) + node_hostname = node_objs[0].data["metadata"]["labels"][ + constants.HOSTNAME_LABEL + ] + + # Get PVC's pool + pvc_data = pvc_ocp.get(resource_name=pvc_name) + pv_name = pvc_data["spec"]["volumeName"] + pv_ocp = OCP(kind=constants.PV) + pv_data = pv_ocp.get(resource_name=pv_name) + pv_pool = pv_data["spec"]["csi"]["volumeAttributes"]["pool"] + + expected_pool = self.node_to_pool.get(node_hostname) + assert expected_pool is not None, ( + f"Node {node_hostname} not found in topology mapping: " + f"{self.node_to_pool}" + ) + assert pv_pool == expected_pool, ( + f"Pod {pod_name} on node {node_hostname}: " + f"PV pool {pv_pool} != expected {expected_pool}" + ) + pools_used.add(pv_pool) + log.info(f"Pod {pod_name}: node={node_hostname}, pool={pv_pool} - CORRECT") + + assert len(pools_used) == replica_count, ( + f"Expected {replica_count} different pools, " + f"got {len(pools_used)}: {pools_used}" + ) + log.info(f"All {replica_count} topology pools used: {pools_used}") + + @polarion_id("OCS-7934") + def test_pvc_deletion_cleans_rbd_image(self, project_factory): + """ + Verify that deleting a PVC removes the RBD image from the Ceph pool + (reclaimPolicy: Delete). + """ + ns = project_factory().namespace + pvc = create_pvc( + namespace=ns, + sc_name=constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_NON_RESILIENT_RBD, + size="1Gi", + access_mode=constants.ACCESS_MODE_RWO, + ) + + pod_obj = create_pod( + interface_type=constants.CEPHBLOCKPOOL, + pvc_name=pvc.name, + namespace=ns, + ) + wait_for_resource_state(pod_obj, constants.STATUS_RUNNING, timeout=300) + pod_obj.reload() + + # Write some data + pod_obj.exec_cmd_on_pod( + command="dd if=/dev/urandom of=/var/lib/www/html/testfile bs=1M count=10" + ) + + # Record pool and image uuid + pvc.reload() + pv_pool = _get_pv_pool(pvc) + image_uuid = pvc.image_uuid + log.info(f"RBD image uuid {image_uuid} in pool {pv_pool}") + + # Verify image exists before deletion + assert is_volume_present_in_backend( + interface=constants.CEPHBLOCKPOOL, + image_uuid=image_uuid, + pool_name=pv_pool, + ) + + # Delete pod and PVC + pod_obj.delete() + pod_obj.ocp.wait_for_delete(resource_name=pod_obj.name, timeout=120) + pvc.delete() + pvc.ocp.wait_for_delete(resource_name=pvc.name, timeout=120) + + # Verify RBD image is removed (reclaimPolicy: Delete) + assert verify_volume_deleted_in_backend( + interface=constants.CEPHBLOCKPOOL, + image_uuid=image_uuid, + pool_name=pv_pool, + timeout=120, + ), f"RBD image {image_uuid} was not deleted from pool {pv_pool}" + log.info( + f"RBD image {image_uuid} removed from pool {pv_pool} after PVC deletion" + )