From f871f0aaf63cb16f551bd4964bcfa10db05b90db Mon Sep 17 00:00:00 2001 From: Amrita Mahapatra <49347640+amr1ta@users.noreply.github.com> Date: Tue, 17 Mar 2026 17:00:40 +0530 Subject: [PATCH 1/3] Fix mon pod selection on LSO deployments in test_mon_data_avail_warn The LSO code path used ceph_daemon_id label to find the mon pod, but this label is shared between mon and mgr pods (both have ceph_daemon_id=a). When the mgr pod was returned first by the API, the test selected it instead of the mon pod, causing mon_suffix=None and mkdir failure. Replace the LSO-specific pod lookup with get_mon_pods() which uses the unambiguous app=rook-ceph-mon label. The worker node for the LSO dd path is now derived from the selected mon pod's spec.nodeName. Co-Authored-By: Claude Opus 4.6 Signed-off-by: Amrita Mahapatra <49347640+amr1ta@users.noreply.github.com> --- .../z_cluster/test_mon_data_avail_warn.py | 25 ++++--------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/tests/functional/z_cluster/test_mon_data_avail_warn.py b/tests/functional/z_cluster/test_mon_data_avail_warn.py index ae3312a571b9..5fba486a1eec 100644 --- a/tests/functional/z_cluster/test_mon_data_avail_warn.py +++ b/tests/functional/z_cluster/test_mon_data_avail_warn.py @@ -16,7 +16,6 @@ skipif_ocs_version, skipif_external_mode, ) -from ocs_ci.ocs import node from ocs_ci.ocs.resources import pod from ocs_ci.ocs.cluster import CephCluster from ocs_ci.utility import utils @@ -55,29 +54,15 @@ def workloads_dir_setup(self, request): Setting up the environment for the test """ - if config.DEPLOYMENT.get("local_storage"): - self.worker_node = node.get_worker_nodes()[0] - self.oc_cmd = OCP(namespace=config.ENV_DATA["cluster_namespace"]) - mon_pod_name = self.oc_cmd.exec_oc_debug_cmd( - node=self.worker_node, - cmd_list=["ls /var/lib/rook/ | grep mon"], - ) - mon_pod_id = mon_pod_name.split("-")[1].replace("\n", "") - - mon_pods_info = pod.get_pods_having_label( - label=f"ceph_daemon_id={mon_pod_id}", - namespace=config.ENV_DATA["cluster_namespace"], - ) - self.mon_pod = pod.get_pod_obj( - name=mon_pods_info[0]["metadata"]["name"], - namespace=config.ENV_DATA["cluster_namespace"], - ) - else: - self.mon_pod = random.choice(pod.get_mon_pods()) + self.mon_pod = random.choice(pod.get_mon_pods()) self.mon_suffix = self.mon_pod.get().get("metadata").get("labels").get("mon") self.workloads_dir = f"/var/lib/ceph/mon/ceph-{self.mon_suffix}/workloads" log.info(f"Selected mon '{self.mon_pod.name}'") + + if config.DEPLOYMENT.get("local_storage"): + self.worker_node = self.mon_pod.get()["spec"]["nodeName"] + self.oc_cmd = OCP(namespace=config.ENV_DATA["cluster_namespace"]) self.mon_pod.exec_cmd_on_pod(f"mkdir {self.workloads_dir}") self.mon_pod.exec_cmd_on_pod(f"touch {self.workloads_dir}/{TEMP_FILE}") From a1bfed445a168e816e116d04777be3bdfda49c7a Mon Sep 17 00:00:00 2001 From: Amrita Mahapatra <49347640+amr1ta@users.noreply.github.com> Date: Tue, 17 Mar 2026 19:34:02 +0530 Subject: [PATCH 2/3] Fix get_used_percentage for LSO to check actual mon data path On LSO, df -Th | grep /etc/hosts returned empty because df shows mount points not file paths, causing IndexError. Use df -Th with the actual mon data path (/var/lib/rook/mon-{suffix}/data) via oc debug on the worker node. Also switch non-LSO to use df -Th {path} directly instead of grep for reliability. Co-Authored-By: Claude Opus 4.6 Signed-off-by: Amrita Mahapatra <49347640+amr1ta@users.noreply.github.com> --- .../z_cluster/test_mon_data_avail_warn.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tests/functional/z_cluster/test_mon_data_avail_warn.py b/tests/functional/z_cluster/test_mon_data_avail_warn.py index 5fba486a1eec..db4ae00f803a 100644 --- a/tests/functional/z_cluster/test_mon_data_avail_warn.py +++ b/tests/functional/z_cluster/test_mon_data_avail_warn.py @@ -81,11 +81,18 @@ def get_used_percentage(self): int: Used space percentage """ - path = f"/var/lib/ceph/mon/ceph-{self.mon_suffix}" if config.DEPLOYMENT.get("local_storage"): - path = "/etc/hosts" - cmd = f"df -Th | grep {path}" - mount_details = self.mon_pod.exec_sh_cmd_on_pod(command=cmd, sh="sh") + path = f"/var/lib/rook/mon-{self.mon_suffix}/data" + cmd = f"df -Th {path}" + result = self.oc_cmd.exec_oc_debug_cmd( + node=self.worker_node, + cmd_list=[cmd], + ) + mount_details = result.strip().splitlines()[-1] + else: + path = f"/var/lib/ceph/mon/ceph-{self.mon_suffix}" + cmd = f"df -Th {path}" + mount_details = self.mon_pod.exec_sh_cmd_on_pod(command=cmd, sh="sh") used_percent = mount_details.split()[5].replace("%", "") return int(used_percent) From 29be7c3c59b71e8d7f0125996b95783c279d8d42 Mon Sep 17 00:00:00 2001 From: Amrita Mahapatra <49347640+amr1ta@users.noreply.github.com> Date: Fri, 20 Mar 2026 10:52:27 +0530 Subject: [PATCH 3/3] Increase dd command timeout to 600s for LSO oc debug Writing 1GB of random data via oc debug can exceed the default 300s timeout, especially on later iterations when the disk is nearly full. Co-Authored-By: Claude Opus 4.6 Signed-off-by: Amrita Mahapatra <49347640+amr1ta@users.noreply.github.com> --- tests/functional/z_cluster/test_mon_data_avail_warn.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/functional/z_cluster/test_mon_data_avail_warn.py b/tests/functional/z_cluster/test_mon_data_avail_warn.py index db4ae00f803a..3758ee7cde56 100644 --- a/tests/functional/z_cluster/test_mon_data_avail_warn.py +++ b/tests/functional/z_cluster/test_mon_data_avail_warn.py @@ -113,6 +113,7 @@ def exec_dd_cmd(self): self.oc_cmd.exec_oc_debug_cmd( node=self.worker_node, cmd_list=[write_cmd], + timeout=600, ) else: self.mon_pod.exec_sh_cmd_on_pod(command=write_cmd, sh="sh")