From 97767f4312b2b88f2e03613d9fdb8d077503ee3b Mon Sep 17 00:00:00 2001 From: Amrita Mahapatra <49347640+amr1ta@users.noreply.github.com> Date: Tue, 12 May 2026 10:46:07 +0530 Subject: [PATCH 1/4] Fix NFS mount retry logic in out-cluster NFS export tests The retry() calls wrapping self.con.exec_cmd() were broken in two ways: 1. exec_cmd was called immediately and its tuple result passed to retry() instead of passing a callable, so no retry ever occurred 2. Connection.exec_cmd() returns (retcode, stdout, stderr) and never raises CommandFailed, so the retry exception type never triggered Add _mount_nfs_with_retry() helper that wraps exec_cmd in a nested function which raises CommandFailed on non-zero retcode, and calls it via retry properly. Replace all 5 broken call sites. Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Amrita Mahapatra <49347640+amr1ta@users.noreply.github.com> --- ...est_nfs_feature_enable_for_ODF_clusters.py | 52 ++++++++++--------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/tests/functional/nfs_feature/test_nfs_feature_enable_for_ODF_clusters.py b/tests/functional/nfs_feature/test_nfs_feature_enable_for_ODF_clusters.py index 060982f94735..3c66414df1cf 100644 --- a/tests/functional/nfs_feature/test_nfs_feature_enable_for_ODF_clusters.py +++ b/tests/functional/nfs_feature/test_nfs_feature_enable_for_ODF_clusters.py @@ -359,6 +359,28 @@ def __make_connection(): nfs_utils.update_etc_hosts_on_nfs_client(con, hostname_add) return con + def _mount_nfs_with_retry(self, cmd, tries=28, delay=10): + """ + Execute an NFS mount command on the client VM with retry. + + Args: + cmd (str): Mount command to execute on the NFS client VM + tries (int): Number of retry attempts (default: 28) + delay (int): Delay in seconds between retries (default: 10) + + Raises: + CommandFailed: If mount does not succeed within the retry limit + """ + + def _do_mount(): + retcode, _, stderr = self.con.exec_cmd(cmd) + if retcode != 0: + raise CommandFailed( + f"NFS mount command failed with retcode " f"{retcode}: {stderr}" + ) + + retry((CommandFailed), tries=tries, delay=delay)(_do_mount)() + @tier1 @polarion_id("OCS-4269") @skipif_hci_client @@ -569,11 +591,7 @@ def test_outcluster_nfs_export( + self.test_folder ) - retry( - (CommandFailed), - tries=28, - delay=10, - )(self.con.exec_cmd(export_nfs_external_cmd)) + self._mount_nfs_with_retry(export_nfs_external_cmd) # Verify able to read exported volume command = f"cat {self.test_folder}/index.html" @@ -741,11 +759,7 @@ def test_multiple_nfs_based_PVs( + " " + self.test_folder ) - retry( - (CommandFailed), - tries=28, - delay=10, - )(self.con.exec_cmd(export_nfs_external_cmd)) + self._mount_nfs_with_retry(export_nfs_external_cmd) # Verify able to access exported volume command = f"cat {self.test_folder}/index.html" @@ -874,11 +888,7 @@ def test_multiple_mounts_of_same_nfs_volume( + " " + self.test_folder ) - retry( - (CommandFailed), - tries=28, - delay=10, - )(self.con.exec_cmd(export_nfs_external_cmd)) + self._mount_nfs_with_retry(export_nfs_external_cmd) # Verify able to access exported volume command = f"cat {self.test_folder}/shared_file.html" @@ -1007,11 +1017,7 @@ def test_external_nfs_client_can_write_read_new_file( + " " + self.test_folder ) - retry( - (CommandFailed), - tries=28, - delay=10, - )(self.con.exec_cmd(export_nfs_external_cmd)) + self._mount_nfs_with_retry(export_nfs_external_cmd) # Verify able to write new file in exported volume by external client command = ( @@ -1772,11 +1778,7 @@ def test_incluster_outcluster_nfs_export_for_non_default_nfs_sc( + self.test_folder ) - retry( - (CommandFailed), - tries=28, - delay=10, - )(self.con.exec_cmd(export_nfs_external_cmd)) + self._mount_nfs_with_retry(export_nfs_external_cmd) # Verify able to read exported volume command = f"cat {self.test_folder}/index.html" From 4d00c2f228aafefc87b06baecba156251f33280d Mon Sep 17 00:00:00 2001 From: Amrita Mahapatra <49347640+amr1ta@users.noreply.github.com> Date: Tue, 12 May 2026 11:13:09 +0530 Subject: [PATCH 2/4] Open port 2049 on IBM Cloud VPC security group for NFS LB IBM Cloud VPC LoadBalancer security groups block inbound traffic by default. The NFS LB on port 2049 needs an explicit inbound rule, same as the ingress LB needs rules for ports 80/443 (added in #15012). Add configure_nfs_lb_security_group() that finds the VPC LB backing the rook-ceph-nfs-my-nfs-load-balancer Service and adds an inbound TCP 2049 rule to its security groups. Call it automatically from create_nfs_load_balancer_service() on IBM Cloud. Add remove_nfs_lb_security_group_rules() to clean up the rule during teardown, called from delete_nfs_load_balancer_service() before the Service is deleted (so the VPC LB is still present for lookup). Co-Authored-By: Claude Signed-off-by: Amrita Mahapatra <49347640+amr1ta@users.noreply.github.com> --- ocs_ci/utility/ibmcloud.py | 152 +++++++++++++++++++++++++++++++++++- ocs_ci/utility/nfs_utils.py | 25 +++++- 2 files changed, 172 insertions(+), 5 deletions(-) diff --git a/ocs_ci/utility/ibmcloud.py b/ocs_ci/utility/ibmcloud.py index 7c17b86e6edc..85574a7cf37b 100644 --- a/ocs_ci/utility/ibmcloud.py +++ b/ocs_ci/utility/ibmcloud.py @@ -36,7 +36,6 @@ from ocs_ci.utility.utils import get_infra_id, get_ocp_version, run_cmd, TimeoutSampler from ocs_ci.ocs.node import get_nodes - logger = logging.getLogger(__name__) @@ -1477,6 +1476,157 @@ def configure_ingress_load_balancer_security_group(): raise +def _get_lb_security_groups(svc_name, namespace): + """ + Look up the IBM Cloud VPC load balancer backing a Kubernetes + LoadBalancer Service and return its security groups. + + Args: + svc_name (str): Kubernetes Service name + namespace (str): Kubernetes namespace + + Returns: + list: security group dicts from the VPC LB, empty list on + failure + + """ + rg_name = get_resource_group_name(config.ENV_DATA["cluster_path"]) + + svc_ocp = OCP( + kind="Service", + namespace=namespace, + resource_name=svc_name, + ) + svc_data = svc_ocp.get() + + lb_ingress = svc_data.get("status", {}).get("loadBalancer", {}).get("ingress", []) + if not lb_ingress: + logger.warning( + "No LB ingress on service %s, cannot configure " "security group", + svc_name, + ) + return [] + + lb_hostname = lb_ingress[0].get("hostname") or lb_ingress[0].get("ip") + if not lb_hostname: + logger.warning( + "No hostname/IP in LB ingress for service %s", + svc_name, + ) + return [] + + logger.debug("LB endpoint for %s: %s", svc_name, lb_hostname) + + cmd = f"ibmcloud is lbs --resource-group-name {rg_name} " f"--output json" + out = run_ibmcloud_cmd(cmd) + load_balancers = json.loads(out) + + matching_lb = None + for lb in load_balancers: + if lb.get("hostname") == lb_hostname: + matching_lb = lb + break + + if not matching_lb: + logger.error( + "Could not find IBM Cloud VPC LB with hostname %s", + lb_hostname, + ) + return [] + + security_groups = matching_lb.get("security_groups", []) + if not security_groups: + logger.warning( + "No security groups on LB %s", + matching_lb.get("name"), + ) + return security_groups + + +def configure_nfs_lb_security_group(): + """ + Add an inbound TCP rule for port 2049 (NFS) to the security + groups attached to the NFS LoadBalancer on IBM Cloud VPC. + + Must be called after the ``rook-ceph-nfs-my-nfs-load-balancer`` + Service has an ingress address assigned. + """ + svc_name = "rook-ceph-nfs-my-nfs-load-balancer" + namespace = constants.OPENSHIFT_STORAGE_NAMESPACE + logger.info("Configuring NFS LB security group for port 2049") + + security_groups = _get_lb_security_groups(svc_name, namespace) + for sg in security_groups: + sg_name = sg.get("name") + try: + logger.info("Adding inbound TCP 2049 to %s", sg_name) + add_security_group_rule(sg_name, "inbound", "tcp", 2049, 2049) + except Exception as e: + logger.warning( + "Failed to add port 2049 rule to %s " "(may already exist): %s", + sg_name, + e, + ) + + logger.info("NFS LB security group configuration done") + + +def remove_nfs_lb_security_group_rules(): + """ + Remove inbound TCP 2049 rules from the security groups attached + to the NFS LoadBalancer on IBM Cloud VPC. + + Should be called before deleting the NFS LoadBalancer Service so + the VPC LB is still present for look-up. + """ + svc_name = "rook-ceph-nfs-my-nfs-load-balancer" + namespace = constants.OPENSHIFT_STORAGE_NAMESPACE + logger.info("Removing NFS LB security group rules for port 2049") + + security_groups = _get_lb_security_groups(svc_name, namespace) + for sg in security_groups: + sg_id = sg.get("id") + sg_name = sg.get("name") + try: + cmd = f"ibmcloud is security-group {sg_id} " f"--output json" + out = run_ibmcloud_cmd(cmd) + sg_detail = json.loads(out) + except Exception as e: + logger.warning( + "Could not fetch rules for %s: %s", + sg_name, + e, + ) + continue + + for rule in sg_detail.get("rules", []): + if ( + rule.get("direction") == "inbound" + and rule.get("protocol") == "tcp" + and rule.get("port_min") == 2049 + and rule.get("port_max") == 2049 + ): + rule_id = rule.get("id") + logger.info( + "Deleting rule %s from %s", + rule_id, + sg_name, + ) + try: + run_ibmcloud_cmd( + f"ibmcloud is security-group-rule-delete " + f"{sg_id} {rule_id} --force" + ) + except Exception as e: + logger.warning( + "Failed to delete rule %s: %s", + rule_id, + e, + ) + + logger.info("NFS LB security group cleanup done") + + def create_address_prefix(prefix_name, vpc, zone, cidr): """ Create address prefix in VPC. diff --git a/ocs_ci/utility/nfs_utils.py b/ocs_ci/utility/nfs_utils.py index b9e056d91003..8a329856ed53 100644 --- a/ocs_ci/utility/nfs_utils.py +++ b/ocs_ci/utility/nfs_utils.py @@ -196,13 +196,22 @@ def create_nfs_load_balancer_service( if "hostname" in host_details: hostname_add = host_details["hostname"] log.info("ingress hostname, %s", hostname_add) - return hostname_add elif "ip" in host_details: - host_ip = host_details["ip"] - log.info("ingress host ip, %s", host_ip) - return host_ip + hostname_add = host_details["ip"] + log.info("ingress host ip, %s", hostname_add) else: log.error("host details unavailable") + return None + + platform = config.ENV_DATA.get("platform", "").lower() + if platform == constants.IBMCLOUD_PLATFORM: + from ocs_ci.utility.ibmcloud import ( + configure_nfs_lb_security_group, + ) + + configure_nfs_lb_security_group() + + return hostname_add def update_etc_hosts_on_nfs_client(con, hostname): @@ -297,6 +306,14 @@ def delete_nfs_load_balancer_service( ) return + platform = config.ENV_DATA.get("platform", "").lower() + if platform == constants.IBMCLOUD_PLATFORM: + from ocs_ci.utility.ibmcloud import ( + remove_nfs_lb_security_group_rules, + ) + + remove_nfs_lb_security_group_rules() + log.info("Deleting NFS LoadBalancer service %s", svc_name) storage_cluster_obj.exec_oc_cmd(f"delete service {svc_name}") From 395a1da56e02b2b5ca3991b9764bcb8d3c076979 Mon Sep 17 00:00:00 2001 From: Amrita Mahapatra <49347640+amr1ta@users.noreply.github.com> Date: Tue, 12 May 2026 17:07:56 +0530 Subject: [PATCH 3/4] Fix stale subvolume assertion in test_nfs_pvc_subvolume_deletion Assert that the specific subvolume created by the test is no longer stale after deletion, instead of asserting zero stale subvolumes cluster-wide. Pre-existing stale subvolumes from other tests caused false failures. Also log the delete output and stale lists for debuggability. Co-Authored-By: Claude Signed-off-by: Amrita Mahapatra <49347640+amr1ta@users.noreply.github.com> --- ...est_nfs_feature_enable_for_ODF_clusters.py | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/tests/functional/nfs_feature/test_nfs_feature_enable_for_ODF_clusters.py b/tests/functional/nfs_feature/test_nfs_feature_enable_for_ODF_clusters.py index 3c66414df1cf..f02fef42ee39 100644 --- a/tests/functional/nfs_feature/test_nfs_feature_enable_for_ODF_clusters.py +++ b/tests/functional/nfs_feature/test_nfs_feature_enable_for_ODF_clusters.py @@ -1618,16 +1618,24 @@ def test_nfs_pvc_subvolume_deletion( # Checking for stale volumes output = exec_cmd(cmd=f"{odf_cli_path} subvolume ls --stale") + stale_before = self.parse_subvolume_ls_output(output) + log.info("Stale subvolumes before delete: %s", stale_before) - # Deleteing stale subvolume - exec_cmd( - cmd=f"{odf_cli_path} subvolume delete {new_pvc[0]} {new_pvc[1]} {new_pvc[2]}" + # Deleting stale subvolume + delete_output = exec_cmd( + cmd=f"{odf_cli_path} subvolume delete" + f" {new_pvc[0]} {new_pvc[1]} {new_pvc[2]}" ) + log.info("Subvolume delete output: %s", delete_output.stdout) - # Checking for stale volumes + # Verify the specific subvolume was deleted output = exec_cmd(cmd=f"{odf_cli_path} subvolume ls --stale") - stale_volumes = self.parse_subvolume_ls_output(output) - assert len(stale_volumes) == 0 # No stale volumes available + stale_after = self.parse_subvolume_ls_output(output) + log.info("Stale subvolumes after delete: %s", stale_after) + stale_svs = {sv[1] for sv in stale_after} + assert ( + new_pvc[1] not in stale_svs + ), f"Subvolume {new_pvc[1]} still stale after delete" # Delete ocs-storagecluster-ceph-nfs-retain storageclass self.sc_obj.delete(resource_name=self.retain_nfs_sc_name) From 1da76f3029e9c4e2dd8af82c2a4055494c7e659e Mon Sep 17 00:00:00 2001 From: Amrita Mahapatra <49347640+amr1ta@users.noreply.github.com> Date: Tue, 12 May 2026 17:49:12 +0530 Subject: [PATCH 4/4] Convert log statements from %-style to f-string format Co-Authored-By: Claude Signed-off-by: Amrita Mahapatra <49347640+amr1ta@users.noreply.github.com> --- ocs_ci/utility/ibmcloud.py | 45 +++++-------------- ocs_ci/utility/nfs_utils.py | 2 +- ...est_nfs_feature_enable_for_ODF_clusters.py | 6 +-- 3 files changed, 15 insertions(+), 38 deletions(-) diff --git a/ocs_ci/utility/ibmcloud.py b/ocs_ci/utility/ibmcloud.py index 85574a7cf37b..acdf28dfdfb9 100644 --- a/ocs_ci/utility/ibmcloud.py +++ b/ocs_ci/utility/ibmcloud.py @@ -1502,20 +1502,16 @@ def _get_lb_security_groups(svc_name, namespace): lb_ingress = svc_data.get("status", {}).get("loadBalancer", {}).get("ingress", []) if not lb_ingress: logger.warning( - "No LB ingress on service %s, cannot configure " "security group", - svc_name, + f"No LB ingress on service {svc_name}, cannot configure " f"security group" ) return [] lb_hostname = lb_ingress[0].get("hostname") or lb_ingress[0].get("ip") if not lb_hostname: - logger.warning( - "No hostname/IP in LB ingress for service %s", - svc_name, - ) + logger.warning(f"No hostname/IP in LB ingress for service {svc_name}") return [] - logger.debug("LB endpoint for %s: %s", svc_name, lb_hostname) + logger.debug(f"LB endpoint for {svc_name}: {lb_hostname}") cmd = f"ibmcloud is lbs --resource-group-name {rg_name} " f"--output json" out = run_ibmcloud_cmd(cmd) @@ -1528,18 +1524,12 @@ def _get_lb_security_groups(svc_name, namespace): break if not matching_lb: - logger.error( - "Could not find IBM Cloud VPC LB with hostname %s", - lb_hostname, - ) + logger.error(f"Could not find IBM Cloud VPC LB with hostname {lb_hostname}") return [] security_groups = matching_lb.get("security_groups", []) if not security_groups: - logger.warning( - "No security groups on LB %s", - matching_lb.get("name"), - ) + logger.warning(f"No security groups on LB {matching_lb.get('name')}") return security_groups @@ -1559,13 +1549,12 @@ def configure_nfs_lb_security_group(): for sg in security_groups: sg_name = sg.get("name") try: - logger.info("Adding inbound TCP 2049 to %s", sg_name) + logger.info(f"Adding inbound TCP 2049 to {sg_name}") add_security_group_rule(sg_name, "inbound", "tcp", 2049, 2049) except Exception as e: logger.warning( - "Failed to add port 2049 rule to %s " "(may already exist): %s", - sg_name, - e, + f"Failed to add port 2049 rule to {sg_name} " + f"(may already exist): {e}" ) logger.info("NFS LB security group configuration done") @@ -1592,11 +1581,7 @@ def remove_nfs_lb_security_group_rules(): out = run_ibmcloud_cmd(cmd) sg_detail = json.loads(out) except Exception as e: - logger.warning( - "Could not fetch rules for %s: %s", - sg_name, - e, - ) + logger.warning(f"Could not fetch rules for {sg_name}: {e}") continue for rule in sg_detail.get("rules", []): @@ -1607,22 +1592,14 @@ def remove_nfs_lb_security_group_rules(): and rule.get("port_max") == 2049 ): rule_id = rule.get("id") - logger.info( - "Deleting rule %s from %s", - rule_id, - sg_name, - ) + logger.info(f"Deleting rule {rule_id} from {sg_name}") try: run_ibmcloud_cmd( f"ibmcloud is security-group-rule-delete " f"{sg_id} {rule_id} --force" ) except Exception as e: - logger.warning( - "Failed to delete rule %s: %s", - rule_id, - e, - ) + logger.warning(f"Failed to delete rule {rule_id}: {e}") logger.info("NFS LB security group cleanup done") diff --git a/ocs_ci/utility/nfs_utils.py b/ocs_ci/utility/nfs_utils.py index 8a329856ed53..4898747fc6c2 100644 --- a/ocs_ci/utility/nfs_utils.py +++ b/ocs_ci/utility/nfs_utils.py @@ -198,7 +198,7 @@ def create_nfs_load_balancer_service( log.info("ingress hostname, %s", hostname_add) elif "ip" in host_details: hostname_add = host_details["ip"] - log.info("ingress host ip, %s", hostname_add) + log.info(f"ingress host ip, {hostname_add}") else: log.error("host details unavailable") return None diff --git a/tests/functional/nfs_feature/test_nfs_feature_enable_for_ODF_clusters.py b/tests/functional/nfs_feature/test_nfs_feature_enable_for_ODF_clusters.py index f02fef42ee39..617752609375 100644 --- a/tests/functional/nfs_feature/test_nfs_feature_enable_for_ODF_clusters.py +++ b/tests/functional/nfs_feature/test_nfs_feature_enable_for_ODF_clusters.py @@ -1619,19 +1619,19 @@ def test_nfs_pvc_subvolume_deletion( # Checking for stale volumes output = exec_cmd(cmd=f"{odf_cli_path} subvolume ls --stale") stale_before = self.parse_subvolume_ls_output(output) - log.info("Stale subvolumes before delete: %s", stale_before) + log.info(f"Stale subvolumes before delete: {stale_before}") # Deleting stale subvolume delete_output = exec_cmd( cmd=f"{odf_cli_path} subvolume delete" f" {new_pvc[0]} {new_pvc[1]} {new_pvc[2]}" ) - log.info("Subvolume delete output: %s", delete_output.stdout) + log.info(f"Subvolume delete output: {delete_output.stdout}") # Verify the specific subvolume was deleted output = exec_cmd(cmd=f"{odf_cli_path} subvolume ls --stale") stale_after = self.parse_subvolume_ls_output(output) - log.info("Stale subvolumes after delete: %s", stale_after) + log.info(f"Stale subvolumes after delete: {stale_after}") stale_svs = {sv[1] for sv in stale_after} assert ( new_pvc[1] not in stale_svs