diff --git a/.secrets.baseline b/.secrets.baseline index 13a4e17c62cd..d3d6e592b23c 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -637,7 +637,7 @@ "hashed_secret": "b7e9a6e2e04ed3edbf8b4e21def4beccbb6eb6b1", "is_secret": false, "is_verified": false, - "line_number": 230, + "line_number": 233, "type": "Secret Keyword", "verified_result": null }, @@ -645,7 +645,7 @@ "hashed_secret": "a12337323b638ab044b1166bff4b1a1f83162819", "is_secret": false, "is_verified": false, - "line_number": 829, + "line_number": 832, "type": "Secret Keyword", "verified_result": null }, @@ -653,7 +653,7 @@ "hashed_secret": "fb947972c92f052c0a08866d182be0075a2b601b", "is_secret": false, "is_verified": false, - "line_number": 838, + "line_number": 841, "type": "Secret Keyword", "verified_result": null }, @@ -661,7 +661,7 @@ "hashed_secret": "03e227627ab8681281fdb8aa3d799b03f782d672", "is_secret": false, "is_verified": false, - "line_number": 2030, + "line_number": 2033, "type": "Secret Keyword", "verified_result": null }, @@ -669,7 +669,7 @@ "hashed_secret": "ef5f3d909f23bd0aa02b4253f98350384f709c86", "is_secret": false, "is_verified": false, - "line_number": 2137, + "line_number": 2140, "type": "Secret Keyword", "verified_result": null }, @@ -677,7 +677,7 @@ "hashed_secret": "cb1ae2b504c4615841d8144267a131231d2bd677", "is_secret": false, "is_verified": false, - "line_number": 2138, + "line_number": 2141, "type": "Secret Keyword", "verified_result": null }, @@ -685,7 +685,7 @@ "hashed_secret": "1a1e70e87dd0452c42f33ce9bf74aa28134dba6b", "is_secret": false, "is_verified": false, - "line_number": 2139, + "line_number": 2142, "type": "Secret Keyword", "verified_result": null }, @@ -693,7 +693,7 @@ "hashed_secret": "7b1ba2f04f2f1604dc4e3caffcadf9fcbce7df5b", "is_secret": false, "is_verified": false, - "line_number": 2140, + "line_number": 2143, "type": "Secret Keyword", "verified_result": null }, @@ -701,7 +701,7 @@ "hashed_secret": "0fa3b21ced80146d752888f2b60ec80e0d4b8925", 
"is_secret": false, "is_verified": false, - "line_number": 2145, + "line_number": 2148, "type": "Secret Keyword", "verified_result": null }, @@ -709,7 +709,7 @@ "hashed_secret": "f084f2068494b8d1cd06811dd97d02c3d85f40ee", "is_secret": false, "is_verified": false, - "line_number": 2160, + "line_number": 2163, "type": "Secret Keyword", "verified_result": null }, @@ -717,7 +717,7 @@ "hashed_secret": "adfa401a3b0a733d8f00519ac8c6b3893a2e7e8e", "is_secret": false, "is_verified": false, - "line_number": 2161, + "line_number": 2164, "type": "Secret Keyword", "verified_result": null }, @@ -725,7 +725,7 @@ "hashed_secret": "898e46bbadc12f87120548bd445eb4210c8407c8", "is_secret": false, "is_verified": false, - "line_number": 2169, + "line_number": 2172, "type": "Secret Keyword", "verified_result": null }, @@ -733,7 +733,7 @@ "hashed_secret": "f57ccec6b8f7b12b635ab53d26c3bf7300247341", "is_secret": false, "is_verified": false, - "line_number": 2170, + "line_number": 2173, "type": "Secret Keyword", "verified_result": null }, @@ -741,7 +741,7 @@ "hashed_secret": "77b044ea736f8cbe568d1954424186d901f89db9", "is_secret": false, "is_verified": false, - "line_number": 2171, + "line_number": 2174, "type": "Secret Keyword", "verified_result": null }, @@ -749,7 +749,7 @@ "hashed_secret": "d64368f12ca17c69568c6a132f17d44d56e60660", "is_secret": false, "is_verified": false, - "line_number": 2172, + "line_number": 2175, "type": "Secret Keyword", "verified_result": null }, @@ -757,7 +757,7 @@ "hashed_secret": "8f9ca35156c02cb6ba58c5b51230b9bedc38de4f", "is_secret": false, "is_verified": false, - "line_number": 2173, + "line_number": 2176, "type": "Secret Keyword", "verified_result": null }, @@ -765,7 +765,7 @@ "hashed_secret": "9ec53cfd9929c70c3f87c210b6a7b77fb6d79d43", "is_secret": false, "is_verified": false, - "line_number": 2767, + "line_number": 2770, "type": "Secret Keyword", "verified_result": null }, @@ -773,7 +773,7 @@ "hashed_secret": "ee977806d7286510da8b9a7492ba58e2484c0ecc", 
"is_secret": false, "is_verified": false, - "line_number": 2920, + "line_number": 2927, "type": "Secret Keyword", "verified_result": null }, @@ -781,7 +781,7 @@ "hashed_secret": "adc1f5c8707f7d7aba3aabe13c15e5ef1151872e", "is_secret": false, "is_verified": false, - "line_number": 2921, + "line_number": 2928, "type": "Secret Keyword", "verified_result": null }, @@ -789,7 +789,7 @@ "hashed_secret": "ee46262b2df945e46ea310b925ad087465dbd3f2", "is_secret": false, "is_verified": false, - "line_number": 3646, + "line_number": 3649, "type": "Secret Keyword", "verified_result": null }, @@ -797,7 +797,7 @@ "hashed_secret": "f678cad4ab874d71b559a069d5e34a95fe38a480", "is_secret": false, "is_verified": false, - "line_number": 3647, + "line_number": 3650, "type": "Secret Keyword", "verified_result": null } diff --git a/ocs_ci/ocs/constants.py b/ocs_ci/ocs/constants.py index 72f65ec7a176..21bf3f10e250 100644 --- a/ocs_ci/ocs/constants.py +++ b/ocs_ci/ocs/constants.py @@ -59,6 +59,9 @@ TEMPLATE_COUCHBASE_PILLOWFIGHT_DIR = os.path.join(TEMPLATE_COUCHBASE_DIR, "pillowfight") TEMPLATE_CEPHFS_STRESS_DIR = os.path.join(TEMPLATE_WORKLOAD_DIR, "cephfs_stress") TEMPLATE_MCG_DIR = os.path.join(TEMPLATE_DIR, "mcg") +TEMPLATE_BLOCK_NB_EGRESS_NETWORK_POLICY = os.path.join( + TEMPLATE_MCG_DIR, "block_egress_network_policy.yaml" +) TEMPLATE_RGW_DIR = os.path.join(TEMPLATE_DIR, "rgw") TEMPLATE_AMQ_DIR = os.path.join(TEMPLATE_WORKLOAD_DIR, "amq") TEMPLATE_OPENSHIFT_INFRA_DIR = os.path.join(TEMPLATE_DIR, "openshift-infra/") diff --git a/ocs_ci/templates/mcg/block_egress_network_policy.yaml b/ocs_ci/templates/mcg/block_egress_network_policy.yaml new file mode 100644 index 000000000000..5dfdc977153d --- /dev/null +++ b/ocs_ci/templates/mcg/block_egress_network_policy.yaml @@ -0,0 +1,16 @@ +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: block-cloud-egress +spec: + # Only targets noobaa-endpoint pods, not other pods in the namespace + podSelector: + matchLabels: + 
"""
Disruption tests for noobaa-endpoint pods: cut the connection to cloud
storage while a large S3 transfer is in flight and check the pods survive.
"""

import logging
import os
import subprocess
import tempfile
import threading
import time

import pytest

from ocs_ci.framework import config
from ocs_ci.framework.pytest_customization.marks import (
    mcg,
    red_squad,
    runs_on_provider,
    skipif_managed_service,
    skipif_proxy_cluster,
)
from ocs_ci.framework.testlib import (
    MCGTest,
    skipif_disconnected_cluster,
    tier4c,
)
from ocs_ci.ocs import constants, ocp
from ocs_ci.ocs.bucket_utils import craft_s3_command
from ocs_ci.ocs.exceptions import CommandFailed
from ocs_ci.ocs.resources import pod
from ocs_ci.utility import templating

logger = logging.getLogger(__name__)

# One row per test case: (polarion id, disrupted operation, store kind, platform).
# Kept as an explicit table so every Polarion ID stays greppable.
_DISRUPTION_CASES = (
    ("OCS-7901", "download", "namespacestore", "aws"),
    ("OCS-7902", "download", "namespacestore", "azure"),
    ("OCS-7903", "download", "namespacestore", "ibmcos"),
    ("OCS-7904", "download", "backingstore", "aws"),
    ("OCS-7905", "download", "backingstore", "azure"),
    ("OCS-7906", "download", "backingstore", "gcp"),
    ("OCS-7907", "download", "backingstore", "ibmcos"),
    ("OCS-7908", "upload", "namespacestore", "aws"),
    ("OCS-7909", "upload", "namespacestore", "azure"),
    ("OCS-7910", "upload", "namespacestore", "ibmcos"),
    ("OCS-7911", "upload", "backingstore", "aws"),
    ("OCS-7912", "upload", "backingstore", "azure"),
    ("OCS-7913", "upload", "backingstore", "gcp"),
    ("OCS-7914", "upload", "backingstore", "ibmcos"),
)


def _bucketclass_dict(store_kind, platform):
    """
    Build the bucketclass dict handed to ``bucket_factory``.

    Args:
        store_kind (str): "namespacestore" or "backingstore"
        platform (str): Cloud platform key, e.g. "aws"

    Returns:
        dict: A fresh bucketclass dict requesting one store on ``platform``

    """
    stores = {platform: [(1, None)]}
    if store_kind == "namespacestore":
        return {
            "interface": "OC",
            "namespace_policy_dict": {
                "type": "Single",
                "namespacestore_dict": stores,
            },
        }
    return {"interface": "OC", "backingstore_dict": stores}


def _disruption_params():
    """
    Expand ``_DISRUPTION_CASES`` into the ``pytest.param`` list.

    Returns:
        list: One ``pytest.param`` per case, carrying the Polarion mark and
            an id of the form ``<operation>-<store kind>-<platform>``

    """
    return [
        pytest.param(
            operation,
            _bucketclass_dict(store_kind, platform),
            marks=pytest.mark.polarion_id(polarion_id),
            id=f"{operation}-{store_kind}-{platform}",
        )
        for polarion_id, operation, store_kind, platform in _DISRUPTION_CASES
    ]


@mcg
@red_squad
@runs_on_provider
@skipif_managed_service
@skipif_disconnected_cluster
@skipif_proxy_cluster
class TestEndpointCloudNetworkDisruption(MCGTest):
    """
    Test class for verifying noobaa-endpoint pod resilience when
    cloud storage connections are severed mid-stream.
    """

    LARGE_FILE_SIZE_MB = 2048

    # Must match metadata.name in the manifest loaded from
    # constants.TEMPLATE_BLOCK_NB_EGRESS_NETWORK_POLICY
    # (ocs_ci/templates/mcg/block_egress_network_policy.yaml), because the
    # cleanup finalizer deletes the policy by this name. The manifest reads:
    #
    #   apiVersion: networking.k8s.io/v1
    #   kind: NetworkPolicy
    #   metadata:
    #     name: block-cloud-egress
    #   spec:
    #     # Only targets noobaa-endpoint pods, not other pods in the namespace
    #     podSelector:
    #       matchLabels:
    #         noobaa-s3: noobaa
    #     policyTypes:
    #       - Egress
    #     # Allow egress only to pods within the cluster (any namespace).
    #     # All external traffic (cloud storage, internet) is blocked.
    #     egress:
    #       - to:
    #           - namespaceSelector: {}
    NETWORK_POLICY_NAME = "block-cloud-egress"

    # Timeout for the disrupted S3 command and for joining its thread
    S3_OPERATION_TIMEOUT_SEC = 120
    # Head start given to the transfer before the egress block is applied
    DISRUPTION_DELAY_SEC = 2
    # Grace period for a crash to show up before pods are inspected
    CRASH_SETTLE_SEC = 30

    @tier4c
    @pytest.mark.parametrize(
        argnames=["disruption_during", "bucketclass_dict"],
        argvalues=_disruption_params(),
    )
    def test_endpoint_survives_cloud_connection_severed(
        self,
        request,
        disruption_during,
        bucketclass_dict,
        mcg_obj,
        awscli_pod_session,
        bucket_factory,
    ):
        """
        Verify that noobaa-endpoint pods survive when TCP connections
        to cloud storage are severed mid-stream, rather than crashing
        with an unhandled exception.

        Test steps:
        1. Create a bucket backed by cloud storage (namespacestore
           or backingstore depending on parametrization)
        2. Generate a large file on the awscli pod
        3. For download disruption: upload the file first
        4. Record endpoint pod restart counts
        5. Register the NetworkPolicy cleanup finalizer
        6. Start a large upload or download in a background thread
        7. Apply a NetworkPolicy to block all external egress
           from noobaa-endpoint pods only and wait for the transfer to fail
        8. Verify endpoint pods did not crash or restart
        """
        namespace = config.ENV_DATA["cluster_namespace"]
        awscli_pod = awscli_pod_session

        store_dict = bucketclass_dict.get(
            "backingstore_dict",
            bucketclass_dict.get("namespace_policy_dict", {}).get(
                "namespacestore_dict", {}
            ),
        )
        # Only used for logging, so an unexpected dict shape must not abort here
        platform = next(iter(store_dict), "unknown")

        # Step 1: Create bucket with parametrized bucketclass
        logger.info(f"Creating bucket backed by {platform} cloud storage")
        bucket = bucket_factory(
            amount=1,
            interface=bucketclass_dict["interface"],
            bucketclass=bucketclass_dict,
        )[0]
        logger.info(f"Created bucket: {bucket.name}")

        # Step 2: Generate a large file
        logger.info(f"Generating {self.LARGE_FILE_SIZE_MB} MB file")
        awscli_pod.exec_sh_cmd_on_pod(
            f"dd if=/dev/urandom of=/tmp/bigfile "
            f"bs=1M count={self.LARGE_FILE_SIZE_MB} status=none",
            timeout=300,
        )

        def _remove_test_files():
            awscli_pod.exec_sh_cmd_on_pod("rm -f /tmp/bigfile /tmp/bigfile_download")

        request.addfinalizer(_remove_test_files)

        # Step 3: For download disruption, upload the file first
        if disruption_during == "download":
            logger.info("Uploading file before download disruption test")
            upload_cmd = craft_s3_command(
                f"cp /tmp/bigfile s3://{bucket.name}/bigfile",
                mcg_obj=mcg_obj,
            )
            awscli_pod.exec_cmd_on_pod(upload_cmd, out_yaml_format=False, timeout=600)
            logger.info("Upload complete")

        # Step 4: Record endpoint pod state before disruption
        endpoint_pods = pod.get_noobaa_endpoint_pods()
        assert endpoint_pods, "No noobaa-endpoint pods found"
        restart_counts_before = {p.name: p.restart_count for p in endpoint_pods}
        logger.info(f"Endpoint pod restart counts before: {restart_counts_before}")

        # Step 5: Register NetworkPolicy cleanup finalizer *before* the policy
        # is applied, so the egress block is removed even if a later step fails
        def _cleanup_network_policy():
            try:
                ocp.OCP(kind=constants.NETWORK_POLICY, namespace=namespace).delete(
                    resource_name=self.NETWORK_POLICY_NAME
                )
                logger.info(f"Deleted NetworkPolicy {self.NETWORK_POLICY_NAME}")
            except CommandFailed as err:
                # Best effort: the policy may never have been created, but keep
                # the error text so a genuine deletion failure is visible
                logger.warning(
                    f"Could not delete NetworkPolicy {self.NETWORK_POLICY_NAME} "
                    f"(it may already be deleted): {err}"
                )

        request.addfinalizer(_cleanup_network_policy)

        # Step 6: Start operation in background
        logger.info(f"Starting large {disruption_during} in background thread")
        operation_disrupted = threading.Event()
        unexpected_errors = []

        if disruption_during == "download":
            s3_cmd = craft_s3_command(
                f"cp s3://{bucket.name}/bigfile /tmp/bigfile_download",
                mcg_obj=mcg_obj,
                max_attempts=2,
            )
        else:
            s3_cmd = craft_s3_command(
                f"cp /tmp/bigfile s3://{bucket.name}/bigfile",
                mcg_obj=mcg_obj,
                max_attempts=2,
            )

        def _s3_operation():
            try:
                awscli_pod.exec_cmd_on_pod(
                    s3_cmd,
                    out_yaml_format=False,
                    timeout=self.S3_OPERATION_TIMEOUT_SEC,
                )
            except (CommandFailed, TimeoutError, subprocess.TimeoutExpired):
                operation_disrupted.set()
                logger.info(
                    f"{disruption_during.capitalize()} failed as expected "
                    f"due to network disruption"
                )
            except Exception as err:
                # Thread boundary: hand the error to the main thread, where it
                # is turned into a test failure with the real cause
                unexpected_errors.append(err)
                logger.exception(
                    f"{disruption_during.capitalize()} raised an unexpected error"
                )
            else:
                logger.info(
                    f"{disruption_during.capitalize()} completed before "
                    f"network disruption took effect"
                )

        operation_thread = threading.Thread(target=_s3_operation, daemon=True)
        operation_thread.start()
        time.sleep(self.DISRUPTION_DELAY_SEC)

        # Step 7: Sever the cloud connection and wait for the transfer to fail
        self._apply_egress_block_policy(namespace)

        operation_thread.join(timeout=self.S3_OPERATION_TIMEOUT_SEC)
        # join() returns silently on timeout, so liveness is checked explicitly
        assert not operation_thread.is_alive(), (
            f"{disruption_during.capitalize()} is still running "
            f"{self.S3_OPERATION_TIMEOUT_SEC} seconds after the NetworkPolicy "
            f"was applied; it neither failed nor completed."
        )
        assert not unexpected_errors, (
            f"{disruption_during.capitalize()} raised an unexpected error: "
            f"{unexpected_errors[0]!r}"
        )
        assert operation_disrupted.is_set(), (
            f"{disruption_during.capitalize()} completed before the "
            f"NetworkPolicy disrupted it. Increase LARGE_FILE_SIZE_MB "
            f"to ensure the {disruption_during} is still in progress "
            f"when the network is severed."
        )

        # Step 8: Verify endpoint pods survived
        logger.info(
            f"Waiting {self.CRASH_SETTLE_SEC} seconds for any crash to manifest, "
            f"then verifying"
        )
        time.sleep(self.CRASH_SETTLE_SEC)
        self._verify_endpoint_pods_survived(restart_counts_before)

        logger.info(
            f"All endpoint pods survived the cloud connection "
            f"disruption during {disruption_during}"
        )

    def _apply_egress_block_policy(self, namespace):
        """
        Apply the egress-blocking NetworkPolicy template in ``namespace``.

        Args:
            namespace (str): Namespace the NetworkPolicy is created in

        """
        logger.info(
            "Applying NetworkPolicy to block external egress "
            "from noobaa-endpoint pods only"
        )
        network_policy_data = templating.load_yaml(
            constants.TEMPLATE_BLOCK_NB_EGRESS_NETWORK_POLICY
        )
        network_policy_data["metadata"]["namespace"] = namespace

        # Only the path is needed: close the handle right away (delete=False
        # keeps the file) and let the templating helper write to it by name
        with tempfile.NamedTemporaryFile(
            mode="w+",
            prefix="network_policy_",
            suffix=".yaml",
            delete=False,
        ) as temp_yaml:
            policy_path = temp_yaml.name
        try:
            templating.dump_data_to_temp_yaml(network_policy_data, policy_path)
            ocp_obj = ocp.OCP(kind=constants.NETWORK_POLICY, namespace=namespace)
            ocp_obj.exec_oc_cmd(f"apply -f {policy_path}")
        finally:
            # Remove the manifest even when dumping or `oc apply` fails
            os.unlink(policy_path)
        logger.info(
            "NetworkPolicy applied - external egress blocked "
            "for noobaa-endpoint pods"
        )

    def _verify_endpoint_pods_survived(self, restart_counts_before):
        """
        Assert that the endpoint pods are Running, did not restart and did
        not log a PANIC/uncaughtException.

        Args:
            restart_counts_before (dict): Pod name -> restart count recorded
                before the disruption

        """
        endpoint_pods_after = pod.get_noobaa_endpoint_pods()
        # Without this the loop below would pass vacuously on an empty list
        assert endpoint_pods_after, "No noobaa-endpoint pods found after the disruption"

        names_after = {ep.name for ep in endpoint_pods_after}
        vanished = sorted(set(restart_counts_before) - names_after)
        if vanished:
            # NOTE(review): not asserted because an endpoint scale-down would
            # presumably remove pods legitimately; confirm whether a vanished
            # pod should fail the test
            logger.warning(
                f"Endpoint pods present before the disruption are gone: {vanished}"
            )

        for ep in endpoint_pods_after:
            pod_status = ep.get()["status"]["phase"]
            assert pod_status == constants.STATUS_RUNNING, (
                f"Endpoint pod {ep.name} is in {pod_status} state "
                f"instead of Running"
            )

            # Pods created after the baseline have no count to compare against
            if ep.name in restart_counts_before:
                current_restarts = ep.restart_count
                assert current_restarts == restart_counts_before[ep.name], (
                    f"Endpoint pod {ep.name} restarted: "
                    f"before={restart_counts_before[ep.name]}, "
                    f"after={current_restarts}"
                )

            panics = pod.search_pattern_in_pod_logs(
                pod_name=ep.name,
                pattern="PANIC.*uncaughtException",
                container="endpoint",
                since="5m",
            )
            assert (
                not panics
            ), f"Found PANIC/uncaughtException in endpoint pod {ep.name}: {panics}"