diff --git a/CHANGELOG.md b/CHANGELOG.md index b268f5c6a6..0f5f3d2b2e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ CHANGELOG - Replace cfn-hup in compute nodes with systemd timer to support in place updates in order to improve performance for tightly coupled worloads at scale. This new mechanism relies on shared storage to sync updates between the head node and compute nodes. - Disable `dnf-makecache.timer` to improve performance for tightly coupled worloads on RHEL/Rocky at scale. -- Support updates of `Tags` during cluster-updates. +- Support updates of `Tags` during cluster-updates except in AWS Top Secret and AWS Secret Regions. - Add `LaunchTemplateOverrides` to cluster config to allow network interfaces to be customized by overriding the launch template of a compute resource. - This overrides the parallelcluster default using a shallow merge. - Add alarm on missing clustermgtd heartbeat. diff --git a/cli/src/pcluster/config/update_policy.py b/cli/src/pcluster/config/update_policy.py index b75f05a64b..13d8940d52 100644 --- a/cli/src/pcluster/config/update_policy.py +++ b/cli/src/pcluster/config/update_policy.py @@ -11,6 +11,7 @@ import re from enum import Enum +from pcluster.aws.common import get_region from pcluster.config.cluster_config import QueueUpdateStrategy from pcluster.config.update_policy_utils import SharedStorageChangeInfo from pcluster.constants import ( @@ -149,6 +150,12 @@ def is_slurm_queues_change(change): return any(path.startswith("SlurmQueues[") for path in change.path) +def _is_adc_region(): + """Return True if the currently configured AWS region belongs to an ADC partition.""" + region = get_region() or "" + return region.startswith("us-iso") + + def extract_type_and_name_from_path(path): # Example path = 'SlurmQueues[slurm-q-name]' # This function returns the type and name extracted like this: 'SlurmQueues', 'slurm-q-name' @@ -730,6 +737,25 @@ def fail_reason_extra_chef_attributes(change, _) -> str: + ". If you need this change, please consider creating a new cluster instead of updating the existing one.", ) +# Update supported everywhere except ADC regions (us-iso*, us-isob*). +# +# In ADC regions there is a CloudFormation behavior where an UpdateStack call that includes +# both a Tags change and a resource whose change is only in Metadata does not update that +# resource. This breaks the head node update flow because cfn-hup on the head node polls +# DescribeStackResource for Metadata changes on HeadNodeLaunchTemplate and never sees them, +# leaving the HeadNodeWaitCondition to time out. +# +# Until the CloudFormation behavior is fixed in ADC, block tag updates in those regions. +UpdatePolicy.SUPPORTED_UNLESS_ADC = UpdatePolicy( + name="SUPPORTED_UNLESS_ADC", + level=1000, + fail_reason=lambda change, patch: ( + f"Updating '{change.key}' during a cluster update is not supported in ADC regions." + ), + action_needed=lambda change, patch: f"Restore '{change.key}' to its previous value.", + condition_checker=lambda change, patch: not _is_adc_region(), +) + # Block update if cluster has a managed Fsx for Lustre FileSystem, otherwise fallback to QueueUpdateStrategy UpdatePolicy.MANAGED_FSX = UpdatePolicy( name="MANAGED_FSX", diff --git a/cli/src/pcluster/schemas/cluster_schema.py b/cli/src/pcluster/schemas/cluster_schema.py index ccf1231975..b4ecf40476 100644 --- a/cli/src/pcluster/schemas/cluster_schema.py +++ b/cli/src/pcluster/schemas/cluster_schema.py @@ -1994,7 +1994,11 @@ class ClusterSchema(BaseSchema): monitoring = fields.Nested(MonitoringSchema, metadata={"update_policy": UpdatePolicy.IGNORED}) additional_packages = fields.Nested(AdditionalPackagesSchema, metadata={"update_policy": UpdatePolicy.UNSUPPORTED}) - tags = fields.Nested(TagSchema, many=True, metadata={"update_policy": UpdatePolicy.SUPPORTED, "update_key": "Key"}) + tags = fields.Nested( + TagSchema, + many=True, + metadata={"update_policy": UpdatePolicy.SUPPORTED_UNLESS_ADC, "update_key": "Key"}, + ) iam = fields.Nested(ClusterIamSchema, metadata={"update_policy": UpdatePolicy.IGNORED}) directory_service = fields.Nested( DirectoryServiceSchema, metadata={"update_policy": UpdatePolicy.COMPUTE_AND_LOGIN_NODES_STOP} diff --git a/cli/src/pcluster/schemas/common_schema.py b/cli/src/pcluster/schemas/common_schema.py index 00ac611bce..92a85e2be3 100644 --- a/cli/src/pcluster/schemas/common_schema.py +++ b/cli/src/pcluster/schemas/common_schema.py @@ -183,7 +183,7 @@ class TagSchema(BaseSchema): value = fields.Str( required=True, validate=validate.Length(max=256), - metadata={"update_policy": UpdatePolicy.SUPPORTED}, + metadata={"update_policy": UpdatePolicy.SUPPORTED_UNLESS_ADC}, ) @post_load diff --git a/cli/tests/pcluster/config/test_config_patch.py b/cli/tests/pcluster/config/test_config_patch.py index 2a89c314cc..d38c97eba1 100644 --- a/cli/tests/pcluster/config/test_config_patch.py +++ b/cli/tests/pcluster/config/test_config_patch.py @@ -1154,11 +1154,11 @@ def test_patch_check_cluster_resource_bucket( "Tags", None, {"Key": "test3", "Value": "val3"}, - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, is_list=True, ) ], - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, id="tag_addition", ), pytest.param( @@ -1170,11 +1170,11 @@ def test_patch_check_cluster_resource_bucket( "Tags", {"Key": "test2", "Value": "val2"}, None, - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, is_list=True, ) ], - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, id="tag_removal", ), pytest.param( @@ -1186,11 +1186,11 @@ def test_patch_check_cluster_resource_bucket( "Value", "old_value", "new_value", - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, is_list=False, ) ], - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, id="tag_value_modification", ), pytest.param( @@ -1202,11 +1202,11 @@ def test_patch_check_cluster_resource_bucket( "Tags", None, {"Key": "test3", "Value": "val3"}, - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, is_list=True, ) ], - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, id="order_change_plus_addition", ), pytest.param( diff --git a/cli/tests/pcluster/config/test_update_policy.py b/cli/tests/pcluster/config/test_update_policy.py index 93bcf554cd..f7ea43aba7 100644 --- a/cli/tests/pcluster/config/test_update_policy.py +++ b/cli/tests/pcluster/config/test_update_policy.py @@ -3085,3 +3085,44 @@ def test_head_node_local_storage_update_blocked( if expected_action_needed: assert_that(action_needed_messages).is_not_empty() assert_that(action_needed_messages[0]).is_equal_to(expected_action_needed) + + +@pytest.mark.parametrize( + "region, expected_result", + [ + pytest.param("us-east-1", True, id="commercial region allows tag update"), + pytest.param("eu-west-1", True, id="commercial EU region allows tag update"), + pytest.param("us-gov-west-1", True, id="GovCloud region allows tag update"), + pytest.param("cn-north-1", True, id="China region allows tag update"), + pytest.param("us-iso-east-1", False, id="us-iso region blocks tag update"), + pytest.param("us-iso-west-1", False, id="us-iso-west region blocks tag update"), + pytest.param("us-isob-east-1", False, id="us-isob region blocks tag update"), + pytest.param("", True, id="empty region falls back to allowing update"), + ], +) +def test_supported_unless_adc_condition_checker(mocker, region, expected_result): + """SUPPORTED_UNLESS_ADC blocks updates only in ADC partitions (us-iso*, us-isob*).""" + mocker.patch("pcluster.config.update_policy.get_region", return_value=region) + + change_mock = mocker.MagicMock() + patch_mock = mocker.MagicMock() + + assert_that(UpdatePolicy.SUPPORTED_UNLESS_ADC.condition_checker(change_mock, patch_mock)).is_equal_to( + expected_result + ) + + +def test_supported_unless_adc_fail_reason_mentions_adc(mocker): + """The fail reason for SUPPORTED_UNLESS_ADC explains the ADC limitation.""" + mocker.patch("pcluster.config.update_policy.get_region", return_value="us-iso-east-1") + + change_mock = mocker.MagicMock() + change_mock.key = "Tags" + patch_mock = mocker.MagicMock() + + result, fail_reason, action_needed, _ = UpdatePolicy.SUPPORTED_UNLESS_ADC.check(change_mock, patch_mock) + + assert_that(result).is_equal_to(UpdatePolicy.CheckResult.ACTION_NEEDED) + assert_that(fail_reason).contains("ADC") + assert_that(fail_reason).contains("Tags") + assert_that(action_needed).contains("Tags") diff --git a/tests/integration-tests/tests/tags/test_tag_propagation.py b/tests/integration-tests/tests/tags/test_tag_propagation.py index fe7552983e..2a1b204557 100644 --- a/tests/integration-tests/tests/tags/test_tag_propagation.py +++ b/tests/integration-tests/tests/tags/test_tag_propagation.py @@ -26,6 +26,7 @@ get_shared_volume_tags, ) from time_utils import minutes, seconds +from utils import wait_for_computefleet_changed from tests.common.schedulers_common import SlurmCommands from tests.common.utils import get_installed_parallelcluster_version @@ -53,18 +54,27 @@ def test_tag_propagation(pcluster_config_reader, clusters_factory, scheduler, os _check_tag_propagation(cluster, scheduler, os, volume_name) cluster.stop() - + wait_for_computefleet_changed(cluster, "STOPPED") # Updates cluster with new configuration updated_cluster_config = pcluster_config_reader(config_file="pcluster.config.update.yaml", volume_name=volume_name) - cluster.update(str(updated_cluster_config), force_update="true") + cluster.update(str(updated_cluster_config)) cluster.start() # Makes sure that the compute nodes have started before checking tags _wait_for_compute_fleet_start(cluster) - # Checks for tag propagation - _check_tag_propagation(cluster, scheduler, os, volume_name, add_additional_config_tags=True) + # Checks for tag propagation. + # In ADC regions, CloudFormation doesn't properly update stack tags during a cluster update (see the + # SUPPORTED_UNLESS_ADC update policy), so AdditionalConfigTag is intentionally + # not added to the updated configuration and we skip asserting it here. + _check_tag_propagation( + cluster, + scheduler, + os, + volume_name, + add_additional_config_tags="us-iso" not in cluster.region, + ) if scheduler == "slurm": _test_queue_and_compute_resources_tags(cluster, pcluster_config_reader, scheduler, os, volume_name) diff --git a/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.queue_update.yaml b/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.queue_update.yaml index f26ba6dbb4..b74bf20f92 100644 --- a/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.queue_update.yaml +++ b/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.queue_update.yaml @@ -1,12 +1,25 @@ Image: Os: {{ os }} Tags: +{% if "us-iso" in region %} + # In ADC regions a CloudFormation bug causes HeadNodeLaunchTemplate to not be + # updated when UpdateStack includes a Tags change together with a metadata-only + # change on the resource, so we avoid adding a tag here (see SUPPORTED_UNLESS_ADC + # update policy). Tags must match pcluster.config.yaml exactly to avoid a diff. + - Key: ConfigFileTag + Value: ConfigFileTagValue + - Key: QueueOverrideTag + Value: ClusterLevelValue + - Key: ComputeOverrideTag + Value: ClusterLevelValue +{% else %} - Key: ComputeOverrideTag Value: ClusterLevelValue - Key: ConfigFileTag Value: ConfigFileTagValue - Key: QueueOverrideTag Value: ClusterLevelValue +{% endif %} HeadNode: InstanceType: {{ instance }} Networking: diff --git a/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.yaml b/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.yaml index ba23ec11be..0c66439fe9 100644 --- a/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.yaml +++ b/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.yaml @@ -1,6 +1,18 @@ Image: Os: {{ os }} Tags: +{% if "us-iso" in region %} + # In ADC regions a CloudFormation bug causes HeadNodeLaunchTemplate to not be + # updated when UpdateStack includes a Tags change together with a metadata-only + # change on the resource, so we avoid adding a tag here (see SUPPORTED_UNLESS_ADC + # update policy). Tags must match pcluster.config.yaml exactly to avoid a diff. + - Key: ConfigFileTag + Value: ConfigFileTagValue + - Key: QueueOverrideTag + Value: ClusterLevelValue + - Key: ComputeOverrideTag + Value: ClusterLevelValue +{% else %} - Key: ComputeOverrideTag Value: ClusterLevelValue - Key: ConfigFileTag @@ -9,6 +21,7 @@ Tags: Value: ClusterLevelValue - Key: AdditionalConfigTag Value: AdditionalConfigTagValue +{% endif %} HeadNode: InstanceType: {{ instance }} Networking: