From c135b52728d291fa0710b5504b81db58c0aa04d8 Mon Sep 17 00:00:00 2001 From: hanwenli Date: Mon, 4 May 2026 08:33:49 -0700 Subject: [PATCH 1/2] Block Tags updates on cluster update in ADC regions In us-iso*, us-isob* regions, CloudFormation has a behavior where an UpdateStack call that includes both a Tags change and a resource whose change is only in Metadata does not update that resource. This breaks the head node update flow: cfn-hup on the head node polls DescribeStackResource for Metadata changes on HeadNodeLaunchTemplate, never sees them when tags are updated in the same call, and the HeadNodeWaitCondition times out after 30 minutes. Until the CloudFormation behavior is addressed, block tag updates at the validation layer in ADC regions so the failure mode surfaces immediately. Commercial, GovCloud, and China regions are unaffected. Policy changes: - Add UpdatePolicy.SUPPORTED_UNLESS_ADC, equivalent to SUPPORTED outside ADC and UNSUPPORTED inside ADC. - Cover the new policy with parametrized unit tests across commercial, GovCloud, China, us-iso, us-isob, and empty region values. --- CHANGELOG.md | 2 +- cli/src/pcluster/config/update_policy.py | 26 ++++++++++++ cli/src/pcluster/schemas/cluster_schema.py | 6 ++- cli/src/pcluster/schemas/common_schema.py | 2 +- .../pcluster/config/test_config_patch.py | 16 ++++---- .../pcluster/config/test_update_policy.py | 41 +++++++++++++++++++ .../tests/tags/test_tag_propagation.py | 13 +++++- .../pcluster.config.update.queue_update.yaml | 13 ++++++ .../pcluster.config.update.yaml | 13 ++++++ 9 files changed, 119 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b268f5c6a6..0f5f3d2b2e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ CHANGELOG - Replace cfn-hup in compute nodes with systemd timer to support in place updates in order to improve performance for tightly coupled worloads at scale. This new mechanism relies on shared storage to sync updates between the head node and compute nodes. - Disable `dnf-makecache.timer` to improve performance for tightly coupled worloads on RHEL/Rocky at scale. -- Support updates of `Tags` during cluster-updates. +- Support updates of `Tags` during cluster-updates except in AWS Top Secret and AWS Secret Regions. - Add `LaunchTemplateOverrides` to cluster config to allow network interfaces to be customized by overriding the launch template of a compute resource. - This overrides the parallelcluster default using a shallow merge. - Add alarm on missing clustermgtd heartbeat. diff --git a/cli/src/pcluster/config/update_policy.py b/cli/src/pcluster/config/update_policy.py index b75f05a64b..13d8940d52 100644 --- a/cli/src/pcluster/config/update_policy.py +++ b/cli/src/pcluster/config/update_policy.py @@ -11,6 +11,7 @@ import re from enum import Enum +from pcluster.aws.common import get_region from pcluster.config.cluster_config import QueueUpdateStrategy from pcluster.config.update_policy_utils import SharedStorageChangeInfo from pcluster.constants import ( @@ -149,6 +150,12 @@ def is_slurm_queues_change(change): return any(path.startswith("SlurmQueues[") for path in change.path) +def _is_adc_region(): + """Return True if the currently configured AWS region belongs to an ADC partition.""" + region = get_region() or "" + return region.startswith("us-iso") + + def extract_type_and_name_from_path(path): # Example path = 'SlurmQueues[slurm-q-name]' # This function returns the type and name extracted like this: 'SlurmQueues', 'slurm-q-name' @@ -730,6 +737,25 @@ def fail_reason_extra_chef_attributes(change, _) -> str: + ". If you need this change, please consider creating a new cluster instead of updating the existing one.", ) +# Update supported everywhere except ADC regions (us-iso*, us-isob*). +# +# In ADC regions there is a CloudFormation behavior where an UpdateStack call that includes +# both a Tags change and a resource whose change is only in Metadata does not update that +# resource. This breaks the head node update flow because cfn-hup on the head node polls +# DescribeStackResource for Metadata changes on HeadNodeLaunchTemplate and never sees them, +# leaving the HeadNodeWaitCondition to time out. +# +# Until the CloudFormation behavior is fixed in ADC, block tag updates in those regions. +UpdatePolicy.SUPPORTED_UNLESS_ADC = UpdatePolicy( + name="SUPPORTED_UNLESS_ADC", + level=1000, + fail_reason=lambda change, patch: ( + f"Updating '{change.key}' during a cluster update is not supported in ADC regions." + ), + action_needed=lambda change, patch: f"Restore '{change.key}' to its previous value.", + condition_checker=lambda change, patch: not _is_adc_region(), +) + # Block update if cluster has a managed Fsx for Lustre FileSystem, otherwise fallback to QueueUpdateStrategy UpdatePolicy.MANAGED_FSX = UpdatePolicy( name="MANAGED_FSX", diff --git a/cli/src/pcluster/schemas/cluster_schema.py b/cli/src/pcluster/schemas/cluster_schema.py index ccf1231975..b4ecf40476 100644 --- a/cli/src/pcluster/schemas/cluster_schema.py +++ b/cli/src/pcluster/schemas/cluster_schema.py @@ -1994,7 +1994,11 @@ class ClusterSchema(BaseSchema): monitoring = fields.Nested(MonitoringSchema, metadata={"update_policy": UpdatePolicy.IGNORED}) additional_packages = fields.Nested(AdditionalPackagesSchema, metadata={"update_policy": UpdatePolicy.UNSUPPORTED}) - tags = fields.Nested(TagSchema, many=True, metadata={"update_policy": UpdatePolicy.SUPPORTED, "update_key": "Key"}) + tags = fields.Nested( + TagSchema, + many=True, + metadata={"update_policy": UpdatePolicy.SUPPORTED_UNLESS_ADC, "update_key": "Key"}, + ) iam = fields.Nested(ClusterIamSchema, metadata={"update_policy": UpdatePolicy.IGNORED}) directory_service = fields.Nested( DirectoryServiceSchema, metadata={"update_policy": UpdatePolicy.COMPUTE_AND_LOGIN_NODES_STOP} diff --git a/cli/src/pcluster/schemas/common_schema.py b/cli/src/pcluster/schemas/common_schema.py index 00ac611bce..92a85e2be3 100644 --- a/cli/src/pcluster/schemas/common_schema.py +++ b/cli/src/pcluster/schemas/common_schema.py @@ -183,7 +183,7 @@ class TagSchema(BaseSchema): value = fields.Str( required=True, validate=validate.Length(max=256), - metadata={"update_policy": UpdatePolicy.SUPPORTED}, + metadata={"update_policy": UpdatePolicy.SUPPORTED_UNLESS_ADC}, ) @post_load diff --git a/cli/tests/pcluster/config/test_config_patch.py b/cli/tests/pcluster/config/test_config_patch.py index 2a89c314cc..d38c97eba1 100644 --- a/cli/tests/pcluster/config/test_config_patch.py +++ b/cli/tests/pcluster/config/test_config_patch.py @@ -1154,11 +1154,11 @@ def test_patch_check_cluster_resource_bucket( "Tags", None, {"Key": "test3", "Value": "val3"}, - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, is_list=True, ) ], - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, id="tag_addition", ), pytest.param( @@ -1170,11 +1170,11 @@ def test_patch_check_cluster_resource_bucket( "Tags", {"Key": "test2", "Value": "val2"}, None, - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, is_list=True, ) ], - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, id="tag_removal", ), pytest.param( @@ -1186,11 +1186,11 @@ def test_patch_check_cluster_resource_bucket( "Value", "old_value", "new_value", - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, is_list=False, ) ], - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, id="tag_value_modification", ), pytest.param( @@ -1202,11 +1202,11 @@ def test_patch_check_cluster_resource_bucket( "Tags", None, {"Key": "test3", "Value": "val3"}, - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, is_list=True, ) ], - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, id="order_change_plus_addition", ), pytest.param( diff --git a/cli/tests/pcluster/config/test_update_policy.py b/cli/tests/pcluster/config/test_update_policy.py index 93bcf554cd..f7ea43aba7 100644 --- a/cli/tests/pcluster/config/test_update_policy.py +++ b/cli/tests/pcluster/config/test_update_policy.py @@ -3085,3 +3085,44 @@ def test_head_node_local_storage_update_blocked( if expected_action_needed: assert_that(action_needed_messages).is_not_empty() assert_that(action_needed_messages[0]).is_equal_to(expected_action_needed) + + +@pytest.mark.parametrize( + "region, expected_result", + [ + pytest.param("us-east-1", True, id="commercial region allows tag update"), + pytest.param("eu-west-1", True, id="commercial EU region allows tag update"), + pytest.param("us-gov-west-1", True, id="GovCloud region allows tag update"), + pytest.param("cn-north-1", True, id="China region allows tag update"), + pytest.param("us-iso-east-1", False, id="us-iso region blocks tag update"), + pytest.param("us-iso-west-1", False, id="us-iso-west region blocks tag update"), + pytest.param("us-isob-east-1", False, id="us-isob region blocks tag update"), + pytest.param("", True, id="empty region falls back to allowing update"), + ], +) +def test_supported_unless_adc_condition_checker(mocker, region, expected_result): + """SUPPORTED_UNLESS_ADC blocks updates only in ADC partitions (us-iso*, us-isob*).""" + mocker.patch("pcluster.config.update_policy.get_region", return_value=region) + + change_mock = mocker.MagicMock() + patch_mock = mocker.MagicMock() + + assert_that(UpdatePolicy.SUPPORTED_UNLESS_ADC.condition_checker(change_mock, patch_mock)).is_equal_to( + expected_result + ) + + +def test_supported_unless_adc_fail_reason_mentions_adc(mocker): + """The fail reason for SUPPORTED_UNLESS_ADC explains the ADC limitation.""" + mocker.patch("pcluster.config.update_policy.get_region", return_value="us-iso-east-1") + + change_mock = mocker.MagicMock() + change_mock.key = "Tags" + patch_mock = mocker.MagicMock() + + result, fail_reason, action_needed, _ = UpdatePolicy.SUPPORTED_UNLESS_ADC.check(change_mock, patch_mock) + + assert_that(result).is_equal_to(UpdatePolicy.CheckResult.ACTION_NEEDED) + assert_that(fail_reason).contains("ADC") + assert_that(fail_reason).contains("Tags") + assert_that(action_needed).contains("Tags") diff --git a/tests/integration-tests/tests/tags/test_tag_propagation.py b/tests/integration-tests/tests/tags/test_tag_propagation.py index fe7552983e..7e160d50f1 100644 --- a/tests/integration-tests/tests/tags/test_tag_propagation.py +++ b/tests/integration-tests/tests/tags/test_tag_propagation.py @@ -63,8 +63,17 @@ def test_tag_propagation(pcluster_config_reader, clusters_factory, scheduler, os # Makes sure that the compute nodes have started before checking tags _wait_for_compute_fleet_start(cluster) - # Checks for tag propagation - _check_tag_propagation(cluster, scheduler, os, volume_name, add_additional_config_tags=True) + # Checks for tag propagation. + # In ADC regions, CloudFormation doesn't properly update stack tags during a cluster update (see the + # SUPPORTED_UNLESS_ADC update policy), so AdditionalConfigTag is intentionally + # not added to the updated configuration and we skip asserting it here. + _check_tag_propagation( + cluster, + scheduler, + os, + volume_name, + add_additional_config_tags="us-iso" not in cluster.region, + ) if scheduler == "slurm": _test_queue_and_compute_resources_tags(cluster, pcluster_config_reader, scheduler, os, volume_name) diff --git a/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.queue_update.yaml b/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.queue_update.yaml index f26ba6dbb4..b74bf20f92 100644 --- a/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.queue_update.yaml +++ b/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.queue_update.yaml @@ -1,12 +1,25 @@ Image: Os: {{ os }} Tags: +{% if "us-iso" in region %} + # In ADC regions a CloudFormation bug causes HeadNodeLaunchTemplate to not be + # updated when UpdateStack includes a Tags change together with a metadata-only + # change on the resource, so we avoid adding a tag here (see SUPPORTED_UNLESS_ADC + # update policy). Tags must match pcluster.config.yaml exactly to avoid a diff. + - Key: ConfigFileTag + Value: ConfigFileTagValue + - Key: QueueOverrideTag + Value: ClusterLevelValue + - Key: ComputeOverrideTag + Value: ClusterLevelValue +{% else %} - Key: ComputeOverrideTag Value: ClusterLevelValue - Key: ConfigFileTag Value: ConfigFileTagValue - Key: QueueOverrideTag Value: ClusterLevelValue +{% endif %} HeadNode: InstanceType: {{ instance }} Networking: diff --git a/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.yaml b/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.yaml index ba23ec11be..0c66439fe9 100644 --- a/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.yaml +++ b/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.yaml @@ -1,6 +1,18 @@ Image: Os: {{ os }} Tags: +{% if "us-iso" in region %} + # In ADC regions a CloudFormation bug causes HeadNodeLaunchTemplate to not be + # updated when UpdateStack includes a Tags change together with a metadata-only + # change on the resource, so we avoid adding a tag here (see SUPPORTED_UNLESS_ADC + # update policy). Tags must match pcluster.config.yaml exactly to avoid a diff. + - Key: ConfigFileTag + Value: ConfigFileTagValue + - Key: QueueOverrideTag + Value: ClusterLevelValue + - Key: ComputeOverrideTag + Value: ClusterLevelValue +{% else %} - Key: ComputeOverrideTag Value: ClusterLevelValue - Key: ConfigFileTag @@ -9,6 +21,7 @@ Tags: Value: ClusterLevelValue - Key: AdditionalConfigTag Value: AdditionalConfigTagValue +{% endif %} HeadNode: InstanceType: {{ instance }} Networking: From ab6146bea0115f458473d6cf29b5689e46d9e6a6 Mon Sep 17 00:00:00 2001 From: hanwenli Date: Mon, 4 May 2026 09:03:56 -0700 Subject: [PATCH 2/2] [integ-tests] Remove force flag during update tags --- tests/integration-tests/tests/tags/test_tag_propagation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/integration-tests/tests/tags/test_tag_propagation.py b/tests/integration-tests/tests/tags/test_tag_propagation.py index 7e160d50f1..2a1b204557 100644 --- a/tests/integration-tests/tests/tags/test_tag_propagation.py +++ b/tests/integration-tests/tests/tags/test_tag_propagation.py @@ -26,6 +26,7 @@ get_shared_volume_tags, ) from time_utils import minutes, seconds +from utils import wait_for_computefleet_changed from tests.common.schedulers_common import SlurmCommands from tests.common.utils import get_installed_parallelcluster_version @@ -53,10 +54,10 @@ def test_tag_propagation(pcluster_config_reader, clusters_factory, scheduler, os _check_tag_propagation(cluster, scheduler, os, volume_name) cluster.stop() - + wait_for_computefleet_changed(cluster, "STOPPED") # Updates cluster with new configuration updated_cluster_config = pcluster_config_reader(config_file="pcluster.config.update.yaml", volume_name=volume_name) - cluster.update(str(updated_cluster_config), force_update="true") + cluster.update(str(updated_cluster_config)) cluster.start()