Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ CHANGELOG
- Replace cfn-hup in compute nodes with systemd timer to support in place updates in order to improve performance for tightly coupled worloads at scale.
This new mechanism relies on shared storage to sync updates between the head node and compute nodes.
- Disable `dnf-makecache.timer` to improve performance for tightly coupled worloads on RHEL/Rocky at scale.
- Support updates of `Tags` during cluster-updates.
- Support updates of `Tags` during cluster-updates except in AWS Top Secret and AWS Secret Regions.
- Add `LaunchTemplateOverrides` to cluster config to allow network interfaces to be customized by overriding the launch template of a compute resource.
- This overrides the parallelcluster default using a shallow merge.
- Add alarm on missing clustermgtd heartbeat.
Expand Down
26 changes: 26 additions & 0 deletions cli/src/pcluster/config/update_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import re
from enum import Enum

from pcluster.aws.common import get_region
from pcluster.config.cluster_config import QueueUpdateStrategy
from pcluster.config.update_policy_utils import SharedStorageChangeInfo
from pcluster.constants import (
Expand Down Expand Up @@ -149,6 +150,12 @@ def is_slurm_queues_change(change):
return any(path.startswith("SlurmQueues[") for path in change.path)


def _is_adc_region():
"""Return True if the currently configured AWS region belongs to an ADC partition."""
region = get_region() or ""
return region.startswith("us-iso")


def extract_type_and_name_from_path(path):
# Example path = 'SlurmQueues[slurm-q-name]'
# This function returns the type and name extracted like this: 'SlurmQueues', 'slurm-q-name'
Expand Down Expand Up @@ -730,6 +737,25 @@ def fail_reason_extra_chef_attributes(change, _) -> str:
+ ". If you need this change, please consider creating a new cluster instead of updating the existing one.",
)

# Update supported everywhere except ADC regions (us-iso*, us-isob*).
#
# In ADC regions there is a CloudFormation behavior where an UpdateStack call that includes
# both a Tags change and a resource whose change is only in Metadata does not update that
# resource. This breaks the head node update flow because cfn-hup on the head node polls
# DescribeStackResource for Metadata changes on HeadNodeLaunchTemplate and never sees them,
# leaving the HeadNodeWaitCondition to time out.
#
# Until the CloudFormation behavior is fixed in ADC, block tag updates in those regions.
UpdatePolicy.SUPPORTED_UNLESS_ADC = UpdatePolicy(
name="SUPPORTED_UNLESS_ADC",
level=1000,
fail_reason=lambda change, patch: (
f"Updating '{change.key}' during a cluster update is not supported in ADC regions."
),
action_needed=lambda change, patch: f"Restore '{change.key}' to its previous value.",
condition_checker=lambda change, patch: not _is_adc_region(),
)

# Block update if cluster has a managed Fsx for Lustre FileSystem, otherwise fallback to QueueUpdateStrategy
UpdatePolicy.MANAGED_FSX = UpdatePolicy(
name="MANAGED_FSX",
Expand Down
6 changes: 5 additions & 1 deletion cli/src/pcluster/schemas/cluster_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -1994,7 +1994,11 @@ class ClusterSchema(BaseSchema):

monitoring = fields.Nested(MonitoringSchema, metadata={"update_policy": UpdatePolicy.IGNORED})
additional_packages = fields.Nested(AdditionalPackagesSchema, metadata={"update_policy": UpdatePolicy.UNSUPPORTED})
tags = fields.Nested(TagSchema, many=True, metadata={"update_policy": UpdatePolicy.SUPPORTED, "update_key": "Key"})
tags = fields.Nested(
TagSchema,
many=True,
metadata={"update_policy": UpdatePolicy.SUPPORTED_UNLESS_ADC, "update_key": "Key"},
)
iam = fields.Nested(ClusterIamSchema, metadata={"update_policy": UpdatePolicy.IGNORED})
directory_service = fields.Nested(
DirectoryServiceSchema, metadata={"update_policy": UpdatePolicy.COMPUTE_AND_LOGIN_NODES_STOP}
Expand Down
2 changes: 1 addition & 1 deletion cli/src/pcluster/schemas/common_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ class TagSchema(BaseSchema):
value = fields.Str(
required=True,
validate=validate.Length(max=256),
metadata={"update_policy": UpdatePolicy.SUPPORTED},
metadata={"update_policy": UpdatePolicy.SUPPORTED_UNLESS_ADC},
)

@post_load
Expand Down
16 changes: 8 additions & 8 deletions cli/tests/pcluster/config/test_config_patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -1154,11 +1154,11 @@ def test_patch_check_cluster_resource_bucket(
"Tags",
None,
{"Key": "test3", "Value": "val3"},
UpdatePolicy.SUPPORTED,
UpdatePolicy.SUPPORTED_UNLESS_ADC,
is_list=True,
)
],
UpdatePolicy.SUPPORTED,
UpdatePolicy.SUPPORTED_UNLESS_ADC,
id="tag_addition",
),
pytest.param(
Expand All @@ -1170,11 +1170,11 @@ def test_patch_check_cluster_resource_bucket(
"Tags",
{"Key": "test2", "Value": "val2"},
None,
UpdatePolicy.SUPPORTED,
UpdatePolicy.SUPPORTED_UNLESS_ADC,
is_list=True,
)
],
UpdatePolicy.SUPPORTED,
UpdatePolicy.SUPPORTED_UNLESS_ADC,
id="tag_removal",
),
pytest.param(
Expand All @@ -1186,11 +1186,11 @@ def test_patch_check_cluster_resource_bucket(
"Value",
"old_value",
"new_value",
UpdatePolicy.SUPPORTED,
UpdatePolicy.SUPPORTED_UNLESS_ADC,
is_list=False,
)
],
UpdatePolicy.SUPPORTED,
UpdatePolicy.SUPPORTED_UNLESS_ADC,
id="tag_value_modification",
),
pytest.param(
Expand All @@ -1202,11 +1202,11 @@ def test_patch_check_cluster_resource_bucket(
"Tags",
None,
{"Key": "test3", "Value": "val3"},
UpdatePolicy.SUPPORTED,
UpdatePolicy.SUPPORTED_UNLESS_ADC,
is_list=True,
)
],
UpdatePolicy.SUPPORTED,
UpdatePolicy.SUPPORTED_UNLESS_ADC,
id="order_change_plus_addition",
),
pytest.param(
Expand Down
41 changes: 41 additions & 0 deletions cli/tests/pcluster/config/test_update_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -3085,3 +3085,44 @@ def test_head_node_local_storage_update_blocked(
if expected_action_needed:
assert_that(action_needed_messages).is_not_empty()
assert_that(action_needed_messages[0]).is_equal_to(expected_action_needed)


@pytest.mark.parametrize(
"region, expected_result",
[
pytest.param("us-east-1", True, id="commercial region allows tag update"),
pytest.param("eu-west-1", True, id="commercial EU region allows tag update"),
pytest.param("us-gov-west-1", True, id="GovCloud region allows tag update"),
pytest.param("cn-north-1", True, id="China region allows tag update"),
pytest.param("us-iso-east-1", False, id="us-iso region blocks tag update"),
pytest.param("us-iso-west-1", False, id="us-iso-west region blocks tag update"),
pytest.param("us-isob-east-1", False, id="us-isob region blocks tag update"),
pytest.param("", True, id="empty region falls back to allowing update"),
],
)
def test_supported_unless_adc_condition_checker(mocker, region, expected_result):
"""SUPPORTED_UNLESS_ADC blocks updates only in ADC partitions (us-iso*, us-isob*)."""
mocker.patch("pcluster.config.update_policy.get_region", return_value=region)

change_mock = mocker.MagicMock()
patch_mock = mocker.MagicMock()

assert_that(UpdatePolicy.SUPPORTED_UNLESS_ADC.condition_checker(change_mock, patch_mock)).is_equal_to(
expected_result
)


def test_supported_unless_adc_fail_reason_mentions_adc(mocker):
"""The fail reason for SUPPORTED_UNLESS_ADC explains the ADC limitation."""
mocker.patch("pcluster.config.update_policy.get_region", return_value="us-iso-east-1")

change_mock = mocker.MagicMock()
change_mock.key = "Tags"
patch_mock = mocker.MagicMock()

result, fail_reason, action_needed, _ = UpdatePolicy.SUPPORTED_UNLESS_ADC.check(change_mock, patch_mock)

assert_that(result).is_equal_to(UpdatePolicy.CheckResult.ACTION_NEEDED)
assert_that(fail_reason).contains("ADC")
assert_that(fail_reason).contains("Tags")
assert_that(action_needed).contains("Tags")
18 changes: 14 additions & 4 deletions tests/integration-tests/tests/tags/test_tag_propagation.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
get_shared_volume_tags,
)
from time_utils import minutes, seconds
from utils import wait_for_computefleet_changed

from tests.common.schedulers_common import SlurmCommands
from tests.common.utils import get_installed_parallelcluster_version
Expand Down Expand Up @@ -53,18 +54,27 @@ def test_tag_propagation(pcluster_config_reader, clusters_factory, scheduler, os
_check_tag_propagation(cluster, scheduler, os, volume_name)

cluster.stop()

wait_for_computefleet_changed(cluster, "STOPPED")
# Updates cluster with new configuration
updated_cluster_config = pcluster_config_reader(config_file="pcluster.config.update.yaml", volume_name=volume_name)
cluster.update(str(updated_cluster_config), force_update="true")
cluster.update(str(updated_cluster_config))

cluster.start()

# Makes sure that the compute nodes have started before checking tags
_wait_for_compute_fleet_start(cluster)

# Checks for tag propagation
_check_tag_propagation(cluster, scheduler, os, volume_name, add_additional_config_tags=True)
# Checks for tag propagation.
# In ADC regions, CloudFormation doesn't properly update stack tags during a cluster update (see the
# SUPPORTED_UNLESS_ADC update policy), so AdditionalConfigTag is intentionally
# not added to the updated configuration and we skip asserting it here.
_check_tag_propagation(
cluster,
scheduler,
os,
volume_name,
add_additional_config_tags="us-iso" not in cluster.region,
)

if scheduler == "slurm":
_test_queue_and_compute_resources_tags(cluster, pcluster_config_reader, scheduler, os, volume_name)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,25 @@
Image:
Os: {{ os }}
Tags:
{% if "us-iso" in region %}
# In ADC regions a CloudFormation bug causes HeadNodeLaunchTemplate to not be
# updated when UpdateStack includes a Tags change together with a metadata-only
# change on the resource, so we avoid adding a tag here (see SUPPORTED_UNLESS_ADC
# update policy). Tags must match pcluster.config.yaml exactly to avoid a diff.
- Key: ConfigFileTag
Value: ConfigFileTagValue
- Key: QueueOverrideTag
Value: ClusterLevelValue
- Key: ComputeOverrideTag
Value: ClusterLevelValue
{% else %}
- Key: ComputeOverrideTag
Value: ClusterLevelValue
- Key: ConfigFileTag
Value: ConfigFileTagValue
- Key: QueueOverrideTag
Value: ClusterLevelValue
{% endif %}
HeadNode:
InstanceType: {{ instance }}
Networking:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,18 @@
Image:
Os: {{ os }}
Tags:
{% if "us-iso" in region %}
# In ADC regions a CloudFormation bug causes HeadNodeLaunchTemplate to not be
# updated when UpdateStack includes a Tags change together with a metadata-only
# change on the resource, so we avoid adding a tag here (see SUPPORTED_UNLESS_ADC
# update policy). Tags must match pcluster.config.yaml exactly to avoid a diff.
- Key: ConfigFileTag
Value: ConfigFileTagValue
- Key: QueueOverrideTag
Value: ClusterLevelValue
- Key: ComputeOverrideTag
Value: ClusterLevelValue
{% else %}
- Key: ComputeOverrideTag
Value: ClusterLevelValue
- Key: ConfigFileTag
Expand All @@ -9,6 +21,7 @@ Tags:
Value: ClusterLevelValue
- Key: AdditionalConfigTag
Value: AdditionalConfigTagValue
{% endif %}
HeadNode:
InstanceType: {{ instance }}
Networking:
Expand Down
Loading