Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions test/extended/node/CLAUDE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# OpenShift Node E2E Tests - Tribal Knowledge

## Core Principle

**ALWAYS use the utility functions in `node_utils.go`** instead of implementing your own. Read that file to discover available helpers.

## Key Functions & Context

### Node Selection

- **GetNodesByLabel** - Use this for getting a subset of the nodes. The labels must be carefully chosen.
- **GetControlPlaneNodes** - These are the master nodes or the control plane nodes. In most clusters it will return 3 of them.
- **GetPureWorkerNodes** - Use this to make sure that the node returned is not a control plane node.

### Node Command Execution

- **ExecOnNodeWithChroot** - Use this for all root command execution inside a debug container. It can change the state of the node, so use it with caution.
Comment thread
ngopalak-redhat marked this conversation as resolved.

### Kubelet Configuration & Lifecycle

- **GetKubeletConfigFromNode** - Use this to check if a kubelet configuration made at the API level has been applied to the node.
- **CleanupDropInAndRestartKubelet** - The kubelet supports a drop-in configuration directory. If you manually drop in a config, use this to clean up.
- **IsNodeInReadyState** - Use this to find out whether the node has completed its restart and is back in the Ready state.
- **WaitForNodeToBeReady** - If any kubelet config is applied, use this to wait for the node to reach a ready state.
- **RestartKubeletOnNode** - Use this when testing kubelet restarts, and also to recover from issues that are outside the scope of the test.

### MachineConfig Operations

- **WaitForMCP** - Use this after creating a new MachineConfig to wait for it to be applied. Nodes are updated in parallel, but the rollout can still take longer when multiple nodes are involved.

## Common Mistakes to Avoid

1. **Don't manually construct `oc debug` commands** - use `ExecOnNodeWithChroot()` or `ExecOnNodeWithNsenter()`

2. **Don't forget to handle SNO clusters** - use `GetPureWorkerNodes()` to filter out nodes with dual roles

3. **Don't skip context propagation** - always pass `ctx` to utility functions

4. **Don't forget cleanup** - use `defer` or `g.AfterEach` with `CleanupDropInAndRestartKubelet()`

5. **Don't ignore MCP rollouts** - after MachineConfig changes, use `WaitForMCP()` to ensure stability

6. **Don't assume swap operations work with chroot** - use `ExecOnNodeWithNsenter()` for swap commands

## Getting Help

- Read the function documentation in `node_utils.go`
- Look at existing tests in this directory for patterns
- Check testdata files in `testdata/node/` for config examples
- See `node_swap_cnv.go` for a complete example
37 changes: 37 additions & 0 deletions test/extended/node/claude-docs-check.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/bin/bash
# Check that every exported function in node_utils.go is mentioned in CLAUDE.md.
#
# Run from the repository root: both paths below are relative to it.
# Exit status: 0 when every exported function is mentioned, 1 when at least
# one is missing from CLAUDE.md or when either input file cannot be read.

set -e

NODE_UTILS="test/extended/node/node_utils.go"
CLAUDE_MD="test/extended/node/CLAUDE.md"

# Fail loudly when an input file is missing. Without this check a missing
# node_utils.go makes grep fail inside the pipeline below, but the pipeline's
# exit status is that of `sort`, so the function list would silently be empty
# and the script would report success.
for required in "$NODE_UTILS" "$CLAUDE_MD"; do
    if [ ! -r "$required" ]; then
        echo "error: cannot read $required (run this script from the repository root)" >&2
        exit 1
    fi
done

# Extract ONLY exported function names from node_utils.go (start with uppercase)
# Lowercase (unexported) helpers are intentionally not documented in CLAUDE.md
# Matches both standalone functions and receiver methods, including digits in names
UTILS_FUNCS=$(
    grep -E '^[[:space:]]*func([[:space:]]+\([^)]*\))?[[:space:]]+[A-Z][A-Za-z0-9_]*[[:space:]]*\(' "$NODE_UTILS" \
        | sed -E 's/^[[:space:]]*func([[:space:]]+\([^)]*\))?[[:space:]]+([A-Z][A-Za-z0-9_]*)[[:space:]]*\(.*/\2/' \
        | sort -u
)

# Read CLAUDE.md once for efficiency
CLAUDE_CONTENT=$(cat "$CLAUDE_MD")

# Check each function is mentioned in CLAUDE.md (word-boundary match to avoid false positives)
MISSING=()
for func in $UTILS_FUNCS; do
    # -F: literal match, -w: whole word only, -q: quiet (exit status only)
    if ! grep -Fqw -- "$func" <<< "$CLAUDE_CONTENT"; then
        MISSING+=(" - $func()")
    fi
done

if [ ${#MISSING[@]} -gt 0 ]; then
    echo "⚠️ Warning: node_utils.go functions not documented in CLAUDE.md:"
    printf '%s\n' "${MISSING[@]}"
    echo ""
    echo "Please update CLAUDE.md to document these utility functions."
    exit 1
fi

echo "✅ All node_utils.go functions are documented in CLAUDE.md"
6 changes: 3 additions & 3 deletions test/extended/node/node_sizing.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv
g.DeferCleanup(cleanupMCP)

g.By("Waiting for custom MachineConfigPool to be ready")
err = waitForMCP(ctx, mcClient, testMCPName, 5*time.Minute)
err = WaitForMCP(ctx, mcClient, testMCPName, 5*time.Minute)
o.Expect(err).NotTo(o.HaveOccurred(), "Custom MachineConfigPool should become ready")

verifyNodeSizingEnabledFile(oc, nodeName, "true")
Expand Down Expand Up @@ -193,7 +193,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv

// Wait for custom MCP to be ready after cleanup
g.By("Waiting for custom MCP to be ready after KubeletConfig deletion")
waitErr := waitForMCP(cleanupCtx, mcClient, testMCPName, 5*time.Minute)
waitErr := WaitForMCP(cleanupCtx, mcClient, testMCPName, 5*time.Minute)
if apierrors.IsNotFound(waitErr) {
// MachineConfigPool already deleted, nothing to wait for
} else if waitErr != nil {
Expand Down Expand Up @@ -229,7 +229,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv
}, 2*time.Minute, 10*time.Second).Should(o.BeTrue(), fmt.Sprintf("%s MCP should start updating", testMCPName))

g.By(fmt.Sprintf("Waiting for %s MCP to be ready with new configuration", testMCPName))
err = waitForMCP(ctx, mcClient, testMCPName, 15*time.Minute)
err = WaitForMCP(ctx, mcClient, testMCPName, 15*time.Minute)
o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("%s MCP should become ready with new configuration", testMCPName))

verifyNodeSizingEnabledFile(oc, nodeName, "false")
Expand Down
20 changes: 10 additions & 10 deletions test/extended/node/node_swap.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,16 +45,16 @@ var _ = g.Describe("[Jira:Node][sig-node] Node non-cnv swap configuration", func
// the kubelet will not use it for memory management, maintaining consistent behavior across the cluster.
g.It("should have correct default kubelet swap settings with worker nodes failSwapOn=false, control plane nodes failSwapOn=true, and both swapBehavior=NoSwap [OCP-86394]", ote.Informing(), func(ctx context.Context) {
g.By("Getting worker nodes")
allWorkerNodes, err := getNodesByLabel(ctx, oc, "node-role.kubernetes.io/worker")
allWorkerNodes, err := GetNodesByLabel(ctx, oc, "node-role.kubernetes.io/worker")
o.Expect(err).NotTo(o.HaveOccurred())
o.Expect(len(allWorkerNodes)).Should(o.BeNumerically(">", 0), "Expected at least one worker node")

// Filter out nodes that are also control plane (e.g., SNO)
workerNodes := getPureWorkerNodes(allWorkerNodes)
workerNodes := GetPureWorkerNodes(allWorkerNodes)

g.By("Validating kubelet configuration on each worker node")
for _, node := range workerNodes {
config, err := getKubeletConfigFromNode(ctx, oc, node.Name)
config, err := GetKubeletConfigFromNode(ctx, oc, node.Name)
o.Expect(err).NotTo(o.HaveOccurred(), "Failed to get kubelet config for worker node %s", node.Name)

g.By(fmt.Sprintf("Checking failSwapOn=false on worker node %s", node.Name))
Expand All @@ -74,13 +74,13 @@ var _ = g.Describe("[Jira:Node][sig-node] Node non-cnv swap configuration", func

if *controlPlaneTopology != configv1.ExternalTopologyMode {
g.By("Getting control plane nodes")
controlPlaneNodes, err := getControlPlaneNodes(ctx, oc)
controlPlaneNodes, err := GetControlPlaneNodes(ctx, oc)
o.Expect(err).NotTo(o.HaveOccurred())
o.Expect(len(controlPlaneNodes)).Should(o.BeNumerically(">", 0), "Expected at least one control plane node")

g.By("Validating kubelet configuration on each control plane node")
for _, node := range controlPlaneNodes {
config, err := getKubeletConfigFromNode(ctx, oc, node.Name)
config, err := GetKubeletConfigFromNode(ctx, oc, node.Name)
o.Expect(err).NotTo(o.HaveOccurred(), "Failed to get kubelet config for control plane node %s", node.Name)

g.By(fmt.Sprintf("Checking failSwapOn=true on control plane node %s", node.Name))
Expand Down Expand Up @@ -113,7 +113,7 @@ var _ = g.Describe("[Jira:Node][sig-node] Node non-cnv swap configuration", func

g.By("Getting initial machine config resourceVersion")
// Get the initial resourceVersion of the worker machine config before creating KubeletConfig
workerGeneratedKubeletMC, err := getWorkerGeneratedKubeletMC(ctx, mcClient)
workerGeneratedKubeletMC, err := GetWorkerGeneratedKubeletMC(ctx, mcClient)
o.Expect(err).NotTo(o.HaveOccurred(), "Failed to find worker-generated-kubelet MachineConfig")
initialResourceVersion := workerGeneratedKubeletMC.ResourceVersion
framework.Logf("Initial %s resourceVersion: %s", workerGeneratedKubeletMC.Name, initialResourceVersion)
Expand Down Expand Up @@ -183,21 +183,21 @@ var _ = g.Describe("[Jira:Node][sig-node] Node non-cnv swap configuration", func
time.Sleep(5 * time.Second)

// Check if the machine config was created or updated (compare to initial resourceVersion captured earlier)
workerMCAfter, err := getWorkerGeneratedKubeletMC(ctx, mcClient)
workerMCAfter, err := GetWorkerGeneratedKubeletMC(ctx, mcClient)
o.Expect(err).NotTo(o.HaveOccurred(), "Failed to find worker-generated-kubelet MachineConfig for verification")
o.Expect(workerMCAfter.ResourceVersion).To(o.Equal(initialResourceVersion), "Machine config %s should not be updated when failSwapOn is rejected", workerMCAfter.Name)
framework.Logf("Verified: %s was not updated (resourceVersion: %s)", workerMCAfter.Name, workerMCAfter.ResourceVersion)

g.By("Verifying worker nodes still have correct swap settings")
allWorkerNodes, err := getNodesByLabel(ctx, oc, "node-role.kubernetes.io/worker")
allWorkerNodes, err := GetNodesByLabel(ctx, oc, "node-role.kubernetes.io/worker")
o.Expect(err).NotTo(o.HaveOccurred())
o.Expect(len(allWorkerNodes)).Should(o.BeNumerically(">", 0), "Expected at least one worker node")

// Filter out nodes that are also control plane (e.g., SNO)
workerNodes := getPureWorkerNodes(allWorkerNodes)
workerNodes := GetPureWorkerNodes(allWorkerNodes)

for _, node := range workerNodes {
config, err := getKubeletConfigFromNode(ctx, oc, node.Name)
config, err := GetKubeletConfigFromNode(ctx, oc, node.Name)
o.Expect(err).NotTo(o.HaveOccurred(), "Failed to get kubelet config for worker node %s", node.Name)

g.By(fmt.Sprintf("Verifying failSwapOn=false remains unchanged on worker node %s", node.Name))
Expand Down
Loading