Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 16 additions & 24 deletions e2e/cse_timing.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,6 @@ import (
)

const (
// cseEventsDir is the directory where CSE task timing events are stored on the VM.
// This matches EVENTS_LOGGING_DIR defined in both cse_helpers.sh and cse_start.sh.
// Events are written directly here (not in per-handler subdirectories) — each file
// is a single-line JSON object named <epoch-ms>.json.
cseEventsDir = "/var/log/azure/Microsoft.Azure.Extensions.CustomScript/events/"
// provisionJSONPath is the path to the provision.json file with overall boot timing.
provisionJSONPath = "/var/log/azure/aks/provision.json"
)
Expand All @@ -33,23 +28,23 @@ type CSETaskTiming struct {

// CSEProvisionTiming represents the overall provisioning timing from provision.json.
type CSEProvisionTiming struct {
ExitCode string `json:"ExitCode"`
ExecDuration string `json:"ExecDuration"`
KernelStartTime string `json:"KernelStartTime"`
CloudInitLocalStart string `json:"CloudInitLocalStartTime"`
CloudInitStart string `json:"CloudInitStartTime"`
CloudFinalStart string `json:"CloudFinalStartTime"`
CSEStartTime string `json:"CSEStartTime"`
GuestAgentStartTime string `json:"GuestAgentStartTime"`
SystemdSummary string `json:"SystemdSummary"`
BootDatapoints json.RawMessage `json:"BootDatapoints"`
ExitCode string `json:"ExitCode"`
ExecDuration string `json:"ExecDuration"`
KernelStartTime string `json:"KernelStartTime"`
CloudInitLocalStart string `json:"CloudInitLocalStartTime"`
CloudInitStart string `json:"CloudInitStartTime"`
CloudFinalStart string `json:"CloudFinalStartTime"`
CSEStartTime string `json:"CSEStartTime"`
GuestAgentStartTime string `json:"GuestAgentStartTime"`
SystemdSummary string `json:"SystemdSummary"`
BootDatapoints json.RawMessage `json:"BootDatapoints"`
}

// CSETimingReport holds all parsed timing data from a VM.
type CSETimingReport struct {
Tasks []CSETaskTiming
Provision *CSEProvisionTiming
taskIndex map[string]*CSETaskTiming
Tasks []CSETaskTiming
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not related to this PR, but we should probably have a formatting or lint rule to prevent whitespace-only changes in PRs.

Provision *CSEProvisionTiming
taskIndex map[string]*CSETaskTiming
}

// cseEventJSON matches the JSON structure written by logs_to_events() in cse_helpers.sh.
Expand Down Expand Up @@ -128,12 +123,9 @@ func ExtractCSETimings(ctx context.Context, s *Scenario) (*CSETimingReport, erro

// Read all event JSON files from the CSE events directory, explicitly
// appending a newline after each file so each JSON document is separated.
// Search both the primary events directory and any handler-version subdirectories,
// as the Guest Agent may move events between these locations.
listCmd := fmt.Sprintf(
"sudo find %s /var/log/azure/Microsoft.Azure.Extensions.CustomScript/ -name '*.json' -path '*/events/*' -exec sh -c 'cat \"$1\"; echo' _ {} \\; 2>/dev/null",
cseEventsDir,
)
// Search the CustomScript directory tree for any events/ subdirectories,
// as the Guest Agent may store events in handler-version subdirectories.
listCmd := "sudo find /var/log/azure/Microsoft.Azure.Extensions.CustomScript/ -name '*.json' -path '*/events/*' -exec sh -c 'cat \"$1\"; echo' _ {} \\; 2>/dev/null"
result, err := execScriptOnVm(ctx, s, s.Runtime.VM, listCmd)
Comment thread
r2k1 marked this conversation as resolved.
if err != nil {
return nil, fmt.Errorf("failed to read CSE events: %w", err)
Expand Down
24 changes: 2 additions & 22 deletions e2e/kube.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ func getClusterKubeClient(ctx context.Context, cluster *armcontainerservice.Mana
}, nil
}

func (k *Kubeclient) WaitUntilPodRunningWithRetry(ctx context.Context, namespace string, labelSelector string, fieldSelector string, maxRetries int) (*corev1.Pod, error) {
func (k *Kubeclient) WaitUntilPodRunning(ctx context.Context, namespace string, labelSelector string, fieldSelector string) (*corev1.Pod, error) {
defer toolkit.LogStepCtxf(ctx, "waiting for pod %s %s in %q namespace", labelSelector, fieldSelector, namespace)()
var pod *corev1.Pod

Expand All @@ -103,22 +103,6 @@ func (k *Kubeclient) WaitUntilPodRunningWithRetry(ctx context.Context, namespace
}
}

// Check for FailedCreatePodSandBox events
events, err := k.Typed.CoreV1().Events(pod.Namespace).List(ctx, metav1.ListOptions{FieldSelector: "involvedObject.name=" + pod.Name})
if err == nil {
for _, event := range events.Items {
if event.Reason == "FailedCreatePodSandBox" {
maxRetries--
sandboxErr := fmt.Errorf("pod %s has FailedCreatePodSandBox event: %s", pod.Name, event.Message)
if maxRetries <= 0 {
return false, sandboxErr
}
k.Typed.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{GracePeriodSeconds: to.Ptr(int64(0))})
return false, nil // Keep polling
}
}
}

switch pod.Status.Phase {
case corev1.PodFailed:
logPodDebugInfo(ctx, k, pod)
Expand All @@ -144,10 +128,6 @@ func (k *Kubeclient) WaitUntilPodRunningWithRetry(ctx context.Context, namespace
return pod, err
}

func (k *Kubeclient) WaitUntilPodRunning(ctx context.Context, namespace string, labelSelector string, fieldSelector string) (*corev1.Pod, error) {
return k.WaitUntilPodRunningWithRetry(ctx, namespace, labelSelector, fieldSelector, 0)
}

func (k *Kubeclient) WaitUntilNodeReady(ctx context.Context, t testing.TB, vmssName string) string {
defer toolkit.LogStepf(t, "waiting for node %s to be ready", vmssName)()
var lastNode *corev1.Node
Expand Down Expand Up @@ -201,7 +181,7 @@ func (k *Kubeclient) GetPodNetworkDebugPodForNode(ctx context.Context, kubeNodeN
if kubeNodeName == "" {
return nil, fmt.Errorf("kubeNodeName must not be empty")
}
return k.WaitUntilPodRunningWithRetry(ctx, defaultNamespace, fmt.Sprintf("app=%s", podNetworkDebugAppLabel), "spec.nodeName="+kubeNodeName, 3)
return k.WaitUntilPodRunning(ctx, defaultNamespace, fmt.Sprintf("app=%s", podNetworkDebugAppLabel), "spec.nodeName="+kubeNodeName)
Copy link
Copy Markdown
Contributor Author

@r2k1 r2k1 May 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The retry didn't work. It re-created the pod with the same name, which preserved the old events.
The retry then found those previous events and failed the test (when it was supposed to continue; k8s automatically retries pod creation in this case, so such events are transient and can be ignored).

}

func logPodDebugInfo(ctx context.Context, kube *Kubeclient, pod *corev1.Pod) {
Expand Down
Loading