diff --git a/e2e/cse_timing.go b/e2e/cse_timing.go index 6f8e3e8a681..3592ff0d005 100644 --- a/e2e/cse_timing.go +++ b/e2e/cse_timing.go @@ -13,11 +13,6 @@ import ( ) const ( - // cseEventsDir is the directory where CSE task timing events are stored on the VM. - // This matches EVENTS_LOGGING_DIR defined in both cse_helpers.sh and cse_start.sh. - // Events are written directly here (not in per-handler subdirectories) — each file - // is a single-line JSON object named .json. - cseEventsDir = "/var/log/azure/Microsoft.Azure.Extensions.CustomScript/events/" // provisionJSONPath is the path to the provision.json file with overall boot timing. provisionJSONPath = "/var/log/azure/aks/provision.json" ) @@ -33,23 +28,23 @@ type CSETaskTiming struct { // CSEProvisionTiming represents the overall provisioning timing from provision.json. type CSEProvisionTiming struct { - ExitCode string `json:"ExitCode"` - ExecDuration string `json:"ExecDuration"` - KernelStartTime string `json:"KernelStartTime"` - CloudInitLocalStart string `json:"CloudInitLocalStartTime"` - CloudInitStart string `json:"CloudInitStartTime"` - CloudFinalStart string `json:"CloudFinalStartTime"` - CSEStartTime string `json:"CSEStartTime"` - GuestAgentStartTime string `json:"GuestAgentStartTime"` - SystemdSummary string `json:"SystemdSummary"` - BootDatapoints json.RawMessage `json:"BootDatapoints"` + ExitCode string `json:"ExitCode"` + ExecDuration string `json:"ExecDuration"` + KernelStartTime string `json:"KernelStartTime"` + CloudInitLocalStart string `json:"CloudInitLocalStartTime"` + CloudInitStart string `json:"CloudInitStartTime"` + CloudFinalStart string `json:"CloudFinalStartTime"` + CSEStartTime string `json:"CSEStartTime"` + GuestAgentStartTime string `json:"GuestAgentStartTime"` + SystemdSummary string `json:"SystemdSummary"` + BootDatapoints json.RawMessage `json:"BootDatapoints"` } // CSETimingReport holds all parsed timing data from a VM. type CSETimingReport struct { - Tasks []CSETaskTiming - Provision *CSEProvisionTiming - taskIndex map[string]*CSETaskTiming + Tasks []CSETaskTiming + Provision *CSEProvisionTiming + taskIndex map[string]*CSETaskTiming } // cseEventJSON matches the JSON structure written by logs_to_events() in cse_helpers.sh. @@ -128,12 +123,9 @@ func ExtractCSETimings(ctx context.Context, s *Scenario) (*CSETimingReport, erro // Read all event JSON files from the CSE events directory, explicitly // appending a newline after each file so each JSON document is separated. - // Search both the primary events directory and any handler-version subdirectories, - // as the Guest Agent may move events between these locations. - listCmd := fmt.Sprintf( - "sudo find %s /var/log/azure/Microsoft.Azure.Extensions.CustomScript/ -name '*.json' -path '*/events/*' -exec sh -c 'cat \"$1\"; echo' _ {} \\; 2>/dev/null", - cseEventsDir, - ) + // Search the CustomScript directory tree for any events/ subdirectories, + // as the Guest Agent may store events in handler-version subdirectories. + listCmd := "sudo find /var/log/azure/Microsoft.Azure.Extensions.CustomScript/ -name '*.json' -path '*/events/*' -exec sh -c 'cat \"$1\"; echo' _ {} \\; 2>/dev/null" result, err := execScriptOnVm(ctx, s, s.Runtime.VM, listCmd) if err != nil { return nil, fmt.Errorf("failed to read CSE events: %w", err) diff --git a/e2e/kube.go b/e2e/kube.go index b5f1fe18580..9650ab343bc 100644 --- a/e2e/kube.go +++ b/e2e/kube.go @@ -76,7 +76,7 @@ func getClusterKubeClient(ctx context.Context, cluster *armcontainerservice.Mana }, nil } -func (k *Kubeclient) WaitUntilPodRunningWithRetry(ctx context.Context, namespace string, labelSelector string, fieldSelector string, maxRetries int) (*corev1.Pod, error) { +func (k *Kubeclient) WaitUntilPodRunning(ctx context.Context, namespace string, labelSelector string, fieldSelector string) (*corev1.Pod, error) { defer toolkit.LogStepCtxf(ctx, "waiting for pod %s %s in %q namespace", labelSelector, fieldSelector, namespace)() var pod *corev1.Pod @@ -103,22 +103,6 @@ func (k *Kubeclient) WaitUntilPodRunningWithRetry(ctx context.Context, namespace } } - // Check for FailedCreatePodSandBox events - events, err := k.Typed.CoreV1().Events(pod.Namespace).List(ctx, metav1.ListOptions{FieldSelector: "involvedObject.name=" + pod.Name}) - if err == nil { - for _, event := range events.Items { - if event.Reason == "FailedCreatePodSandBox" { - maxRetries-- - sandboxErr := fmt.Errorf("pod %s has FailedCreatePodSandBox event: %s", pod.Name, event.Message) - if maxRetries <= 0 { - return false, sandboxErr - } - k.Typed.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{GracePeriodSeconds: to.Ptr(int64(0))}) - return false, nil // Keep polling - } - } - } - switch pod.Status.Phase { case corev1.PodFailed: logPodDebugInfo(ctx, k, pod) @@ -144,10 +128,6 @@ func (k *Kubeclient) WaitUntilPodRunningWithRetry(ctx context.Context, namespace return pod, err } -func (k *Kubeclient) WaitUntilPodRunning(ctx context.Context, namespace string, labelSelector string, fieldSelector string) (*corev1.Pod, error) { - return k.WaitUntilPodRunningWithRetry(ctx, namespace, labelSelector, fieldSelector, 0) -} - func (k *Kubeclient) WaitUntilNodeReady(ctx context.Context, t testing.TB, vmssName string) string { defer toolkit.LogStepf(t, "waiting for node %s to be ready", vmssName)() var lastNode *corev1.Node @@ -201,7 +181,7 @@ func (k *Kubeclient) GetPodNetworkDebugPodForNode(ctx context.Context, kubeNodeN if kubeNodeName == "" { return nil, fmt.Errorf("kubeNodeName must not be empty") } - return k.WaitUntilPodRunningWithRetry(ctx, defaultNamespace, fmt.Sprintf("app=%s", podNetworkDebugAppLabel), "spec.nodeName="+kubeNodeName, 3) + return k.WaitUntilPodRunning(ctx, defaultNamespace, fmt.Sprintf("app=%s", podNetworkDebugAppLabel), "spec.nodeName="+kubeNodeName) } func logPodDebugInfo(ctx context.Context, kube *Kubeclient, pod *corev1.Pod) {