diff --git a/agent/run_job.go b/agent/run_job.go index a820af0066..17b1e5c324 100644 --- a/agent/run_job.go +++ b/agent/run_job.go @@ -330,7 +330,7 @@ func (r *JobRunner) runJob(ctx context.Context) core.ProcessExit { // Run the process. This will block until it finishes. if err := r.process.Run(ctx); err != nil { // Send the error to job logs - _, _ = fmt.Fprintf(r.jobLogs, "Error running job: %s\n", err) + _, _ = fmt.Fprintf(r.jobLogs, "Error running job: %s\n", job.FormatJobError(err)) // The process did not run at all, so make sure it fails return core.ProcessExit{ diff --git a/internal/job/errors.go b/internal/job/errors.go new file mode 100644 index 0000000000..f306a31387 --- /dev/null +++ b/internal/job/errors.go @@ -0,0 +1,19 @@ +package job + +import ( + "context" + "errors" +) + +func FormatJobError(err error) string { + switch { + case err == nil: + return "" + case errors.Is(err, context.DeadlineExceeded): + return "job timed out" + case errors.Is(err, context.Canceled): + return "job cancelled" + default: + return err.Error() + } +} diff --git a/internal/job/errors_test.go b/internal/job/errors_test.go new file mode 100644 index 0000000000..6dfcdb94b4 --- /dev/null +++ b/internal/job/errors_test.go @@ -0,0 +1,58 @@ +package job + +import ( + "context" + "errors" + "fmt" + "testing" +) + +func TestFormatJobError(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + err error + want string + }{ + { + name: "nil error", + err: nil, + want: "", + }, + { + name: "context.Canceled", + err: context.Canceled, + want: "job cancelled", + }, + { + name: "context.DeadlineExceeded", + err: context.DeadlineExceeded, + want: "job timed out", + }, + { + name: "wrapped context.Canceled", + err: fmt.Errorf("running command: %w", context.Canceled), + want: "job cancelled", + }, + { + name: "wrapped context.DeadlineExceeded", + err: fmt.Errorf("running command: %w", context.DeadlineExceeded), + want: "job timed out", + }, + { + name: "non-context error passes through", + err: errors.New("kaboom"), + want: "kaboom", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + if got := FormatJobError(tt.err); got != tt.want { + t.Errorf("FormatJobError(%v) = %q, want %q", tt.err, got, tt.want) + } + }) + } +} diff --git a/internal/job/executor.go b/internal/job/executor.go index 85795c7c51..f19eea73d9 100644 --- a/internal/job/executor.go +++ b/internal/job/executor.go @@ -271,7 +271,7 @@ func (e *Executor) Run(ctx context.Context) (exitCode int) { // an error from the hook/job logic. These are both good to report but // shouldn't override each other in reporting. if commandErr != nil { - e.shell.Printf("user command error: %v", commandErr) + e.shell.Printf("user command error: %s", FormatJobError(commandErr)) span.RecordError(commandErr) } @@ -1140,7 +1140,7 @@ func (e *Executor) CommandPhase(ctx context.Context) (hookErr, commandErr error) return nil, commandErr default: - e.shell.Errorf("%s", commandErr) + e.shell.Errorf("%s", FormatJobError(commandErr)) // error is not an exit error, we don't want to return it return nil, nil diff --git a/kubernetes/repro_3384_test.go b/kubernetes/repro_3384_test.go new file mode 100644 index 0000000000..6cf3548076 --- /dev/null +++ b/kubernetes/repro_3384_test.go @@ -0,0 +1,91 @@ +//go:build !windows + +package kubernetes + +import ( + "context" + "errors" + "fmt" + "os" + "path/filepath" + "testing" + "time" + + "github.com/buildkite/agent/v3/internal/job" + "github.com/buildkite/agent/v3/logger" +) + +// TestRepro3384_K8sRunnerLeaksBareContextError reproduces the user-visible bug +// in #3384: when the agent context is cancelled (UI cancel or job timeout), +// kubernetes.Runner.Run returns ctx.Err() bare. The pre-fix line at +// agent/run_job.go was: +// +// fmt.Fprintf(r.jobLogs, "Error running job: %s\n", err) +// +// which produced `Error running job: context canceled` in the job log. This +// test drives a real Runner, cancels its context, and shows both the pre-fix +// and post-fix log lines. +func TestRepro3384_K8sRunnerLeaksBareContextError(t *testing.T) { + // Short dir name: macOS unix sockets cap at 104 bytes. + tempDir, err := os.MkdirTemp("", "r3384") + if err != nil { + t.Fatalf("MkdirTemp: %v", err) + } + t.Cleanup(func() { _ = os.RemoveAll(tempDir) }) + + runner := NewRunner(logger.Discard, RunnerConfig{ + SocketPath: filepath.Join(tempDir, "bk.sock"), + ClientCount: 0, // startupCheck disabled with ClientStartTimeout==0 + }) + + ctx, cancel := context.WithCancel(context.Background()) + runErrCh := make(chan error, 1) + go func() { runErrCh <- runner.Run(ctx) }() + + // Wait for the socket to appear so we know Run is in its select loop. + deadline := time.After(10 * time.Second) + tick := time.Tick(time.Millisecond) +loop: + for { + select { + case <-tick: + if _, err := os.Lstat(runner.conf.SocketPath); err == nil { + break loop + } + case <-deadline: + t.Fatal("runner socket never appeared") + } + } + + // Simulate the agent receiving a cancel (UI cancel / job timeout). + cancel() + + var runErr error + select { + case runErr = <-runErrCh: + case <-time.After(5 * time.Second): + t.Fatal("runner.Run did not return after ctx cancel") + } + + // Confirm the bare context error. + if !errors.Is(runErr, context.Canceled) { + t.Fatalf("runner.Run returned %v, want context.Canceled", runErr) + } + if runErr.Error() != "context canceled" { + t.Fatalf("runErr.Error() = %q, want %q (Go's bare sentinel)", runErr.Error(), "context canceled") + } + + // Pre-fix: agent/run_job.go printed err directly. + preFix := fmt.Sprintf("Error running job: %s", runErr) + t.Logf("PRE-FIX job log line: %q", preFix) + if preFix != "Error running job: context canceled" { + t.Fatalf("pre-fix line = %q, want %q", preFix, "Error running job: context canceled") + } + + // Post-fix: routed through job.FormatJobError. + postFix := fmt.Sprintf("Error running job: %s", job.FormatJobError(runErr)) + t.Logf("POST-FIX job log line: %q", postFix) + if postFix != "Error running job: job cancelled" { + t.Fatalf("post-fix line = %q, want %q", postFix, "Error running job: job cancelled") + } +}