Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion agent/run_job.go
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ func (r *JobRunner) runJob(ctx context.Context) core.ProcessExit {
// Run the process. This will block until it finishes.
if err := r.process.Run(ctx); err != nil {
// Send the error to job logs
_, _ = fmt.Fprintf(r.jobLogs, "Error running job: %s\n", err)
_, _ = fmt.Fprintf(r.jobLogs, "Error running job: %s\n", job.FormatJobError(err))

// The process did not run at all, so make sure it fails
return core.ProcessExit{
Expand Down
19 changes: 19 additions & 0 deletions internal/job/errors.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package job

import (
"context"
"errors"
)

func FormatJobError(err error) string {
switch {
case err == nil:
return ""
case errors.Is(err, context.DeadlineExceeded):
return "job timed out"
case errors.Is(err, context.Canceled):
return "job cancelled"
default:
return err.Error()
}
}
58 changes: 58 additions & 0 deletions internal/job/errors_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
package job

import (
"context"
"errors"
"fmt"
"testing"
)

func TestFormatJobError(t *testing.T) {
t.Parallel()

tests := []struct {
name string
err error
want string
}{
{
name: "nil error",
err: nil,
want: "",
},
{
name: "context.Canceled",
err: context.Canceled,
want: "job cancelled",
},
{
name: "context.DeadlineExceeded",
err: context.DeadlineExceeded,
want: "job timed out",
},
{
name: "wrapped context.Canceled",
err: fmt.Errorf("running command: %w", context.Canceled),
want: "job cancelled",
},
{
name: "wrapped context.DeadlineExceeded",
err: fmt.Errorf("running command: %w", context.DeadlineExceeded),
want: "job timed out",
},
{
name: "non-context error passes through",
err: errors.New("kaboom"),
want: "kaboom",
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
t.Parallel()
if got := FormatJobError(tt.err); got != tt.want {
t.Errorf("FormatJobError(%v) = %q, want %q", tt.err, got, tt.want)
}
})
}
}
4 changes: 2 additions & 2 deletions internal/job/executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ func (e *Executor) Run(ctx context.Context) (exitCode int) {
// an error from the hook/job logic. These are both good to report but
// shouldn't override each other in reporting.
if commandErr != nil {
e.shell.Printf("user command error: %v", commandErr)
e.shell.Printf("user command error: %s", FormatJobError(commandErr))
span.RecordError(commandErr)
}

Expand Down Expand Up @@ -1140,7 +1140,7 @@ func (e *Executor) CommandPhase(ctx context.Context) (hookErr, commandErr error)
return nil, commandErr

default:
e.shell.Errorf("%s", commandErr)
e.shell.Errorf("%s", FormatJobError(commandErr))

// error is not an exit error, we don't want to return it
return nil, nil
Expand Down
91 changes: 91 additions & 0 deletions kubernetes/repro_3384_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
//go:build !windows

package kubernetes

import (
"context"
"errors"
"fmt"
"os"
"path/filepath"
"testing"
"time"

"github.com/buildkite/agent/v3/internal/job"
"github.com/buildkite/agent/v3/logger"
)

// TestRepro3384_K8sRunnerLeaksBareContextError reproduces the user-visible bug
// in #3384: when the agent context is cancelled (UI cancel or job timeout),
// kubernetes.Runner.Run returns ctx.Err() bare. The pre-fix line at
// agent/run_job.go was:
//
// fmt.Fprintf(r.jobLogs, "Error running job: %s\n", err)
//
// which produced `Error running job: context canceled` in the job log. This
// test drives a real Runner, cancels its context, and shows both the pre-fix
// and post-fix log lines.
func TestRepro3384_K8sRunnerLeaksBareContextError(t *testing.T) {
// Short dir name: macOS unix sockets cap at 104 bytes.
tempDir, err := os.MkdirTemp("", "r3384")
if err != nil {
t.Fatalf("MkdirTemp: %v", err)
}
t.Cleanup(func() { _ = os.RemoveAll(tempDir) })

runner := NewRunner(logger.Discard, RunnerConfig{
SocketPath: filepath.Join(tempDir, "bk.sock"),
ClientCount: 0, // startupCheck disabled with ClientStartTimeout==0
})

ctx, cancel := context.WithCancel(context.Background())
runErrCh := make(chan error, 1)
go func() { runErrCh <- runner.Run(ctx) }()

// Wait for the socket to appear so we know Run is in its select loop.
deadline := time.After(10 * time.Second)
tick := time.Tick(time.Millisecond)
loop:
for {
select {
case <-tick:
if _, err := os.Lstat(runner.conf.SocketPath); err == nil {
break loop
}
case <-deadline:
t.Fatal("runner socket never appeared")
}
}

// Simulate the agent receiving a cancel (UI cancel / job timeout).
cancel()

var runErr error
select {
case runErr = <-runErrCh:
case <-time.After(5 * time.Second):
t.Fatal("runner.Run did not return after ctx cancel")
}

// Confirm the bare context error.
if !errors.Is(runErr, context.Canceled) {
t.Fatalf("runner.Run returned %v, want context.Canceled", runErr)
}
if runErr.Error() != "context canceled" {
t.Fatalf("runErr.Error() = %q, want %q (Go's bare sentinel)", runErr.Error(), "context canceled")
}

// Pre-fix: agent/run_job.go printed err directly.
preFix := fmt.Sprintf("Error running job: %s", runErr)
t.Logf("PRE-FIX job log line: %q", preFix)
if preFix != "Error running job: context canceled" {
t.Fatalf("pre-fix line = %q, want %q", preFix, "Error running job: context canceled")
}

// Post-fix: routed through job.FormatJobError.
postFix := fmt.Sprintf("Error running job: %s", job.FormatJobError(runErr))
t.Logf("POST-FIX job log line: %q", postFix)
if postFix != "Error running job: job cancelled" {
t.Fatalf("post-fix line = %q, want %q", postFix, "Error running job: job cancelled")
}
}