From 13f0b33ebc00ecb64f4b8b1fb7e9e6f827d50f98 Mon Sep 17 00:00:00 2001 From: Tai Dickerson Date: Mon, 15 Jun 2026 14:45:03 -0400 Subject: [PATCH] Retry CreateOnGithubJob on GitHub auth 401s Deployment/status creation surfaces transient Octokit::Unauthorized when a GitHub installation token is rejected or still propagating. CommitDeployment#create_on_github! only rescues NotFound/Forbidden, so the 401 escaped the job unhandled and reopened the Observe issue. Add retry_on Octokit::Unauthorized to CreateOnGithubJob with polynomially_longer backoff and attempts: 14 (~24h window). The window intentionally outlasts the 50m installation-token cache (GITHUB_TOKEN_RAILS_CACHE_LIFETIME in lib/shipit/github_app.rb) so a stale cached token can refresh before we give up. On exhaustion, log and do not re-raise, matching the existing NotFound/Forbidden give-up behavior. No token cache or client changes; we do not evict/remint the cached token to avoid a remint storm across workers. This aligns the retry shape with the validated approach from https://github.com/Shopify/github-certification/pull/1873. Fixes https://github.com/shop/issues/issues/8801 --- CHANGELOG.md | 4 +++ Gemfile.lock | 2 +- app/jobs/shipit/create_on_github_job.rb | 11 +++++++ lib/shipit/version.rb | 2 +- test/jobs/perform_task_job_test.rb | 6 ++-- test/jobs/shipit/create_on_github_job_test.rb | 29 +++++++++++++++++++ 6 files changed, 49 insertions(+), 5 deletions(-) create mode 100644 test/jobs/shipit/create_on_github_job_test.rb diff --git a/CHANGELOG.md b/CHANGELOG.md index c000d7124..208dd60c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Unreleased +# 0.45.3 +* Retry CreateOnGithubJob on transient GitHub authentication failures. +* Stabilize PerformTaskJob tests by stubbing the task execution strategy instead of Command#stream!. + # 0.45.2 * (bugfix) Fix 404 error when removing all permissions from an API client diff --git a/Gemfile.lock b/Gemfile.lock index bfa4428da..58676efb7 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,7 +1,7 @@ PATH remote: . specs: - shipit-engine (0.45.2) + shipit-engine (0.45.3) active_model_serializers (~> 0.9.3) ansi_stream (~> 0.0.6) autoprefixer-rails (~> 6.4.1) diff --git a/app/jobs/shipit/create_on_github_job.rb b/app/jobs/shipit/create_on_github_job.rb index 2d75616e6..e11ba4d22 100644 --- a/app/jobs/shipit/create_on_github_job.rb +++ b/app/jobs/shipit/create_on_github_job.rb @@ -7,6 +7,17 @@ class CreateOnGithubJob < BackgroundJob queue_as :default on_duplicate :drop + # Transient Octokit::Unauthorized = GitHub installation-token propagation lag. + # attempts: 14 (~24h) outlasts the 50m token cache (GITHUB_TOKEN_RAILS_CACHE_LIFETIME). + # No token eviction here to avoid a remint storm across workers. + retry_on Octokit::Unauthorized, wait: :polynomially_longer, attempts: 14 do |job, exception| + record = job.arguments.first + Rails.logger.warn( + "[CreateOnGithubJob] Giving up on #{record.class.name} #{record.id} " \ + "after GitHub authentication failures: #{exception.class} #{exception.message}" + ) + end + # We observe that some objects regularly take longer than the default 10 seconds to create, e.g. deployments self.timeout = 40 self.lock_timeout = 20 diff --git a/lib/shipit/version.rb b/lib/shipit/version.rb index b67b2d43b..fd6a7eb91 100644 --- a/lib/shipit/version.rb +++ b/lib/shipit/version.rb @@ -1,5 +1,5 @@ # frozen_string_literal: true module Shipit - VERSION = '0.45.2' + VERSION = '0.45.3' end diff --git a/test/jobs/perform_task_job_test.rb b/test/jobs/perform_task_job_test.rb index 45f418f07..3dd81ef94 100644 --- a/test/jobs/perform_task_job_test.rb +++ b/test/jobs/perform_task_job_test.rb @@ -107,7 +107,7 @@ def success? end test "mark deploy as error an unexpected exception is raised" do - Command.any_instance.expects(:stream!).at_least_once.raises(Command::Denied) + Shipit::TaskExecutionStrategy::Default.any_instance.expects(:capture!).at_least_once.raises(Command::Denied) @job.perform(@deploy) @@ -116,7 +116,7 @@ def success? end test "mark deploy as timedout if a command timeout" do - Command.any_instance.expects(:stream!).at_least_once.raises(Command::TimedOut) + Shipit::TaskExecutionStrategy::Default.any_instance.expects(:capture!).at_least_once.raises(Command::TimedOut) @job.perform(@deploy) @@ -129,7 +129,7 @@ def success? begin Shipit.timeout_exit_codes = [70].freeze - Command.any_instance.expects(:stream!).at_least_once.raises(Command::Failed.new('Blah', 70)) + Shipit::TaskExecutionStrategy::Default.any_instance.expects(:capture!).at_least_once.raises(Command::Failed.new('Blah', 70)) @job.perform(@deploy) diff --git a/test/jobs/shipit/create_on_github_job_test.rb b/test/jobs/shipit/create_on_github_job_test.rb new file mode 100644 index 000000000..907b409b8 --- /dev/null +++ b/test/jobs/shipit/create_on_github_job_test.rb @@ -0,0 +1,29 @@ +# frozen_string_literal: true + +require 'test_helper' + +module Shipit + class CreateOnGithubJobTest < ActiveSupport::TestCase + setup do + @deployment = shipit_commit_deployments(:shipit_pending_fourth) + end + + test "#perform retries on GitHub authentication errors" do + CommitDeployment.any_instance.stubs(:create_on_github!).raises(Octokit::Unauthorized) + + assert_enqueued_with(job: CreateOnGithubJob) do + CreateOnGithubJob.perform_now(@deployment) + end + end + + test "#perform gives up without re-raising after exhausting authentication retries" do + CommitDeployment.any_instance.stubs(:create_on_github!).raises(Octokit::Unauthorized) + Rails.logger.stubs(:warn) + + job = CreateOnGithubJob.new(@deployment) + job.exception_executions = { "[Octokit::Unauthorized]" => 13 } + + assert_nothing_raised { job.perform_now } + end + end +end