diff --git a/docs/BACKLOG/open.md b/docs/BACKLOG/open.md index 8772c62..2ad528c 100644 --- a/docs/BACKLOG/open.md +++ b/docs/BACKLOG/open.md @@ -347,3 +347,86 @@ Checks: symbols) and confirm the filtered finding count = 0 --- + +## 56. Fix the `--rework-pr` workflow in `solve_issues.py` + +Labels: `kind/bug`, `theme/solver`, `area/cost-cap`, `priority/2` + +Priority: `2` + +The `--rework-pr` workflow in `solve_issues.py` is broken for +follow-up-edits. Measured across this session (2026-06-24/25): + +- **Run 1** (PR #425, `opencode/deepseek-v4-flash-free` via + `--model opencode`): OpenRouter 400 Bad Request. opencode-serve + 1.14.28 rejects the slug in this code path (the same model + worked in the normal solve path for the same PR's first push). +- **Run 2** (PR #425, `mistral/mistral-large-latest` via + `--model openrouter_direct`): same 400. Not the model โ€” the + request shape. +- **Run 3** (PR #425, `openai/gpt-4o-mini` via + `--model openrouter_direct`): no 400, model produced valid + patch JSON, but output truncated mid-JSON by the 4096-token + output cap. Worker reports `status: no_patches`. +- **Run 4** (PR #425, `openai/gpt-4o` via + `--model openrouter_direct`): no 400, full patch JSON produced, + but the worker-patcher (likely `git apply`) rejected it 25x with + "Keine gueltigen Patches in der Eingabe". Most likely cause: + the model wrote a diff against the pre-PR-433 code, but the + worker-patcher applies against the current branch tip (which + already has PR #433's commits). Mismatch. +- **Run 5** (PR #437, `openai/gpt-4o` via + `--model openrouter_direct`): same `patches_failed` / + `worker_exit_code: 3` pattern. Confirms the failure is not + PR-specific. + +`/tmp/compare_temperatures.py` (still on disk in this session) +can reproduce the 400 / no_patches / patches_failed modes +deterministically. + +**Most likely root cause for Run 4/5 (the patches_failed case):** +the rework-pr prompt gives the model the PR diff as "what to fix" +but does not anchor the model to "your diff must apply cleanly on +top of the current branch tip". The model writes a +rewrite-from-scratch diff and the patcher rejects it. Fix would +need to change the rework prompt in `solve_issues.py` (out of +scope for the ยง54 review_pr.py work). + +**Secondary root cause for Runs 1/2 (the OpenRouter 400):** +unknown โ€” both `deepseek-v4-flash-free` and `mistral-large-latest` +are listed as valid OpenRouter slugs. Could be a request-shape +mismatch in `_gh_api`-equivalent call inside `solve_issues.py`, +or could be a transient OpenRouter API issue. Needs a 3-run +reproduction in a controlled environment before any fix is +designed. + +Suggested scope: +- run a deterministic reproduction with the existing + `/tmp/compare_temperatures.py` against 3 different PRs to + characterise the failure surface +- read the rework-pr prompt construction in `solve_issues.py` + and identify the anchoring gap (model doesn't know about the + current branch tip's existing commits) +- design a fix that: + - explicitly passes the current branch tip SHA + the + already-existing commits on the PR branch to the model + - asks the model for an **incremental** diff against that + base, not a full rewrite + - verifies each produced patch with `git apply --check` + *before* the worker marks the run as successful +- add unit tests in `tests/test_solve_issues.py` for the + rework-pr prompt shape (mock at the openrouter-call boundary) + +Touches: `scripts/solve_issues.py`, `tests/test_solve_issues.py` + +Checks: +- `git diff --check` +- `python -m unittest tests.test_solve_issues -v` +- `python -m unittest discover -s tests` +- new unit test that exercises the rework-pr prompt shape with a + realistic PR-diff + current-branch-tip context +- 3-run reproduction against 3 different real PRs from this repo: + all 3 should now succeed or fail with a clear, actionable error + (not the current opaque `patches_failed` 25x retry loop) + +--- diff --git a/prompts/rework_pr.md b/prompts/rework_pr.md index 026b7d6..51ba8bd 100644 --- a/prompts/rework_pr.md +++ b/prompts/rework_pr.md @@ -8,8 +8,19 @@ provided that must be addressed by making additional changes on the same branch. - **PR #{pr_number}** in `{owner}/{repo}` - **Base branch:** `{base_branch}` - **Head branch:** `{head_branch}` (the branch you will modify) +- **Current head commit:** `{head_sha}` - **Reviewers:** {reviewer_usernames} +## Current Branch State + +The PR branch already contains the following commits, newest first: + +{existing_commits_list} + +Your patch MUST be a minimal incremental diff that applies cleanly on top of +commit `{head_sha}`. Do not rewrite files from scratch against older PR or base +branch content. Reference files by their current post-`{head_sha}` content. + ## PR Diff (current state vs base) ``` @@ -29,6 +40,7 @@ The following review comments must be addressed: 3. Preserve the existing branch name โ€” push follow-up commits to `{head_branch}`. 4. If a reviewer comment is unclear, use your best judgment to resolve it. 5. After applying changes, verify the code is syntactically correct. +6. Return only an incremental patch for the current branch tip `{head_sha}`. ## Constraints diff --git a/scripts/validation/github_client.py b/scripts/validation/github_client.py index 8f4915e..e06c1f4 100644 --- a/scripts/validation/github_client.py +++ b/scripts/validation/github_client.py @@ -157,6 +157,18 @@ def get_pull_request(self, repo: str, number: int) -> PullRequestInfo | None: merge_commit_sha=(pr.get("merge_commit_sha") or None), ) + def get_pull_request_commits(self, repo: str, number: int) -> list[dict[str, Any]]: + """Return commit metadata for a pull request, newest commit last.""" + resp = self.session.get( + f"{self.BASE}/repos/{self.owner}/{repo}/pulls/{number}/commits", + params={"per_page": 100}, + ) + if resp.status_code == 404: + return [] + self._raise_for_status(resp, f"get PR commits: {repo}#{number}") + data = resp.json() + return data if isinstance(data, list) else [] + def get_ci_status(self, repo: str, ref: str) -> CiStatus: resp = self.session.get( f"{self.BASE}/repos/{self.owner}/{repo}/commits/{ref}/status", diff --git a/scripts/validation/rework.py b/scripts/validation/rework.py index 1bbc6fe..4876e15 100644 --- a/scripts/validation/rework.py +++ b/scripts/validation/rework.py @@ -33,6 +33,7 @@ _REPO_ROOT = Path(__file__).resolve().parents[2] REWORK_PROMPT_PATH = _REPO_ROOT / "prompts" / "rework_pr.md" REWORK_COMMIT_MESSAGE_PREFIX = "rework: apply review feedback" +DEFAULT_REWORK_MAX_TOKENS = 16384 def _load_rework_prompt_template() -> str: @@ -55,6 +56,8 @@ def _build_rework_prompt( head_branch: str, diff: str, review_threads: list[ReviewThread], + head_sha: str = "", + existing_commits_list: str = "", ) -> str: """Build the final rework prompt by filling in the template.""" reviewer_usernames = ", ".join( @@ -73,12 +76,44 @@ def _build_rework_prompt( repo=repo, base_branch=base_branch, head_branch=head_branch, + head_sha=head_sha or "(unknown)", + existing_commits_list=existing_commits_list or "(commit list unavailable)", reviewer_usernames=reviewer_usernames, diff=diff, review_threads=review_text, ) +def _format_pr_commits_for_prompt(commits: list[dict[str, Any]], *, limit: int = 12) -> str: + """Format PR commits for the rework prompt, newest first.""" + if not commits: + return "(no existing PR commits returned by GitHub)" + + lines: list[str] = [] + for item in reversed(commits[-limit:]): + sha = str(item.get("sha") or "")[:12] or "unknown" + commit = item.get("commit") if isinstance(item.get("commit"), dict) else {} + message = str(commit.get("message") or "").splitlines()[0].strip() + if not message: + message = "(no commit message)" + lines.append(f"- {sha} {message}") + if len(commits) > limit: + lines.append(f"- ... {len(commits) - limit} older commit(s) omitted") + return "\n".join(lines) + + +def _rework_max_tokens_from_env(default: int = DEFAULT_REWORK_MAX_TOKENS) -> int: + """Return the OpenRouter completion cap for PR rework runs.""" + raw = os.getenv("OPENROUTER_REWORK_MAX_TOKENS") + if not raw: + return default + try: + value = int(raw) + except ValueError: + return default + return value if value > 0 else default + + def _run_worker_via_subprocess( prompt: str, repo_dir: str, @@ -104,14 +139,14 @@ def _run_worker_via_subprocess( api_key=openrouter_key, model=model, request_timeout_seconds=timeout_seconds, - use_structured_output=True, + use_structured_output=False, ) try: response_text, usage = worker.generate_with_usage( - prompt=prompt, + prompt=worker.build_patch_prompt(prompt, structured=False), temperature=0.3, - max_tokens=8192, + max_tokens=_rework_max_tokens_from_env(), timeout=timeout_seconds, ) except Exception as exc: @@ -261,6 +296,12 @@ def run_pr_rework( except RuntimeError: review_threads = [] + try: + existing_commits = client.get_pull_request_commits(repo, pr_number) + except RuntimeError: + existing_commits = [] + existing_commits_list = _format_pr_commits_for_prompt(existing_commits) + # 3. Build prompt try: template = _load_rework_prompt_template() @@ -285,6 +326,8 @@ def run_pr_rework( head_branch=head_branch, diff=diff, review_threads=review_threads, + head_sha=pr_info.head_sha, + existing_commits_list=existing_commits_list, ) # Dry-run: print metadata and prompt, return early diff --git a/tests/test_validation/test_github_client.py b/tests/test_validation/test_github_client.py index ad9e0a8..3b5d463 100644 --- a/tests/test_validation/test_github_client.py +++ b/tests/test_validation/test_github_client.py @@ -111,6 +111,23 @@ def test_get_pull_request_returns_none_on_404(self): pr = self.client.get_pull_request("repo", 999) self.assertIsNone(pr) + def test_get_pull_request_commits_returns_list(self): + commits = [ + {"sha": "abc123", "commit": {"message": "first"}}, + {"sha": "def456", "commit": {"message": "second"}}, + ] + self.session.get.return_value = self._mock_response(200, commits) + + result = self.client.get_pull_request_commits("repo", 10) + + self.assertEqual(result, commits) + self.session.get.assert_called_once() + self.assertIn("/pulls/10/commits", self.session.get.call_args.args[0]) + + def test_get_pull_request_commits_returns_empty_on_404(self): + self.session.get.return_value = self._mock_response(404) + self.assertEqual(self.client.get_pull_request_commits("repo", 999), []) + def test_get_ci_status_returns_status(self): self.session.get.return_value = self._mock_response(200, { "state": "success", "total_count": 3, "statuses": [ diff --git a/tests/test_validation/test_rework.py b/tests/test_validation/test_rework.py index 6334b5c..b2bfda0 100644 --- a/tests/test_validation/test_rework.py +++ b/tests/test_validation/test_rework.py @@ -11,7 +11,10 @@ from validation.rework import ( _build_rework_prompt, + _format_pr_commits_for_prompt, _load_rework_prompt_template, + _rework_max_tokens_from_env, + _run_worker_via_subprocess, run_pr_rework, ) from validation.github_client import ( @@ -26,6 +29,8 @@ def setUp(self): self.template = ( "PR #{pr_number} in {owner}/{repo}\n" "Base: {base_branch}, Head: {head_branch}\n" + "Head SHA: {head_sha}\n" + "Commits:\n{existing_commits_list}\n" "Reviewers: {reviewer_usernames}\n" "DIFF:\n{diff}\n" "FEEDBACK:\n{review_threads}\n" @@ -45,6 +50,8 @@ def test_basic_prompt_build(self): head_branch="ai/fix-issue-1", diff="--- a/file.py\n+++ b/file.py\n@@ -1 +1 @@\n-foo\n+bar", review_threads=threads, + head_sha="abc123", + existing_commits_list="- abc123 latest fix", ) self.assertIn("PR #42", prompt) self.assertIn("test-owner/test-repo", prompt) @@ -54,6 +61,8 @@ def test_basic_prompt_build(self): self.assertIn("Add type hints", prompt) self.assertIn("main", prompt) self.assertIn("ai/fix-issue-1", prompt) + self.assertIn("abc123", prompt) + self.assertIn("latest fix", prompt) def test_empty_review_threads(self): prompt = _build_rework_prompt( @@ -91,6 +100,63 @@ def test_reviewer_usernames_deduplicated(self): self.assertEqual(reviewers_line.count("bob"), 1) +class ReworkCommitContextTests(unittest.TestCase): + def test_format_pr_commits_newest_first(self): + commits = [ + {"sha": "111111111111aaaa", "commit": {"message": "old commit\n\nbody"}}, + {"sha": "222222222222bbbb", "commit": {"message": "new commit"}}, + ] + + formatted = _format_pr_commits_for_prompt(commits) + + self.assertLess(formatted.index("222222222222"), formatted.index("111111111111")) + self.assertIn("new commit", formatted) + self.assertIn("old commit", formatted) + + def test_format_pr_commits_handles_empty_list(self): + self.assertIn("no existing PR commits", _format_pr_commits_for_prompt([])) + + +class ReworkWorkerInvocationTests(unittest.TestCase): + def test_rework_max_tokens_default_and_env_override(self): + with patch.dict("os.environ", {}, clear=True): + self.assertEqual(_rework_max_tokens_from_env(), 16384) + with patch.dict("os.environ", {"OPENROUTER_REWORK_MAX_TOKENS": "32768"}): + self.assertEqual(_rework_max_tokens_from_env(), 32768) + with patch.dict("os.environ", {"OPENROUTER_REWORK_MAX_TOKENS": "not-int"}): + self.assertEqual(_rework_max_tokens_from_env(), 16384) + + def test_run_worker_uses_plain_patch_prompt_without_response_format(self): + with patch("workers.openrouter_worker.OpenRouterWorker") as worker_cls: + worker = MagicMock() + worker.build_patch_prompt.return_value = "PATCH PROMPT" + worker.generate_with_usage.return_value = ( + '{"patches":[{"file_path":"x.py","diff":"--- a/x.py\\n+++ b/x.py\\n@@ -1 +1 @@\\n-a\\n+b\\n"}]}', + MagicMock(model="test-model", cost_usd=None, total_tokens=10), + ) + worker.extract_patches.return_value = ["--- a/x.py\n+++ b/x.py\n@@ -1 +1 @@\n-a\n+b\n"] + worker.apply_patches.return_value = [ + MagicMock(success=True, applied_file="x.py", error=None) + ] + worker_cls.return_value = worker + + returncode, output = _run_worker_via_subprocess( + prompt="Original prompt", + repo_dir="/tmp/repo", + model="openai/gpt-4o-mini", + openrouter_key="key", + ) + + self.assertEqual(returncode, 0) + worker_cls.assert_called_once() + self.assertFalse(worker_cls.call_args.kwargs["use_structured_output"]) + worker.build_patch_prompt.assert_called_once_with("Original prompt", structured=False) + worker.generate_with_usage.assert_called_once() + self.assertEqual(worker.generate_with_usage.call_args.kwargs["prompt"], "PATCH PROMPT") + self.assertEqual(worker.generate_with_usage.call_args.kwargs["max_tokens"], 16384) + self.assertIn("Applied patch", output) + + class LoadReworkPromptTemplateTests(unittest.TestCase): def test_template_file_exists(self): template_path = ROOT / "prompts" / "rework_pr.md" @@ -98,7 +164,14 @@ def test_template_file_exists(self): def test_template_contains_required_placeholders(self): template = _load_rework_prompt_template() - for placeholder in ("{pr_number}", "{diff}", "{review_threads}", "{reviewer_usernames}"): + for placeholder in ( + "{pr_number}", + "{diff}", + "{review_threads}", + "{reviewer_usernames}", + "{head_sha}", + "{existing_commits_list}", + ): self.assertIn(placeholder, template) @@ -113,9 +186,13 @@ def test_dry_run_returns_early_without_api_calls(self): state="open", merged=False, head_ref="ai/fix-issue-1", + head_sha="abc123", base_ref="main", html_url="https://github.com/o/r/pull/404", ) + mock_client.get_pull_request_commits.return_value = [ + {"sha": "abc123", "commit": {"message": "latest commit"}}, + ] result = run_pr_rework( owner="o", @@ -197,9 +274,13 @@ def test_missing_openrouter_key_triggers_config_error(self): state="open", merged=False, head_ref="fix", + head_sha="abc123", base_ref="main", html_url="", ) + mock_client.get_pull_request_commits.return_value = [ + {"sha": "abc123", "commit": {"message": "latest commit"}}, + ] result = run_pr_rework( owner="o", @@ -219,7 +300,8 @@ def test_placeholders_in_template(self): template_path = ROOT / "prompts" / "rework_pr.md" content = template_path.read_text(encoding="utf-8") required = ["pr_number", "owner", "repo", "base_branch", "head_branch", - "reviewer_usernames", "diff", "review_threads"] + "head_sha", "existing_commits_list", "reviewer_usernames", + "diff", "review_threads"] for field in required: self.assertIn(f"{{{field}}}", content, f"Missing placeholder {{{field}}} in prompt template")