From 1f1417f408b20f83dec0fa7ad41fc7ee35694b19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Nowak?= Date: Thu, 11 Sep 2025 16:30:40 +0200 Subject: [PATCH] chore(report): add PR diff report generator This commit adds a Python script that generates much more verbose PR Diff. --- .gitignore | 5 +- diff_report.py | 116 ++++++++++++++++++++++++++++++++++++++ gh-compr | 36 +----------- main.py | 27 +++++++++ pr_diff.py | 133 ++++++++++++++++++++++++++++++++++++++++++++ report_formatter.py | 116 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 398 insertions(+), 35 deletions(-) create mode 100644 diff_report.py create mode 100644 main.py create mode 100644 pr_diff.py create mode 100644 report_formatter.py diff --git a/.gitignore b/.gitignore index 384b830..70a4883 100644 --- a/.gitignore +++ b/.gitignore @@ -23,4 +23,7 @@ temp/ [._]sw[a-p] # Script output -*.diff \ No newline at end of file +*.diff + +# Python pycache +/__pycache__ diff --git a/diff_report.py b/diff_report.py new file mode 100644 index 0000000..f65b010 --- /dev/null +++ b/diff_report.py @@ -0,0 +1,116 @@ +from typing import List, Optional, Dict, Any +from pr_diff import Pr + +class DiffReport: + """ + Compares two PrDiff objects and produces a structured report of differences. 
+ """ + + def __init__(self, pr1: Pr, pr2: Pr): + self.pr1 = pr1 + self.pr2 = pr2 + + # Final structured output + self.identical_files: List[str] = [] + self.different_files: Dict[str, List[Any]] = {} + self.generate() + + def generate(self): + """Generate the report data structure by comparing two PR diffs.""" + pr1_files = {f.file_path: f for f in self.pr1.pr_diff.file_diffs} + pr2_files = {f.file_path: f for f in self.pr2.pr_diff.file_diffs} + + all_files = sorted(set(pr1_files.keys()) | set(pr2_files.keys())) + + for file_path in all_files: + file1 = pr1_files.get(file_path) + file2 = pr2_files.get(file_path) + + if file1 and file2: + if file1.md5 == file2.md5: + self.identical_files.append(file_path) + else: + aligned = self._align_hunks(file1.hunks, file2.hunks) + self.different_files[file_path] = aligned + else: + aligned = [] + if file1 and not file2: + aligned = self._align_hunks(file1.hunks, []) + elif file2 and not file1: + aligned = self._align_hunks([], file2.hunks) + + self.different_files[file_path] = aligned + + def _align_hunks(self, hunks1: List, hunks2: List) -> List[Dict[str, Optional[str]]]: + """ + Align two lists of hunks based on md5 matching. + Returns a list of dicts with left/right hunks aligned. 
+ """ + aligned = [] + h1_index, h2_index = 0, 0 + total_h1, total_h2 = len(hunks1), len(hunks2) + + while h1_index < total_h1 or h2_index < total_h2: + current_h1 = hunks1[h1_index] if h1_index < total_h1 else None + current_h2 = hunks2[h2_index] if h2_index < total_h2 else None + + # CASE 1: One list is exhausted → treat remaining as extras + if current_h1 is not None and current_h2 is None: + aligned.append({"left": current_h1, "right": None}) + h1_index += 1 + + elif current_h1 is None and current_h2 is not None: + aligned.append({"left": None, "right": current_h2}) + h2_index += 1 + + # CASE 2: Both lists have hunks + else: + # CASE 2A: Direct match + if current_h1.md5 == current_h2.md5: + aligned.append({"left": current_h1, "right": current_h2}) + h1_index += 1 + h2_index += 1 + + # CASE 2B: Look ahead in the right side to find match for current_h1 + else: + found_h2_match_index = None + lookahead_index = h2_index + 1 + + # Scan the rest of the hunks2 list + while found_h2_match_index is None and lookahead_index < total_h2: + if hunks2[lookahead_index].md5 == current_h1.md5: + found_h2_match_index = lookahead_index + else: + lookahead_index += 1 + + if found_h2_match_index is not None: + # Extra right hunks before the match + for extra_h2 in hunks2[h2_index:found_h2_match_index]: + aligned.append({"left": None, "right": extra_h2}) + + # Match found + aligned.append({ + "left": current_h1, + "right": hunks2[found_h2_match_index] + }) + + # Update both indexes + h1_index += 1 + h2_index = found_h2_match_index + 1 + + # CASE 2C: No match found at all → extra left hunk + else: + aligned.append({"left": current_h1, "right": None}) + h1_index += 1 + + return aligned + + + def to_dict(self) -> Dict[str, Any]: + """Return the final structured report as a dictionary.""" + return { + "left_pr": self.pr1, + "right_pr": self.pr2, + "identical_files": self.identical_files, + "different_files": self.different_files, + } diff --git a/gh-compr b/gh-compr index 
8ff773d..c0e9ef7 100755 --- a/gh-compr +++ b/gh-compr @@ -21,7 +21,7 @@ tag_diff() { local diff=$1 local prNumber=$2 local taggedDiff - + taggedDiff=$(echo -e "$diff" | sed -E "/^(\+\+\+|\-\-\-)/ s|$| # [PR: ${prNumber}]|") echo "$taggedDiff" @@ -52,39 +52,7 @@ else read -rp "Enter the second PR URL: " prUrl2 fi -output1=$(validate_url_extract_info "$prUrl1") -status=$? -if [ $status -ne 0 ]; then - echo "$output1" - exit 1 -fi -output2=$(validate_url_extract_info "$prUrl2") -status=$? -if [ $status -ne 0 ]; then - echo "$output2" - exit 1 -fi - -read -r owner1 repo1 pr1Number <<< "$output1" -read -r owner2 repo2 pr2Number <<< "$output2" - -# Fetch PR diffs using gh CLI -pr1Diff=$(gh pr diff "$pr1Number" -R "$owner1/$repo1") -pr2Diff=$(gh pr diff "$pr2Number" -R "$owner2/$repo2") - -# Remove context lines from the diffs -# it would be nice if https://cli.github.com/manual/gh_pr_diff -# had an option to set the number of context lines so this step -# wouldn't be necessary -pr1NoCtxDiff=$(echo "$pr1Diff" | grep -v '^[^+-]') -pr2NoCtxDiff=$(echo "$pr2Diff" | grep -v '^[^+-]') - -# Tag headers to make sure filenames always exist in the final diff -pr1TaggedDiff=$(tag_diff "$pr1NoCtxDiff" "$pr1Number") -pr2TaggedDiff=$(tag_diff "$pr2NoCtxDiff" "$pr2Number") - -# Generate diff and save to file -diff_output=$(diff -u0 <(echo "$pr1TaggedDiff") <(echo "$pr2TaggedDiff") || true) +diff_output=$(python main.py "$prUrl1" "$prUrl2") if [ -n "$output" ]; then echo "$diff_output" > "$output" echo "Diff saved to $output" diff --git a/main.py b/main.py new file mode 100644 index 0000000..791ad3f --- /dev/null +++ b/main.py @@ -0,0 +1,27 @@ +from typing import List +import re +import subprocess +import sys +from pr_diff import Pr, PrDiff, FileDiff, Hunk +from diff_report import DiffReport +from report_formatter import format_pr_diff_report_markdown + + +def process_input(): + url1 = sys.argv[1] + url2 = sys.argv[2] + return url1, url2 + + +def main(): + url1, url2 = process_input() + 
pr1 = Pr(url1) + pr2 = Pr(url2) + diff_report = DiffReport(pr1, pr2) + + output = format_pr_diff_report_markdown(diff_report.to_dict()) + print(output) + + +if __name__ == "__main__": + main() diff --git a/pr_diff.py b/pr_diff.py new file mode 100644 index 0000000..cb96ec0 --- /dev/null +++ b/pr_diff.py @@ -0,0 +1,133 @@ +import subprocess +import re +import hashlib +from typing import List + +def compute_md5(data: str) -> str: + return hashlib.md5(data.encode('utf-8')).hexdigest() + +class Hunk: + def __init__(self, text: str): + self.text = text.strip() + self.md5 = self._compute_md5(self.text) + + @staticmethod + def _compute_md5(data: str) -> str: + return hashlib.md5(data.encode('utf-8')).hexdigest() + + def __repr__(self): + # An empty repr made pretty_print() emit only whitespace; identify the hunk by its md5. + return f"Hunk(md5={self.md5})" + + def pretty_print(self): + print("\t\t", self) + + +class FileDiff: + """Represents the diff for a single file, containing multiple hunks.""" + + def __init__(self, file_path: str, diff_text: str): + self.file_path = file_path + self.diff_text = diff_text.strip() + self.md5 = compute_md5(self.diff_text) + self.hunks = self._parse_hunks(self.diff_text) + + def _parse_hunks(self, text: str) -> List[Hunk]: + """Extract all hunks from the file diff.""" + parts = re.split(r'(?=^@@ )', text, flags=re.MULTILINE) + return [Hunk(part) for part in parts if part.strip().startswith('@@')] + + def __repr__(self): + # An empty repr made pretty_print() emit only whitespace; show path, md5 and hunk count. + return f"FileDiff(file_path={self.file_path!r}, md5={self.md5}, hunks={len(self.hunks)})" + + def pretty_print(self): + print("\t", self) + for hunk in self.hunks: + hunk.pretty_print() + + +class PrDiff: + """Represents the entire PR diff, containing multiple file diffs.""" + + def __init__(self, diff_text: str): + self.diff_text = diff_text.strip() + self.md5 = compute_md5(self.diff_text) + self.file_diffs = self._parse_chunks(self.diff_text) + + + def _parse_chunks(self, diff_text: str) -> List[FileDiff]: + """Split the PR diff into individual file diffs.""" + # Split on "diff --git" lines + raw_chunks = re.split(r'(?=^diff --git)', diff_text, flags=re.MULTILINE) + raw_chunks = [chunk.strip() for 
chunk in raw_chunks if chunk.strip()] + + file_diffs = [] + for chunk in raw_chunks: + lines = chunk.splitlines() + + # Extract file path from the first line + # Format: diff --git a/path/to/file b/path/to/file + match = re.match(r'^diff --git a/(.+?) b/\1$', lines[0]) + if match: + file_path = match.group(1) + else: + # Fallback if exact match fails + file_path = lines[0].split()[2][2:] + + file_diffs.append(FileDiff(file_path, chunk)) + + return file_diffs + + def __repr__(self): + # An empty repr made pretty_print() emit a blank line; show md5 and file count instead. + return f"PrDiff(md5={self.md5}, files={len(self.file_diffs)})" + + def pretty_print(self): + """Print this PR diff followed by every file diff it contains.""" + print(self) + # __init__ stores the parsed files in self.file_diffs; self.chunks was never set (AttributeError). + for file_diff in self.file_diffs: + file_diff.pretty_print() + +class Pr: + def __init__(self, url: str): + owner, repo, pull_number = self._validate_url_extract_info(url) + self.url = url + self.repo = repo + self.number = pull_number + + raw_pr_diff = self._download_pr_diff(url) + self.pr_diff = PrDiff(raw_pr_diff) + + def __repr__(self): + # An empty repr carried no information; identify the PR by number and URL. + return f"Pr(number={self.number}, url={self.url!r})" + + def to_markdown(self): + """Return a one-line Markdown summary: PR number, diff md5 and number of changed files.""" + # PrDiff exposes its parsed files as file_diffs; there is no chunks attribute. + return f"PR *#{self.number}*, diff md5: *{self.pr_diff.md5}*, files: *{len(self.pr_diff.file_diffs)}*" + + + @staticmethod + def _download_pr_diff(pr_url: str): + """Fetch the PR diff text with the gh CLI; raise RuntimeError if gh exits non-zero.""" + # Local import: the module-level imports do not include sys, so sys.stderr raised NameError here. + import sys + completed_process = subprocess.run(["gh", "pr", "diff", pr_url], capture_output=True, text=True) + if completed_process.returncode != 0: + print(completed_process.stderr, file=sys.stderr) + raise RuntimeError(f"Could not download pr diff from: {pr_url}") + return completed_process.stdout + + @staticmethod + def _validate_url_extract_info(url: str) -> (str, str, int): + """ + Validate a GitHub pull request URL. + + :param url: The GitHub PR URL to validate. + :return: Tuple of (owner, repo, pull request number as int). + :raises ValueError: If the URL does not match the expected format. 
+ """ + # Repository names may contain dots and underscores as well as hyphens; the owner pattern is unchanged. + regex = r"^https://github\.com/([a-zA-Z0-9-]+)/([a-zA-Z0-9._-]+)/pull/([0-9]+)$" + match = re.match(regex, url) + + if not match: + raise ValueError( + f"PR URL '{url}' does not match the expected format: " + "'https://github.com/OWNER/REPO/pull/NUMBER'" + ) + + owner, repo, pull_number = match.groups() + return owner, repo, int(pull_number) + diff --git a/report_formatter.py b/report_formatter.py new file mode 100644 index 0000000..0ec9890 --- /dev/null +++ b/report_formatter.py @@ -0,0 +1,116 @@ +from typing import List, Dict, Optional + + +def format_pr_diff_report_markdown(report_data: Dict) -> str: + """ + Format a PR diff report into Markdown + HTML with: + - File-level summary at the top + - Collapsible sections for each file that differs + - A list of identical files + """ + lines = [] + + _add_header_section(lines, report_data) + + if report_data["different_files"].items(): + _add_pr_diff_summary_section(lines, report_data) + + if report_data["identical_files"]: + _add_identical_fiels_section(lines, report_data) + + return "\n".join(lines) + + +def _add_header_section(lines: List[str], report_data: Dict): + # --- File-level summary --- + total_files = len(report_data["identical_files"]) + len(report_data["different_files"]) + total_identical = len(report_data["identical_files"]) + total_different = len(report_data["different_files"].keys()) + + similarity_ratio = total_identical / total_files if total_files else 1.0 + + lines.append(f"# Similarity: {similarity_ratio * 100:.1f}%\n") + lines.append("Compared: ") + lines.append("- left: " + report_data["left_pr"].url) + lines.append("- right: " + report_data["right_pr"].url) + lines.append("") + lines.append(f"{_generate_emoji_chart(total_identical, total_different)}\n") + + lines.append("
") # separate sections + + +def _add_pr_diff_summary_section(lines: List[str], report_data: Dict): + total_files = len(report_data["identical_files"]) + len(report_data["different_files"]) + total_identical = len(report_data["identical_files"]) + total_different = len(report_data["different_files"].keys()) + + lines.append("
") + lines.append("

PR Diff Summary

\n") + lines.append("
    ") + lines.append(f"
  • Total files compared: {total_files}
  • ") + lines.append(f"
  • Files identical: {total_identical}
  • ") + lines.append(f"
  • Files with differences: {total_different}\n
  • ") + lines.append("
\n") + + left_pr = report_data["left_pr"] + right_pr = report_data["right_pr"] + + # --- Different files --- + for filename, aligned_hunks in report_data["different_files"].items(): + lines.append(f"
") + lines.append(f"📄 {filename}\n") + lines.append("") + lines.append("") + lines.append("") + lines.append(f"") + lines.append(f"") + lines.append("") + + for idx, pair in enumerate(aligned_hunks, start=1): + left_hunk = pair["left"] + right_hunk = pair["right"] + + if left_hunk and right_hunk and left_hunk.md5 == right_hunk.md5: + lines.append("") + lines.append(f"") + lines.append(f"") + lines.append("") + else: + left_text = _render_hunk(left_hunk) + right_text = _render_hunk(right_hunk) + lines.append("") + lines.append(f"") + lines.append(f"") + lines.append(f"") + lines.append("") + + lines.append("
#\n\n[PR #{left_pr.number}]({left_pr.url})\n\n\n\n[PR #{right_pr.number}]({right_pr.url})\n\n
{idx}Hunk is identical (md5: {left_hunk.md5})
{idx}\n{left_text}\n\n{right_text}\n
\n") + lines.append("
\n") # end collapsible section + + lines.append("
") # end Pr Diff summary + + +def _add_identical_fiels_section(lines: List[str], report_data: Dict): + lines.append("
") + lines.append("

Identical files

\n") + for file_path in report_data["identical_files"]: + lines.append(f"- `{file_path}`") + lines.append("
\n") + + +def _generate_emoji_chart(identical: int, different: int, total_segments: int = 10) -> str: + """Generate a simple emoji chart for similarity.""" + total_files = identical + different + filled = int(total_segments * identical / total_files) if total_files else total_segments + empty = total_segments - filled + return f"Similarity: {'🟩' * filled}{'🟥' * empty} ({identical}/{total_files} identical)" + + +def _render_hunk(hunk: Optional[object]) -> str: + """ + Render a single hunk into HTML-safe diff block. + """ + if hunk is None: + return "(no changes)" + + return f"\n```diff\n{hunk.text}\n```\n"