From 1f1417f408b20f83dec0fa7ad41fc7ee35694b19 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miko=C5=82aj=20Nowak?= <mikolaj.nowak@konghq.com>
Date: Thu, 11 Sep 2025 16:30:40 +0200
Subject: [PATCH] chore(report): add PR diff report generator

This commit adds a Python script that generates
a much more verbose PR diff report.
---
 .gitignore          |   5 +-
 diff_report.py      | 116 ++++++++++++++++++++++++++++++++++++++
 gh-compr            |  36 +-----------
 main.py             |  27 +++++++++
 pr_diff.py          | 133 ++++++++++++++++++++++++++++++++++++++++++++
 report_formatter.py | 116 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 398 insertions(+), 35 deletions(-)
 create mode 100644 diff_report.py
 create mode 100644 main.py
 create mode 100644 pr_diff.py
 create mode 100644 report_formatter.py

diff --git a/.gitignore b/.gitignore
index 384b830..70a4883 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,4 +23,7 @@ temp/
 [._]sw[a-p]
 
 # Script output
-*.diff
\ No newline at end of file
+*.diff
+
+# Python pycache
+/__pycache__
diff --git a/diff_report.py b/diff_report.py
new file mode 100644
index 0000000..f65b010
--- /dev/null
+++ b/diff_report.py
@@ -0,0 +1,116 @@
+from typing import List, Optional, Dict, Any
+from pr_diff import Pr
+
+class DiffReport:
+    """
+    Compare the file diffs of two Pr objects (pr1 = left, pr2 = right) and build a structured report.
+    """
+
+    def __init__(self, pr1: Pr, pr2: Pr):
+        self.pr1 = pr1
+        self.pr2 = pr2
+
+        # Final structured output
+        self.identical_files: List[str] = []  # paths whose per-file diff md5 is equal in both PRs
+        self.different_files: Dict[str, List[Any]] = {}  # path -> aligned {"left", "right"} hunk pairs
+        self.generate()  # the report is computed eagerly at construction time
+
+    def generate(self):
+        """Fill identical_files/different_files file by file (appends to existing results; __init__ already calls it once)."""
+        pr1_files = {f.file_path: f for f in self.pr1.pr_diff.file_diffs}
+        pr2_files = {f.file_path: f for f in self.pr2.pr_diff.file_diffs}
+
+        all_files = sorted(set(pr1_files.keys()) | set(pr2_files.keys()))  # union, in sorted path order
+
+        for file_path in all_files:
+            file1 = pr1_files.get(file_path)  # None when pr1 does not touch this file
+            file2 = pr2_files.get(file_path)  # None when pr2 does not touch this file
+
+            if file1 and file2:  # file is touched by both PRs
+                if file1.md5 == file2.md5:
+                    self.identical_files.append(file_path)
+                else:
+                    aligned = self._align_hunks(file1.hunks, file2.hunks)
+                    self.different_files[file_path] = aligned
+            else:  # file is touched by only one PR; the other side is aligned against an empty list
+                aligned = []
+                if file1 and not file2:
+                    aligned = self._align_hunks(file1.hunks, [])
+                elif file2 and not file1:
+                    aligned = self._align_hunks([], file2.hunks)
+
+                self.different_files[file_path] = aligned
+
+    def _align_hunks(self, hunks1: List, hunks2: List) -> List[Dict[str, Optional[str]]]:
+        """
+        Align two hunk lists in order, pairing hunks whose md5 is equal (the look-ahead only scans hunks2).
+        Returns dicts with "left"/"right" keys holding a hunk or None (not str, despite the annotation).
+        """
+        aligned = []
+        h1_index, h2_index = 0, 0
+        total_h1, total_h2 = len(hunks1), len(hunks2)
+
+        while h1_index < total_h1 or h2_index < total_h2:
+            current_h1 = hunks1[h1_index] if h1_index < total_h1 else None
+            current_h2 = hunks2[h2_index] if h2_index < total_h2 else None
+
+            # CASE 1: One list is exhausted → treat remaining as extras
+            if current_h1 is not None and current_h2 is None:
+                aligned.append({"left": current_h1, "right": None})
+                h1_index += 1
+
+            elif current_h1 is None and current_h2 is not None:
+                aligned.append({"left": None, "right": current_h2})
+                h2_index += 1
+
+            # CASE 2: Both lists have hunks
+            else:
+                # CASE 2A: Direct match
+                if current_h1.md5 == current_h2.md5:
+                    aligned.append({"left": current_h1, "right": current_h2})
+                    h1_index += 1
+                    h2_index += 1
+
+                # CASE 2B: Look ahead in the right side to find match for current_h1
+                else:
+                    found_h2_match_index = None
+                    lookahead_index = h2_index + 1  # hunks2[h2_index] itself was already ruled out in CASE 2A
+
+                    # Scan the rest of the hunks2 list
+                    while found_h2_match_index is None and lookahead_index < total_h2:
+                        if hunks2[lookahead_index].md5 == current_h1.md5:
+                            found_h2_match_index = lookahead_index
+                        else:
+                            lookahead_index += 1
+
+                    if found_h2_match_index is not None:
+                        # Extra right hunks before the match
+                        for extra_h2 in hunks2[h2_index:found_h2_match_index]:
+                            aligned.append({"left": None, "right": extra_h2})
+
+                        # Match found
+                        aligned.append({
+                            "left": current_h1,
+                            "right": hunks2[found_h2_match_index]
+                        })
+
+                        # Update both indexes
+                        h1_index += 1
+                        h2_index = found_h2_match_index + 1  # resume right after the matched right hunk
+
+                    # CASE 2C: No match found at all → extra left hunk
+                    else:
+                        aligned.append({"left": current_h1, "right": None})
+                        h1_index += 1  # h2_index stays put, so current_h2 is compared with the next left hunk
+
+        return aligned
+
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return the report as a dict; "left_pr"/"right_pr" hold the Pr objects themselves."""
+        return {
+            "left_pr": self.pr1,
+            "right_pr": self.pr2,
+            "identical_files": self.identical_files,
+            "different_files": self.different_files,
+        }
diff --git a/gh-compr b/gh-compr
index 8ff773d..c0e9ef7 100755
--- a/gh-compr
+++ b/gh-compr
@@ -21,7 +21,7 @@ tag_diff() {
   local diff=$1
   local prNumber=$2
   local taggedDiff
-  
+
   taggedDiff=$(echo -e "$diff" | sed -E "/^(\+\+\+|\-\-\-)/ s|$|    # [PR: ${prNumber}]|")
 
   echo "$taggedDiff"
@@ -52,39 +52,7 @@ else
   read -rp "Enter the second PR URL: " prUrl2
 fi
 
-output1=$(validate_url_extract_info "$prUrl1")
-status=$?
-if [ $status -ne 0 ]; then
-  echo "$output1"
-  exit 1
-fi
-output2=$(validate_url_extract_info "$prUrl2")
-status=$?
-if [ $status -ne 0 ]; then
-  echo "$output2"
-  exit 1
-fi
-
-read -r owner1 repo1 pr1Number <<< "$output1"
-read -r owner2 repo2 pr2Number <<< "$output2"
-
-# Fetch PR diffs using gh CLI
-pr1Diff=$(gh pr diff "$pr1Number" -R "$owner1/$repo1")
-pr2Diff=$(gh pr diff "$pr2Number" -R "$owner2/$repo2")
-
-# Remove context lines from the diffs
-# it would be nice if https://cli.github.com/manual/gh_pr_diff
-# had an option to set the number of context lines so this step
-# wouldn't be necessary
-pr1NoCtxDiff=$(echo "$pr1Diff" | grep -v '^[^+-]')
-pr2NoCtxDiff=$(echo "$pr2Diff" | grep -v '^[^+-]')
-
-# Tag headers to make sure filenames always exist in the final diff
-pr1TaggedDiff=$(tag_diff "$pr1NoCtxDiff" "$pr1Number")
-pr2TaggedDiff=$(tag_diff "$pr2NoCtxDiff" "$pr2Number")
-
-# Generate diff and save to file
-diff_output=$(diff -u0 <(echo "$pr1TaggedDiff") <(echo "$pr2TaggedDiff") || true)
+diff_output=$(python "$(dirname "$0")/main.py" "$prUrl1" "$prUrl2") || exit 1  # main.py sits next to this script; abort when the report fails
 if [ -n "$output" ]; then
   echo "$diff_output" > "$output"
   echo "Diff saved to $output"
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..791ad3f
--- /dev/null
+++ b/main.py
@@ -0,0 +1,27 @@
+from typing import List
+import re
+import subprocess
+import sys
+from pr_diff import Pr, PrDiff, FileDiff, Hunk
+from diff_report import DiffReport
+from report_formatter import format_pr_diff_report_markdown
+
+
+def process_input():  # returns (url1, url2), the two PR URLs given on the command line
+    if len(sys.argv) < 3:  # exit with a usage hint (status 1) instead of a bare IndexError
+        sys.exit(f"usage: {sys.argv[0]} <pr-url-1> <pr-url-2>")
+    return sys.argv[1], sys.argv[2]
+
+
+def main():  # CLI entry point: compare two PRs and print the Markdown report
+    url1, url2 = process_input()
+    pr1 = Pr(url1)  # validates the URL and downloads the diff through `gh pr diff`
+    pr2 = Pr(url2)
+    diff_report = DiffReport(pr1, pr2)  # the comparison runs inside the constructor
+
+    output = format_pr_diff_report_markdown(diff_report.to_dict())
+    print(output)  # gh-compr captures this stdout into diff_output
+
+
+if __name__ == "__main__":  # run only when executed as a script, not when imported
+  main()
diff --git a/pr_diff.py b/pr_diff.py
new file mode 100644
index 0000000..cb96ec0
--- /dev/null
+++ b/pr_diff.py
@@ -0,0 +1,133 @@
+import subprocess
+import re
+import hashlib
+from typing import List
+
+def compute_md5(data: str) -> str:  # hex MD5 digest of the UTF-8 encoded text
+    return hashlib.md5(data.encode('utf-8')).hexdigest()
+
+class Hunk:  # one "@@ ..." section of a file diff, fingerprinted by the md5 of its stripped text
+    def __init__(self, text: str):
+        self.text = text.strip()
+        self.md5 = self._compute_md5(self.text)  # NOTE(review): text includes the "@@" header line — confirm line offsets should affect the hash
+
+    @staticmethod
+    def _compute_md5(data: str) -> str:  # same computation as the module-level compute_md5
+        return hashlib.md5(data.encode('utf-8')).hexdigest()
+
+    def __repr__(self):
+        return f"<Hunk md5={self.md5} length={len(self.text)}>"
+
+    def pretty_print(self):  # debug helper: prints the repr behind two tabs
+        print("\t\t", self)
+
+
+class FileDiff:
+    """Represents the diff for a single file, containing multiple hunks."""
+
+    def __init__(self, file_path: str, diff_text: str):
+        self.file_path = file_path
+        self.diff_text = diff_text.strip()
+        self.md5 = compute_md5(self.diff_text)  # covers the whole chunk, i.e. the lines before the first hunk too
+        self.hunks = self._parse_hunks(self.diff_text)
+
+    def _parse_hunks(self, text: str) -> List[Hunk]:
+        """Extract all hunks from the file diff; the text before the first "@@ " line is discarded."""
+        parts = re.split(r'(?=^@@ )', text, flags=re.MULTILINE)  # zero-width split keeps each "@@ " line with its hunk
+        return [Hunk(part) for part in parts if part.strip().startswith('@@')]
+
+    def __repr__(self):
+        return f"<FileDiff path=['{self.file_path}'] md5=[{self.md5}] hunks=[{len(self.hunks)}]>"
+
+    def pretty_print(self):  # debug helper: prints this file diff, then each of its hunks
+        print("\t", self)
+        for hunk in self.hunks:
+            hunk.pretty_print()
+
+
+class PrDiff:
+    """Represents the entire PR diff, containing multiple file diffs."""
+
+    def __init__(self, diff_text: str):
+        self.diff_text = diff_text.strip()
+        self.md5 = compute_md5(self.diff_text)
+        self.file_diffs = self._parse_chunks(self.diff_text)  # one FileDiff per "diff --git" section
+
+
+    def _parse_chunks(self, diff_text: str) -> List[FileDiff]:
+        """Split the PR diff into individual file diffs."""
+        # Split on "diff --git" lines
+        raw_chunks = re.split(r'(?=^diff --git)', diff_text, flags=re.MULTILINE)
+        raw_chunks = [chunk.strip() for chunk in raw_chunks if chunk.strip()]
+
+        file_diffs = []
+        for chunk in raw_chunks:
+            lines = chunk.splitlines()
+
+            # Extract file path from the first line
+            # Format: diff --git a/path/to/file b/path/to/file
+            match = re.match(r'^diff --git a/(.+?) b/\1$', lines[0])
+            if match:
+                file_path = match.group(1)
+            else:
+                # Fallback if exact match fails (a/ and b/ paths differ): use the a/ path without its prefix
+                file_path = lines[0].split()[2][2:]
+
+            file_diffs.append(FileDiff(file_path, chunk))
+
+        return file_diffs
+
+    def __repr__(self):  # the parsed files live in self.file_diffs; there is no "chunks" attribute
+        return f"<PrDiff md5=[{self.md5}] files=[{len(self.file_diffs)}]>"
+
+    def pretty_print(self):  # debug helper: prints this diff, then every file diff
+        print(self)
+        for file_diff in self.file_diffs:
+            file_diff.pretty_print()
+
+class Pr:  # a GitHub pull request identified by its URL, together with its parsed diff
+    def __init__(self, url: str):
+        owner, repo, pull_number = self._validate_url_extract_info(url)  # raises ValueError on a malformed URL
+        self.url = url
+        self.repo = repo
+        self.number = pull_number
+
+        raw_pr_diff = self._download_pr_diff(url)
+        self.pr_diff = PrDiff(raw_pr_diff)
+
+    def __repr__(self):
+        return f"<Pr number=[{self.number}] diff_md5=[{self.pr_diff.md5}] files=[{len(self.pr_diff.file_diffs)}]>"
+
+    def to_markdown(self):
+        return f"PR *#{self.number}*, diff md5: *{self.pr_diff.md5}*, files: *{len(self.pr_diff.file_diffs)}*"
+
+
+    @staticmethod
+    def _download_pr_diff(pr_url: str) -> str:  # returns the stdout of `gh pr diff <url>`; RuntimeError on failure
+        completed_process = subprocess.run(["gh", "pr", "diff", pr_url], capture_output=True, text=True)
+        if completed_process.returncode != 0:
+            # gh's stderr is carried by the exception; this module never imports sys, so printing to sys.stderr failed
+            raise RuntimeError(f"Could not download pr diff from: {pr_url}\n{completed_process.stderr.strip()}")
+        return completed_process.stdout
+
+    @staticmethod
+    def _validate_url_extract_info(url: str) -> tuple:
+        """
+        Validate a GitHub pull request URL and split it into its parts.
+
+        :param url: The GitHub PR URL to validate.
+        :return: Tuple (owner, repo, pull_number) with pull_number as int.
+        :raises ValueError: If the URL does not match the expected format.
+        """
+        regex = r"^https://github\.com/([a-zA-Z0-9-]+)/([a-zA-Z0-9._-]+)/pull/([0-9]+)$"  # repo names may contain "." and "_"
+        match = re.match(regex, url)
+
+        if not match:
+            raise ValueError(
+                f"PR URL '{url}' does not match the expected format: "
+                "'https://github.com/<owner>/<repo>/pull/<number>'"
+            )
+
+        owner, repo, pull_number = match.groups()
+        return owner, repo, int(pull_number)
+
diff --git a/report_formatter.py b/report_formatter.py
new file mode 100644
index 0000000..0ec9890
--- /dev/null
+++ b/report_formatter.py
@@ -0,0 +1,116 @@
+from typing import List, Dict, Optional
+
+
+def format_pr_diff_report_markdown(report_data: Dict) -> str:
+    """
+    Format a PR diff report (the dict built by DiffReport.to_dict()) into Markdown + HTML with:
+    - File-level summary at the top
+    - Collapsible sections for each file that differs
+    - A list of identical files
+    """
+    lines = []  # output lines; each helper below appends to this list in place
+
+    _add_header_section(lines, report_data)
+
+    if report_data["different_files"].items():  # an empty items() view is falsy, so the section is skipped
+        _add_pr_diff_summary_section(lines, report_data)
+
+    if report_data["identical_files"]:
+        _add_identical_fiels_section(lines, report_data)
+
+    return "\n".join(lines)
+
+
+def _add_header_section(lines: List[str], report_data: Dict):  # appends title, both PR URLs and the emoji bar to lines
+    # --- File-level summary ---
+    total_files = len(report_data["identical_files"]) + len(report_data["different_files"])
+    total_identical = len(report_data["identical_files"])
+    total_different = len(report_data["different_files"].keys())
+
+    similarity_ratio = total_identical / total_files if total_files else 1.0  # no files at all counts as 100% similar
+
+    lines.append(f"# Similarity: {similarity_ratio * 100:.1f}%\n")
+    lines.append("Compared: ")
+    lines.append("- left: " + report_data["left_pr"].url)
+    lines.append("- right: " + report_data["right_pr"].url)
+    lines.append("")
+    lines.append(f"{_generate_emoji_chart(total_identical, total_different)}\n")
+
+    lines.append("<hr/>") # separate sections
+
+
+def _add_pr_diff_summary_section(lines: List[str], report_data: Dict):  # appends counts plus one collapsible table per differing file
+    total_files = len(report_data["identical_files"]) + len(report_data["different_files"])
+    total_identical = len(report_data["identical_files"])
+    total_different = len(report_data["different_files"].keys())
+
+    lines.append("<details>")
+    lines.append("<summary><h3>PR Diff Summary</h3></summary>\n")
+    lines.append("<ul>")
+    lines.append(f"<li>Total files compared: {total_files}</li>")
+    lines.append(f"<li>Files identical: {total_identical}</li>")
+    lines.append(f"<li>Files with differences: {total_different}\n</li>")
+    lines.append("</ul>\n")
+
+    left_pr = report_data["left_pr"]
+    right_pr = report_data["right_pr"]
+
+    # --- Different files ---
+    for filename, aligned_hunks in report_data["different_files"].items():
+        lines.append(f"<details>")
+        lines.append(f"<summary>📄 {filename}</summary>\n")  # NOTE(review): filename is inserted without HTML escaping
+        lines.append("<table style='table-layout: fixed; width: 100%;'>")
+        lines.append("<tr>")
+        lines.append("<th style='width: 5%'>#</th>")
+        lines.append(f"<th style='width: 47.5%'>\n\n[PR #{left_pr.number}]({left_pr.url})\n\n</th>")  # blank lines presumably let the Markdown link render inside HTML — confirm
+        lines.append(f"<th style='width: 47.5%'>\n\n[PR #{right_pr.number}]({right_pr.url})\n\n</th>")
+        lines.append("</tr>")
+
+        for idx, pair in enumerate(aligned_hunks, start=1):  # idx is the 1-based row number shown in the "#" column
+            left_hunk = pair["left"]
+            right_hunk = pair["right"]
+
+            if left_hunk and right_hunk and left_hunk.md5 == right_hunk.md5:  # matched pair: one merged cell, no diff text
+                lines.append("<tr>")
+                lines.append(f"<td>{idx}</td>")
+                lines.append(f"<td colspan='2'>Hunk is identical (md5: <code>{left_hunk.md5}</code>)</td>")
+                lines.append("</tr>")
+            else:  # unmatched hunk: a side that is None renders as "(no changes)"
+                left_text = _render_hunk(left_hunk)
+                right_text = _render_hunk(right_hunk)
+                lines.append("<tr>")
+                lines.append(f"<td>{idx}</td>")
+                lines.append(f"<td>\n{left_text}\n</td>")
+                lines.append(f"<td>\n{right_text}\n</td>")
+                lines.append("</tr>")
+
+        lines.append("</table>\n")
+        lines.append("</details>\n")  # end collapsible section
+
+    lines.append("</details>") # end Pr Diff summary
+
+
+def _add_identical_fiels_section(lines: List[str], report_data: Dict):  # NOTE(review): "fiels" is a typo for "files"; a rename must also touch the caller
+    lines.append("<details>")
+    lines.append("<summary><h3>Identical files</h3></summary>\n")
+    for file_path in report_data["identical_files"]:
+        lines.append(f"- `{file_path}`")  # one Markdown bullet per identical file path
+    lines.append("</details>\n")
+
+
+def _generate_emoji_chart(identical: int, different: int, total_segments: int = 10) -> str:
+    """Build a one-line green/red bar showing the share of identical files (all green when there are no files)."""
+    total = identical + different
+    green = total_segments if total == 0 else int(total_segments * identical / total)
+    bar = "🟩" * green + "🟥" * (total_segments - green)
+    return f"Similarity: {bar} ({identical}/{total} identical)"
+
+
+def _render_hunk(hunk: Optional[object]) -> str:
+    """
+    Wrap a hunk's text in a fenced ```diff block; a missing hunk (None)
+    renders as the "(no changes)" placeholder. The text is not escaped.
+    """
+    if hunk is not None:
+        return f"\n```diff\n{hunk.text}\n```\n"
+    return "(no changes)"