|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""Diagnose source_hash mismatches between wiki.db and fresh renders. |
| 3 | +
|
| 4 | +Usage: |
| 5 | + cd ~/forge/free-code |
| 6 | + python3.11 ~/forge/repowise/scripts/diagnose_hash_mismatch.py [--max-pages N] |
| 7 | +
|
| 8 | +What it checks: |
| 9 | + A) dep_summaries (completed_page_summaries) — empty on re-run (level 0/1 skipped) |
| 10 | + B) graph edge ordering — non-deterministic due to ProcessPoolExecutor + as_completed |
| 11 | + C) betweenness_centrality — random sampling when n > 30000 nodes |
| 12 | + D) community_id — Louvain seed=42, should be stable |
| 13 | + E) git history via git_meta — NOT passed to assemble_file_page (won't affect hash) |
| 14 | +""" |
| 15 | +from __future__ import annotations |
| 16 | + |
| 17 | +import argparse |
| 18 | +import asyncio |
| 19 | +import difflib |
| 20 | +import hashlib |
| 21 | +import sqlite3 |
| 22 | +import sys |
| 23 | +from pathlib import Path |
| 24 | + |
| 25 | +REPOWISE_ROOT = Path(__file__).parent.parent |
| 26 | +sys.path.insert(0, str(REPOWISE_ROOT / "packages" / "core" / "src")) |
| 27 | +sys.path.insert(0, str(REPOWISE_ROOT / "packages" / "cli" / "src")) |
| 28 | + |
| 29 | + |
| 30 | +def _sha256(text: str) -> str: |
| 31 | + return hashlib.sha256(text.encode()).hexdigest() |
| 32 | + |
| 33 | + |
async def _build_pipeline(repo_path: Path):
    """Run the ingestion + graph pipeline for *repo_path*.

    Returns a 3-tuple ``(parsed_files, source_map, graph_builder)``; the
    file-info and repo-structure results of ``_run_ingestion`` are discarded.
    """
    from repowise.core.pipeline.orchestrator import _run_ingestion

    ingestion = await _run_ingestion(
        repo_path,
        exclude_patterns=None,
        skip_tests=False,
        skip_infra=False,
        progress=None,
    )
    parsed_files, _file_infos, _repo_structure, source_map, graph_builder = ingestion
    return parsed_files, source_map, graph_builder
| 41 | + |
| 42 | + |
def _render_file_page_prompt(pf, graph, pagerank, betweenness, community,
                             source_map, page_summaries, assembler, jinja_env):
    """Assemble the file-page context for *pf* and render its Jinja prompt.

    Returns ``(prompt_text, ctx)`` so callers can hash the prompt and still
    inspect the assembled context.
    """
    source_bytes = source_map.get(pf.file_info.path, b"")
    ctx = assembler.assemble_file_page(
        pf,
        graph,
        pagerank,
        betweenness,
        community,
        source_bytes,
        page_summaries=page_summaries,
    )
    template = jinja_env.get_template("file_page.j2")
    return template.render(ctx=ctx), ctx
| 51 | + |
| 52 | + |
async def main(repo_path: Path, max_pages: int, verbose: bool) -> None:
    """Diagnose source_hash mismatches between wiki.db and fresh renders.

    Steps:
      1. Sample up to *max_pages* random ``file_page`` rows from wiki.db.
      2. Run the ingestion pipeline twice and compare graph properties
         (edge ordering, PageRank, betweenness, communities) for stability.
      3. Re-render each sampled page's prompt and classify any hash
         mismatch by probable cause (missing dep_summaries vs. graph
         non-determinism vs. unknown).

    Args:
        repo_path: Repository root containing ``.repowise/wiki.db``.
        max_pages: Number of random pages to sample from the DB.
        verbose:   If True, print a unified diff of run-1 vs run-2 prompts
                   for each mismatch.

    Exits the process with status 1 when wiki.db is missing.
    """
    # --- Load cached pages from wiki.db ---
    db_path = repo_path / ".repowise" / "wiki.db"
    if not db_path.exists():
        print(f"ERROR: wiki.db not found at {db_path}")
        sys.exit(1)

    conn = sqlite3.connect(str(db_path))
    try:
        conn.row_factory = sqlite3.Row
        rows = conn.execute(
            "SELECT id, target_path, source_hash FROM wiki_pages "
            "WHERE page_type = 'file_page' ORDER BY RANDOM() LIMIT ?",
            (max_pages,),
        ).fetchall()
    finally:
        # Close even if the query raises, so the DB file isn't left locked.
        conn.close()
    print(f"Loaded {len(rows)} random file_page(s) from wiki.db.\n")

    # --- Run ingestion TWICE to detect non-determinism ---
    print("Run 1: ingestion pipeline...")
    p1, sm1, gb1 = await _build_pipeline(repo_path)
    print(f" {len(p1)} files parsed.")

    print("Run 2: ingestion pipeline (repeat to check stability)...")
    p2, sm2, gb2 = await _build_pipeline(repo_path)
    print(f" {len(p2)} files parsed.\n")

    # --- Compare graph properties between runs ---
    g1, g2 = gb1.graph(), gb2.graph()
    pr1, pr2 = gb1.pagerank(), gb2.pagerank()
    bc1, bc2 = gb1.betweenness_centrality(), gb2.betweenness_centrality()
    cm1, cm2 = gb1.community_detection(), gb2.community_detection()

    # Check edge ordering stability on a bounded sample of nodes.
    edge_order_unstable: list[str] = []
    for node in list(g1.nodes())[:200]:
        if list(g1.successors(node)) != list(g2.successors(node)):
            edge_order_unstable.append(node)

    bc_diff = {k for k in bc1 if abs(bc1[k] - bc2.get(k, 0)) > 1e-9}
    cm_diff = {k for k in cm1 if cm1[k] != cm2.get(k)}
    pr_diff = {k for k in pr1 if abs(pr1[k] - pr2.get(k, 0)) > 1e-9}

    def _ok(n: int) -> str:
        """Format a per-metric stability verdict for the report."""
        return "[ok]" if n == 0 else f"[!!] {n} differ"

    print("=== Stability check (Run 1 vs Run 2) ===")
    print(f" Graph nodes: {g1.number_of_nodes()} vs {g2.number_of_nodes()}")
    print(f" Graph edges: {g1.number_of_edges()} vs {g2.number_of_edges()}")
    print(f" Edge ordering: {_ok(len(edge_order_unstable))}"
          + (f" e.g. {edge_order_unstable[:2]}" if edge_order_unstable else ""))
    print(f" PageRank: {_ok(len(pr_diff))}")
    print(f" BetweennessCentral: {_ok(len(bc_diff))}")
    print(f" Community detect: {_ok(len(cm_diff))}")
    print()

    # --- Render prompts and compare with stored hashes ---
    from repowise.core.generation import ContextAssembler, GenerationConfig
    import jinja2

    config = GenerationConfig()
    assembler = ContextAssembler(config)
    templates_dir = (REPOWISE_ROOT / "packages" / "core" / "src"
                     / "repowise" / "core" / "generation" / "templates")
    jinja_env = jinja2.Environment(
        loader=jinja2.FileSystemLoader(str(templates_dir)),
        undefined=jinja2.StrictUndefined, autoescape=False,
    )

    path_to_pf = {pf.file_info.path: pf for pf in p1}
    # BUGFIX: the run-2 cross-check previously used `path_to_pf.get(tpath) or
    # pf`, which is always run 1's parsed file because path_to_pf is built
    # from p1.  Build a run-2 mapping so the cross-check actually compares
    # run 2 end-to-end (parsed file + graph + source map).
    path_to_pf2 = {pf.file_info.path: pf for pf in p2}
    graph, pagerank, betweenness, community = g1, pr1, bc1, cm1

    print("=== Hash comparison (wiki.db vs fresh render) ===")
    matches = mismatches_dep = mismatches_other = 0

    for row in rows:
        tpath = row["target_path"]
        stored_hash = row["source_hash"]
        pf = path_to_pf.get(tpath)
        if pf is None:
            print(f" [skip] {tpath}: not found in parsed files")
            continue

        # Render without dep_summaries (re-run scenario, level 0/1 skipped)
        prompt_nodep, _ = _render_file_page_prompt(
            pf, graph, pagerank, betweenness, community, sm1,
            page_summaries=None, assembler=assembler, jinja_env=jinja_env,
        )
        hash_nodep = _sha256(prompt_nodep)

        if hash_nodep == stored_hash:
            matches += 1
            print(f" [MATCH] {tpath}")
            continue

        # Check if edge ordering is the issue: render with run 2's data
        # (parsed file, graph, metrics, source map all from run 2).
        prompt_run2, _ = _render_file_page_prompt(
            path_to_pf2.get(tpath, pf),
            g2, pr2, bc2, cm2, sm2,
            page_summaries=None, assembler=assembler, jinja_env=jinja_env,
        )
        hash_run2 = _sha256(prompt_run2)
        edge_order_issue = (hash_nodep != hash_run2)

        # Check if dep_summaries explain the mismatch:
        # inject dummy summaries for all out-edges
        out_edges = list(graph.successors(tpath)) if tpath in graph else []
        out_edges = [e for e in out_edges if not e.startswith("external:")]
        fake_summaries = {dep: f"[summary of {dep}]" for dep in out_edges}
        prompt_fakedep, _ = _render_file_page_prompt(
            pf, graph, pagerank, betweenness, community, sm1,
            page_summaries=fake_summaries, assembler=assembler, jinja_env=jinja_env,
        )
        dep_affects = (prompt_nodep != prompt_fakedep)

        if dep_affects:
            mismatches_dep += 1
            cause = "dep_summaries differ"
        else:
            mismatches_other += 1
            cause = "unknown — dep_summaries do NOT affect prompt"

        if edge_order_issue:
            cause += " + edge-ordering non-deterministic"

        print(f" [MISMATCH] {tpath}")
        print(f" cause: {cause}")
        print(f" stored: {stored_hash[:20]}...")
        print(f" fresh(nodep): {hash_nodep[:20]}...")
        print(f" fresh(run2): {hash_run2[:20]}...")
        print(f" out_edges: {len(out_edges)} dep_affects_prompt: {dep_affects}")

        if verbose:
            # Show first real diff between stored prompt and fresh prompt
            # We can't reconstruct the exact stored prompt, but we can diff run1 vs run2
            diff = list(difflib.unified_diff(
                prompt_nodep.splitlines(),
                prompt_run2.splitlines(),
                fromfile="run1", tofile="run2", lineterm="", n=1,
            ))
            if diff:
                print(" -- prompt diff run1 vs run2 (first 20 lines) --")
                for line in diff[:20]:
                    print(f" {line}")
            else:
                print(" -- prompts are identical across runs (edge order stable) --")

    print()
    print("=== Summary ===")
    print(f" Match (empty dep_summaries = stored): {matches}")
    print(f" Mismatch caused by dep_summaries: {mismatches_dep}")
    print(f" Mismatch with unknown cause: {mismatches_other}")
    total = matches + mismatches_dep + mismatches_other
    print(f" Total checked: {total}")

    if mismatches_dep and not mismatches_other:
        print("\nCONCLUSION: dep_summaries (completed_page_summaries from level 0/1)")
        print(" is the sole cause. Fix: pre-populate from wiki.db before level 2.")
    elif mismatches_other:
        print("\nCONCLUSION: at least one other factor causes hash instability.")
        print(" Run with --verbose to see prompt diffs.")
    elif matches == total:
        print("\nCONCLUSION: all hashes match on empty dep_summaries — no other instability.")
| 217 | + |
| 218 | + |
if __name__ == "__main__":
    # CLI entry point: parse arguments and drive the async diagnostic.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("repo_path", nargs="?", default=".", help="Repo path (default: cwd)")
    parser.add_argument("--max-pages", type=int, default=10)
    parser.add_argument("--verbose", action="store_true", help="Show prompt diffs")
    cli_args = parser.parse_args()
    asyncio.run(main(Path(cli_args.repo_path).resolve(), cli_args.max_pages, cli_args.verbose))