Skip to content

Commit 6fdf4ee

Browse files
dmikushin and claude
committed
fix(generation): stabilize prompt hashes across re-runs
Graph edge ordering and community IDs were non-deterministic because files are parsed in parallel (ProcessPoolExecutor + as_completed), causing NetworkX node insertion order to vary between runs.

Changes:
- context_assembler: sort predecessors/successors before including them in FilePageContext so dependents/dependencies lists are identical across runs regardless of graph construction order
- graph: rebuild a sorted copy of the undirected graph before passing it to louvain_communities so adjacency traversal order is reproducible; also sort the returned community list by each community's smallest member before assigning integer IDs via enumerate()

Adds scripts/diagnose_hash_mismatch.py to verify the fix and identify any remaining sources of hash instability (dep_summaries, betweenness sampling, etc.).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 263dce2 commit 6fdf4ee

File tree

3 files changed

+245
-5
lines changed

3 files changed

+245
-5
lines changed

packages/core/src/repowise/core/generation/context_assembler.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -277,9 +277,10 @@ def assemble_file_page(
277277
else:
278278
import_list = []
279279

280-
# Graph edges
281-
in_edges = list(graph.predecessors(path)) if path in graph else []
282-
out_edges = list(graph.successors(path)) if path in graph else []
280+
# Graph edges — sorted for deterministic prompt hashes across runs
281+
# (graph node insertion order is non-deterministic due to parallel parsing)
282+
in_edges = sorted(graph.predecessors(path)) if path in graph else []
283+
out_edges = sorted(graph.successors(path)) if path in graph else []
283284
# Filter out external nodes
284285
in_edges = [e for e in in_edges if not e.startswith("external:")]
285286
out_edges = [e for e in out_edges if not e.startswith("external:")]

packages/core/src/repowise/core/ingestion/graph.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -214,9 +214,22 @@ def community_detection(self) -> dict[str, int]:
214214
if g.number_of_nodes() == 0:
215215
return {}
216216
try:
217-
communities = nx.community.louvain_communities(g.to_undirected(), seed=42)
217+
# Build an undirected graph with nodes AND edges in deterministic
218+
# (sorted) order so that Louvain's adjacency traversal is reproducible
219+
# across runs regardless of the order files were parsed (parallel I/O
220+
# via ProcessPoolExecutor + as_completed → non-deterministic insertion
221+
# order in the main graph).
222+
g_und = g.to_undirected()
223+
g_stable = nx.Graph()
224+
g_stable.add_nodes_from(sorted(g_und.nodes()))
225+
for u, v in sorted((min(a, b), max(a, b)) for a, b in g_und.edges()):
226+
g_stable.add_edge(u, v)
227+
communities = nx.community.louvain_communities(g_stable, seed=42)
228+
# Also sort the returned community list by each community's smallest
229+
# member so that the integer IDs assigned via enumerate() are stable.
230+
sorted_communities = sorted(communities, key=lambda c: min(c, default=""))
218231
result: dict[str, int] = {}
219-
for community_id, members in enumerate(communities):
232+
for community_id, members in enumerate(sorted_communities):
220233
for node in members:
221234
result[node] = community_id
222235
return result

scripts/diagnose_hash_mismatch.py

Lines changed: 226 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,226 @@
1+
#!/usr/bin/env python3
2+
"""Diagnose source_hash mismatches between wiki.db and fresh renders.
3+
4+
Usage:
5+
cd ~/forge/free-code
6+
python3.11 ~/forge/repowise/scripts/diagnose_hash_mismatch.py [--max-pages N]
7+
8+
What it checks:
9+
A) dep_summaries (completed_page_summaries) — empty on re-run (level 0/1 skipped)
10+
B) graph edge ordering — non-deterministic due to ProcessPoolExecutor + as_completed
11+
C) betweenness_centrality — random sampling when n > 30000 nodes
12+
D) community_id — Louvain seed=42, should be stable
13+
E) git history via git_meta — NOT passed to assemble_file_page (won't affect hash)
14+
"""
15+
from __future__ import annotations
16+
17+
import argparse
18+
import asyncio
19+
import difflib
20+
import hashlib
21+
import sqlite3
22+
import sys
23+
from pathlib import Path
24+
25+
REPOWISE_ROOT = Path(__file__).parent.parent
26+
sys.path.insert(0, str(REPOWISE_ROOT / "packages" / "core" / "src"))
27+
sys.path.insert(0, str(REPOWISE_ROOT / "packages" / "cli" / "src"))
28+
29+
30+
def _sha256(text: str) -> str:
31+
return hashlib.sha256(text.encode()).hexdigest()
32+
33+
34+
async def _build_pipeline(repo_path: Path):
    """Run the ingestion + graph pipeline for *repo_path*.

    Returns a ``(parsed_files, source_map, graph_builder)`` triple; the
    intermediate ``file_infos`` / ``repo_structure`` results are discarded.
    """
    # Imported lazily so sys.path tweaks at module load happen first.
    from repowise.core.pipeline.orchestrator import _run_ingestion

    result = await _run_ingestion(
        repo_path,
        exclude_patterns=None,
        skip_tests=False,
        skip_infra=False,
        progress=None,
    )
    parsed_files, _file_infos, _repo_structure, source_map, graph_builder = result
    return parsed_files, source_map, graph_builder
41+
42+
43+
def _render_file_page_prompt(pf, graph, pagerank, betweenness, community,
                             source_map, page_summaries, assembler, jinja_env):
    """Assemble the file-page context for *pf* and render its prompt.

    Returns a ``(rendered_prompt, ctx)`` pair; ``ctx`` is the context
    object produced by ``assembler.assemble_file_page``.
    """
    source_bytes = source_map.get(pf.file_info.path, b"")
    ctx = assembler.assemble_file_page(
        pf,
        graph,
        pagerank,
        betweenness,
        community,
        source_bytes,
        page_summaries=page_summaries,
    )
    template = jinja_env.get_template("file_page.j2")
    return template.render(ctx=ctx), ctx
51+
52+
53+
async def main(repo_path: Path, max_pages: int, verbose: bool) -> None:
    """Compare stored file-page source hashes against freshly rendered prompts.

    Steps:
      1. Sample up to *max_pages* ``file_page`` rows from ``.repowise/wiki.db``.
      2. Run the ingestion pipeline twice and compare graph-derived metrics
         (edge ordering, PageRank, betweenness, community IDs) for stability.
      3. Re-render each sampled page's prompt and classify any hash mismatch
         (dep_summaries vs. edge-ordering vs. unknown cause).

    Exits with status 1 when the wiki database is missing.
    """
    # --- Load cached pages from wiki.db ---
    db_path = repo_path / ".repowise" / "wiki.db"
    if not db_path.exists():
        print(f"ERROR: wiki.db not found at {db_path}")
        sys.exit(1)

    # try/finally so the connection is released even when execute() raises
    # (previously the handle leaked on a failed query).
    conn = sqlite3.connect(str(db_path))
    try:
        conn.row_factory = sqlite3.Row
        rows = conn.execute(
            "SELECT id, target_path, source_hash FROM wiki_pages "
            "WHERE page_type = 'file_page' ORDER BY RANDOM() LIMIT ?",
            (max_pages,),
        ).fetchall()
    finally:
        conn.close()
    print(f"Loaded {len(rows)} random file_page(s) from wiki.db.\n")

    # --- Run ingestion TWICE to detect non-determinism ---
    print("Run 1: ingestion pipeline...")
    p1, sm1, gb1 = await _build_pipeline(repo_path)
    print(f" {len(p1)} files parsed.")

    print("Run 2: ingestion pipeline (repeat to check stability)...")
    p2, sm2, gb2 = await _build_pipeline(repo_path)
    print(f" {len(p2)} files parsed.\n")

    # --- Compare graph properties between runs ---
    g1, g2 = gb1.graph(), gb2.graph()
    pr1, pr2 = gb1.pagerank(), gb2.pagerank()
    bc1, bc2 = gb1.betweenness_centrality(), gb2.betweenness_centrality()
    cm1, cm2 = gb1.community_detection(), gb2.community_detection()

    # Check edge ordering stability on a sample of nodes (first 200 by
    # run-1 insertion order) — successor order differing between runs is
    # the non-determinism fingerprint we are hunting.
    edge_order_unstable: list[str] = []
    for node in list(g1.nodes())[:200]:
        succ1 = list(g1.successors(node))
        succ2 = list(g2.successors(node))
        if succ1 != succ2:
            edge_order_unstable.append(node)

    # Nodes missing from run 2 default to 0 / None, so they show up as diffs.
    bc_diff = {k for k in bc1 if abs(bc1[k] - bc2.get(k, 0)) > 1e-9}
    cm_diff = {k for k in cm1 if cm1[k] != cm2.get(k)}
    pr_diff = {k for k in pr1 if abs(pr1[k] - pr2.get(k, 0)) > 1e-9}

    def _ok(n: int) -> str:
        """Format a difference count for the stability report."""
        return "[ok]" if n == 0 else f"[!!] {n} differ"

    print("=== Stability check (Run 1 vs Run 2) ===")
    print(f" Graph nodes: {g1.number_of_nodes()} vs {g2.number_of_nodes()}")
    print(f" Graph edges: {g1.number_of_edges()} vs {g2.number_of_edges()}")
    print(f" Edge ordering: {_ok(len(edge_order_unstable))}"
          + (f" e.g. {edge_order_unstable[:2]}" if edge_order_unstable else ""))
    print(f" PageRank: {_ok(len(pr_diff))}")
    print(f" BetweennessCentral: {_ok(len(bc_diff))}")
    print(f" Community detect: {_ok(len(cm_diff))}")
    print()

    # --- Render prompts and compare with stored hashes ---
    # Imported lazily: only needed once ingestion has succeeded.
    from repowise.core.generation import ContextAssembler, GenerationConfig
    import jinja2

    config = GenerationConfig()
    assembler = ContextAssembler(config)
    templates_dir = REPOWISE_ROOT / "packages" / "core" / "src" / \
        "repowise" / "core" / "generation" / "templates"
    jinja_env = jinja2.Environment(
        loader=jinja2.FileSystemLoader(str(templates_dir)),
        undefined=jinja2.StrictUndefined, autoescape=False,
    )

    path_to_pf = {pf.file_info.path: pf for pf in p1}
    # Run 1's artifacts are the baseline for comparison against wiki.db.
    graph, pagerank, betweenness, community = g1, pr1, bc1, cm1

    print("=== Hash comparison (wiki.db vs fresh render) ===")
    matches = mismatches_dep = mismatches_other = 0

    for row in rows:
        tpath = row["target_path"]
        stored_hash = row["source_hash"]
        pf = path_to_pf.get(tpath)
        if pf is None:
            # Page exists in wiki.db but the file is gone / excluded now;
            # not counted in the summary totals.
            print(f" [skip] {tpath}: not found in parsed files")
            continue

        # Render without dep_summaries (re-run scenario, level 0/1 skipped)
        prompt_nodep, _ctx = _render_file_page_prompt(
            pf, graph, pagerank, betweenness, community, sm1,
            page_summaries=None, assembler=assembler, jinja_env=jinja_env,
        )
        hash_nodep = _sha256(prompt_nodep)

        if hash_nodep == stored_hash:
            matches += 1
            print(f" [MATCH] {tpath}")
            continue

        # Check if edge ordering is the issue: render with run 2's graph.
        # (pf is already path_to_pf[tpath]; no need to look it up again.)
        prompt_run2, _ = _render_file_page_prompt(
            pf,
            g2, pr2, bc2, cm2, sm2,
            page_summaries=None, assembler=assembler, jinja_env=jinja_env,
        )
        hash_run2 = _sha256(prompt_run2)
        edge_order_issue = (hash_nodep != hash_run2)

        # Check if dep_summaries explain the mismatch:
        # inject dummy summaries for all out-edges
        out_edges = list(graph.successors(tpath)) if tpath in graph else []
        out_edges = [e for e in out_edges if not e.startswith("external:")]
        fake_summaries = {dep: f"[summary of {dep}]" for dep in out_edges}
        prompt_fakedep, _ = _render_file_page_prompt(
            pf, graph, pagerank, betweenness, community, sm1,
            page_summaries=fake_summaries, assembler=assembler, jinja_env=jinja_env,
        )
        dep_affects = (prompt_nodep != prompt_fakedep)

        if dep_affects:
            mismatches_dep += 1
            cause = "dep_summaries differ"
        else:
            mismatches_other += 1
            cause = "unknown — dep_summaries do NOT affect prompt"

        if edge_order_issue:
            cause += " + edge-ordering non-deterministic"

        print(f" [MISMATCH] {tpath}")
        print(f" cause: {cause}")
        print(f" stored: {stored_hash[:20]}...")
        print(f" fresh(nodep): {hash_nodep[:20]}...")
        print(f" fresh(run2): {hash_run2[:20]}...")
        print(f" out_edges: {len(out_edges)} dep_affects_prompt: {dep_affects}")

        if verbose:
            # Show first real diff between stored prompt and fresh prompt
            # We can't reconstruct the exact stored prompt, but we can diff run1 vs run2
            diff = list(difflib.unified_diff(
                prompt_nodep.splitlines(),
                prompt_run2.splitlines(),
                fromfile="run1", tofile="run2", lineterm="", n=1,
            ))
            if diff:
                print(" -- prompt diff run1 vs run2 (first 20 lines) --")
                for line in diff[:20]:
                    print(f" {line}")
            else:
                print(" -- prompts are identical across runs (edge order stable) --")

    print()
    print("=== Summary ===")
    print(f" Match (empty dep_summaries = stored): {matches}")
    print(f" Mismatch caused by dep_summaries: {mismatches_dep}")
    print(f" Mismatch with unknown cause: {mismatches_other}")
    total = matches + mismatches_dep + mismatches_other
    print(f" Total checked: {total}")

    if mismatches_dep and not mismatches_other:
        print("\nCONCLUSION: dep_summaries (completed_page_summaries from level 0/1)")
        print(" is the sole cause. Fix: pre-populate from wiki.db before level 2.")
    elif mismatches_other:
        print("\nCONCLUSION: at least one other factor causes hash instability.")
        print(" Run with --verbose to see prompt diffs.")
    elif matches == total:
        print("\nCONCLUSION: all hashes match on empty dep_summaries — no other instability.")
217+
218+
219+
if __name__ == "__main__":
    # CLI entry point: parse arguments, then drive the async diagnostic.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("repo_path", nargs="?", default=".",
                        help="Repo path (default: cwd)")
    parser.add_argument("--max-pages", type=int, default=10)
    parser.add_argument("--verbose", action="store_true",
                        help="Show prompt diffs")
    ns = parser.parse_args()
    target = Path(ns.repo_path).resolve()
    asyncio.run(main(target, ns.max_pages, ns.verbose))

0 commit comments

Comments
 (0)