frontier-swe-openenv/scripts/build_hcapo_dataset.py at main · 3xcaffeine/frontier-swe-openenv · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
#!/usr/bin/env python3
"""Build an HCAPO step-weighted SFT dataset from trajectories + hindsight scores.

Combines trajectory-level GRPO advantages with step-level hindsight Q_H values
to produce per-step HCAPO advantages (Eq. 8 from paper 2603.08754).

Input:
    trajectories/episode_NNN/  — result.json + pi_session.jsonl + hindsight_scores.json

Output:
    datasets/hcapo_train.jsonl — one row per episode with step-level advantages

Usage:
    uv run python scripts/build_hcapo_dataset.py --min-reward 0.2 --omega 1.0
"""

from __future__ import annotations

import argparse
import json
import logging
import math
import sys
from pathlib import Path
from typing import Any

# Directory containing this script; it is put at the front of sys.path before
# the import below (presumably so the sibling module build_training_dataset
# resolves regardless of the current working directory — confirm it lives
# next to this file).
_SCRIPT_DIR = Path(__file__).resolve().parent
sys.path.insert(0, str(_SCRIPT_DIR))
from build_training_dataset import load_episode

# Configure root logging at import time: INFO level, with an HH:MM:SS
# timestamp and the level name in front of every message.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
)
# Module-wide logger used by every function in this script.
logger = logging.getLogger("build_hcapo")


# ---------------------------------------------------------------------------
# Loading
# ---------------------------------------------------------------------------

def load_hindsight_scores(episode_dir: Path) -> dict | None:
    path = episode_dir / "hindsight_scores.json"
    if not path.exists():
        return None
    return json.loads(path.read_text())


def load_episodes_with_scores(
    input_dir: Path, min_reward: float,
) -> list[dict]:
    """Collect every episode that passes the reward filter and has hindsight scores.

    Directories matching ``episode_*`` under ``input_dir`` are visited in
    sorted order. An episode is dropped when ``load_episode`` returns ``None``,
    when its reward is below ``min_reward``, or when it has no
    ``hindsight_scores.json``. Kept episodes gain two extra keys:
    ``_hindsight`` (the loaded scores) and ``_dir`` (the directory as a string).
    """
    kept: list[dict] = []

    for episode_path in sorted(input_dir.glob("episode_*")):
        episode = load_episode(episode_path, include_thinking=True, max_tool_result_chars=4000)
        if episode is None:
            continue

        if episode["reward"] < min_reward:
            logger.info(
                "  Episode %s: reward=%.4f < %.4f, skipped",
                episode["episode_id"], episode["reward"], min_reward,
            )
            continue

        hindsight = load_hindsight_scores(episode_path)
        if hindsight is None:
            logger.warning("  Episode %s: no hindsight_scores.json, skipped", episode["episode_id"])
            continue

        # Attach the scores and origin directory for the later pipeline stages.
        episode.update({"_hindsight": hindsight, "_dir": str(episode_path)})
        kept.append(episode)

        step_count = len(hindsight.get("steps", []))
        message_count = len(episode["messages"])
        logger.info(
            "  Episode %s: reward=%.4f, %d steps, %d messages",
            episode["episode_id"], episode["reward"], step_count, message_count,
        )

    return kept


# ---------------------------------------------------------------------------
# Advantage computation (Eq. 3, 5, 8)
# ---------------------------------------------------------------------------

def compute_grpo_advantages(episodes: list[dict]) -> list[float]:
    """Trajectory-level GRPO advantages: A_i = (R_i - mu) / sigma  (Eq. 3).

    Args:
        episodes: Episode dicts; only the ``"reward"`` key is read.

    Returns:
        One advantage per episode, in input order. An empty input yields an
        empty list. When all rewards are identical (zero variance) sigma
        falls back to 1.0, so every advantage is 0.0.
    """
    if not episodes:
        # Nothing to normalize; avoids dividing by a zero group size below.
        return []

    rewards = [ep["reward"] for ep in episodes]
    mu = sum(rewards) / len(rewards)
    # Population variance (divides by N, not N - 1).
    variance = sum((r - mu) ** 2 for r in rewards) / len(rewards)
    sigma = math.sqrt(variance) if variance > 0 else 1.0
    return [(r - mu) / sigma for r in rewards]


def compute_hcapo_advantages(
    episodes: list[dict],
    omega: float = 1.0,
    use_smoothed: bool = True,
) -> list[list[float]]:
    """Multi-scale HCAPO advantages per step (Eq. 8).

    Each step advantage is ``A_grpo + omega * A_micro``, where ``A_grpo`` is
    the episode's trajectory-level advantage and ``A_micro`` is the step's
    Q_H value standardized over every step of every episode.

    Args:
        episodes: Episode dicts carrying ``"reward"`` and ``"_hindsight"``;
            the hindsight dict may hold a ``"steps"`` list of per-step dicts.
        omega: Weight of the step-level (micro) term.
        use_smoothed: Read ``"q_h_smoothed"`` from each step when True,
            ``"q_h"`` otherwise. A step missing the chosen key falls back to
            ``"q_h"``, then to 0.0.

    Returns:
        A list of step-advantage lists, one per episode, in input order.
        An empty input yields an empty list.
    """
    if not episodes:
        # No group to normalize against; min()/max() below would also fail.
        return []

    grpo_advs = compute_grpo_advantages(episodes)
    key = "q_h_smoothed" if use_smoothed else "q_h"

    # Extract Q_H once per step. A hindsight record without "steps" is
    # treated as having no steps, matching how the loader counts them.
    per_episode_qh: list[list[float]] = [
        [step.get(key, step.get("q_h", 0.0)) for step in ep["_hindsight"].get("steps", [])]
        for ep in episodes
    ]

    # Global (cross-episode) statistics for standardizing Q_H.
    all_qh = [qh for episode_qh in per_episode_qh for qh in episode_qh]
    mu_h = sum(all_qh) / len(all_qh) if all_qh else 0.0
    var_h = sum((q - mu_h) ** 2 for q in all_qh) / len(all_qh) if all_qh else 1.0
    sigma_h = math.sqrt(var_h) if var_h > 0 else 1.0

    logger.info(
        "GRPO advantages: min=%.3f max=%.3f | Q_H stats: mu=%.4f sigma=%.4f",
        min(grpo_advs), max(grpo_advs), mu_h, sigma_h,
    )

    per_episode_advantages: list[list[float]] = []
    for a_grpo, episode_qh in zip(grpo_advs, per_episode_qh):
        step_advs: list[float] = []
        for qh in episode_qh:
            a_micro = (qh - mu_h) / sigma_h

            # Do-no-harm mask: for successful trajectories, clip negative micro advantages
            if a_grpo > 0:
                a_micro = max(a_micro, 0.0)

            step_advs.append(a_grpo + omega * a_micro)

        per_episode_advantages.append(step_advs)

    return per_episode_advantages


def normalize_advantages(
    per_episode_advantages: list[list[float]],
) -> list[list[float]]:
    """Floor advantages at zero and rescale so the positive ones average 1.0.

    Negative values become 0.0. The divisor is the mean of all strictly
    positive clipped values across every episode, or 1.0 when there are none.
    The nested list shape of the input is preserved.
    """
    clipped = [
        [max(value, 0.0) for value in episode_advs]
        for episode_advs in per_episode_advantages
    ]

    positives = [
        value
        for episode_advs in clipped
        for value in episode_advs
        if value > 0
    ]
    scale = sum(positives) / len(positives) if positives else 1.0

    return [[value / scale for value in episode_advs] for episode_advs in clipped]


# ---------------------------------------------------------------------------
# Dataset construction
# ---------------------------------------------------------------------------

def identify_assistant_indices(messages: list[dict]) -> list[int]:
    """Return the positions of all messages whose ``"role"`` is ``"assistant"``."""
    positions: list[int] = []
    for position, message in enumerate(messages):
        if message.get("role") == "assistant":
            positions.append(position)
    return positions


def build_hcapo_dataset(
    episodes: list[dict],
    per_episode_advantages: list[list[float]],
) -> list[dict]:
    """Assemble one training row per episode with per-step advantages.

    Args:
        episodes: Episode dicts providing ``"messages"``, ``"episode_id"``
            and ``"reward"``.
        per_episode_advantages: Step advantages for each episode, aligned
            with ``episodes`` by position.

    Returns:
        A list of rows. Each row holds the episode's messages, its step
        advantages (rounded to 6 decimals), the message indices of the
        assistant turns those advantages apply to, and ``_``-prefixed
        metadata (episode id, reward, GRPO advantage, number of steps).
        Episodes whose advantages are all zero are omitted.
    """
    dataset: list[dict] = []

    # Compute the trajectory-level advantages once for the whole group
    # instead of once per kept episode.
    grpo_advs = compute_grpo_advantages(episodes) if episodes else []

    # enumerate() yields the episode's true position; a value-based
    # episodes.index(ep) lookup is O(n) and picks the wrong entry when two
    # episode dicts compare equal.
    for ep_idx, (ep, advantages) in enumerate(zip(episodes, per_episode_advantages)):
        messages = ep["messages"]
        assistant_indices = identify_assistant_indices(messages)

        if len(advantages) != len(assistant_indices):
            logger.warning(
                "Episode %s: %d advantages vs %d assistant messages — truncating to min",
                ep["episode_id"], len(advantages), len(assistant_indices),
            )
            n = min(len(advantages), len(assistant_indices))
            advantages = advantages[:n]
            assistant_indices = assistant_indices[:n]

        # Skip episodes where all advantages are 0 (below-average trajectories)
        if all(a == 0 for a in advantages):
            logger.info("  Episode %s: all advantages are 0, excluded", ep["episode_id"])
            continue

        dataset.append({
            "messages": messages,
            "step_advantages": [round(a, 6) for a in advantages],
            "step_message_indices": assistant_indices,
            "_episode_id": ep["episode_id"],
            "_reward": ep["reward"],
            "_grpo_advantage": round(grpo_advs[ep_idx], 6),
            "_num_steps": len(advantages),
        })

    return dataset


# ---------------------------------------------------------------------------
# Output
# ---------------------------------------------------------------------------

def write_jsonl(data: list[dict], path: Path) -> None:
    """Write ``data`` to ``path`` as JSON Lines, one object per line.

    Parent directories are created if needed, and an existing file is
    overwritten. The number of rows and the resulting file size are logged.

    Args:
        data: JSON-serializable rows.
        path: Destination file.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 explicitly; the locale default may be unable to encode
    # them or would produce a non-UTF-8 dataset.
    with open(path, "w", encoding="utf-8") as f:
        for item in data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    logger.info("Wrote %d examples to %s (%.1f KB)", len(data), path, path.stat().st_size / 1024)


def write_summary(
    data: list[dict],
    episodes: list[dict],
    args: argparse.Namespace,
    path: Path,
) -> None:
    """Write a JSON summary of the built dataset to ``path``.

    The summary records how many episodes were loaded versus kept, step
    counts, basic statistics over all step advantages (rounded to 4
    decimals, 0 when there are none), and the CLI configuration used.

    Args:
        data: Dataset rows; ``"step_advantages"`` is read from each.
        episodes: All loaded episodes (only their count is used).
        args: Parsed CLI arguments; ``omega``, ``min_reward`` and
            ``no_smooth`` are read.
        path: Destination file; parent directories are created if needed.
    """
    all_advs = [a for row in data for a in row["step_advantages"]]
    nonzero = [a for a in all_advs if a > 0]

    summary = {
        "total_episodes_loaded": len(episodes),
        "episodes_in_dataset": len(data),
        "total_steps": len(all_advs),
        "nonzero_steps": len(nonzero),
        "advantage_stats": {
            "min": round(min(all_advs), 4) if all_advs else 0,
            "max": round(max(all_advs), 4) if all_advs else 0,
            "mean": round(sum(all_advs) / len(all_advs), 4) if all_advs else 0,
            "nonzero_mean": round(sum(nonzero) / len(nonzero), 4) if nonzero else 0,
        },
        "config": {
            "omega": args.omega,
            "min_reward": args.min_reward,
            "use_smoothed": not args.no_smooth,
        },
    }

    # Create the output directory here as well, so this function does not
    # depend on another writer having been called first.
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(summary, indent=2), encoding="utf-8")
    logger.info("Summary → %s", path)


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def parse_args() -> argparse.Namespace:
    """Parse the command-line options for the HCAPO dataset builder."""
    option_table = (
        ("--input-dir", {"default": "trajectories"}),
        ("--output-dir", {"default": "datasets"}),
        ("--min-reward", {"type": float, "default": 0.2, "help": "Min episode reward to include"}),
        ("--omega", {"type": float, "default": 1.0, "help": "Hindsight weighting coefficient (Eq. 8)"}),
        ("--no-smooth", {"action": "store_true", "help": "Use raw Q_H instead of smoothed"}),
    )

    cli = argparse.ArgumentParser(description="Build HCAPO step-weighted SFT dataset")
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()


def main() -> None:
    """Run the full pipeline: load, score, normalize, build, and write.

    Exits with status 1 when no episodes can be loaded or when no episode
    survives the advantage computation. Otherwise writes
    ``hcapo_train.jsonl`` and ``hcapo_summary.json`` into the output directory.
    """
    args = parse_args()
    source_root = Path(args.input_dir)
    target_root = Path(args.output_dir)

    # Stage 1: load episodes that have rewards and hindsight scores.
    logger.info("Loading episodes from %s...", source_root)
    episodes = load_episodes_with_scores(source_root, min_reward=args.min_reward)
    if not episodes:
        logger.error("No valid episodes found! Run compute_hindsight_scores.py first.")
        sys.exit(1)

    rewards = [episode["reward"] for episode in episodes]
    logger.info(
        "Loaded %d episodes (rewards: %.4f — %.4f)",
        len(episodes),
        min(rewards),
        max(rewards),
    )

    # Stage 2: per-step HCAPO advantages, then non-negative normalization.
    logger.info("Computing HCAPO advantages (omega=%.2f)...", args.omega)
    smoothed = not args.no_smooth
    raw_advantages = compute_hcapo_advantages(
        episodes, omega=args.omega, use_smoothed=smoothed,
    )

    logger.info("Normalizing advantages...")
    advantages = normalize_advantages(raw_advantages)

    # Stage 3: assemble rows and persist them.
    logger.info("Building dataset...")
    dataset = build_hcapo_dataset(episodes, advantages)
    if not dataset:
        logger.error("No usable episodes after advantage computation!")
        sys.exit(1)

    write_jsonl(dataset, target_root / "hcapo_train.jsonl")
    write_summary(dataset, episodes, args, target_root / "hcapo_summary.json")

    total_steps = sum(row["_num_steps"] for row in dataset)
    logger.info(
        "Done — %d episodes, %d total steps in dataset.",
        len(dataset),
        total_steps,
    )


if __name__ == "__main__":
    main()