Skip to content

Commit 9e7a9d1

Browse files
fix: clamp episode summary scores to open interval
1 parent caa13e3 commit 9e7a9d1

2 files changed

Lines changed: 6 additions & 1 deletion

File tree

server/shopOps_environment.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,8 @@
5353

5454
ENV_SCHEMA_VERSION = "2.0.0"
5555
INVALID_LIMIT = 4
56+
SCORE_MIN = 1e-9
57+
SCORE_MAX = 1.0 - 1e-9
5658
TASK_ALIASES = {
5759
"easy": "refund_policy_recovery",
5860
"medium": "sla_queue_juggle",
@@ -1185,10 +1187,11 @@ def _episode_summary(self) -> Dict[str, object]:
11851187
business_score -= 0.35 * invalid_penalty
11861188
business_score -= 0.25 * unresolved_ratio
11871189
terminal_bonus = max(0.0, min(0.5, business_score * 0.5))
1190+
final_score = max(SCORE_MIN, min(SCORE_MAX, business_score))
11881191
return {
11891192
"task": self._task_name,
11901193
"difficulty": self._difficulty,
1191-
"final_score": round(max(0.0, min(1.0, business_score)), 4),
1194+
"final_score": final_score,
11921195
"terminal_bonus": round(terminal_bonus, 4),
11931196
"closed_cases": closed_cases,
11941197
"resolved_cases": resolved_cases,

tests/test_metrics_baselines.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ def test_episode_summary_schema() -> None:
1515
assert "closed_cases" in summary
1616
assert "reopened_cases" in summary
1717
assert "fraud_loss_usd" in summary
18+
assert 0.0 < summary["final_score"] < 1.0
1819

1920

2021
def test_eval_aggregate_metrics() -> None:
@@ -34,6 +35,7 @@ def test_baseline_scores_are_monotonic_by_difficulty_seed_1() -> None:
3435
for task in TASKS
3536
]
3637
assert scores[0] >= scores[1] >= scores[2]
38+
assert all(0.0 < score < 1.0 for score in scores)
3739

3840

3941
def test_graders_return_open_interval_scores() -> None:

0 commit comments

Comments
 (0)