Skip to content

Commit 635eb1e

Browse files
fix: keep rewards and serialized scores in open interval
1 parent 9e7a9d1 commit 635eb1e

6 files changed

Lines changed: 80 additions & 40 deletions

File tree

eval.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,8 @@
2121
from .server.shopOps_environment import ShopopsEnvironment
2222

2323
OUTPUT_DIR = Path("outputs/evals")
24+
SCORE_MIN = 1e-9
25+
SCORE_MAX = 1.0 - 1e-9
2426
TASKS = [
2527
"refund_policy_recovery",
2628
"sla_queue_juggle",
@@ -33,6 +35,10 @@
3335
}
3436

3537

38+
def _open_interval_score(value: float) -> float:
    """Clamp *value* into the open interval bounded by SCORE_MIN and SCORE_MAX.

    Args:
        value: Raw score to clamp.

    Returns:
        ``SCORE_MIN`` when *value* is NaN or not greater than ``SCORE_MIN``,
        ``SCORE_MAX`` when *value* exceeds ``SCORE_MAX``, otherwise *value*
        unchanged.
    """
    # Every comparison against NaN is False, so testing "not greater than the
    # floor" sends NaN to SCORE_MIN. A plain max(min(...)) chain would instead
    # turn NaN into SCORE_MAX and report an undefined score as near-perfect.
    if not value > SCORE_MIN:
        return SCORE_MIN
    return min(value, SCORE_MAX)
40+
41+
3642
def _priority_rank(priority: CasePriority) -> int:
3743
return {
3844
CasePriority.LOW: 0,
@@ -204,9 +210,10 @@ def aggregate_results(results: List[Dict[str, object]]) -> Dict[str, object]:
204210
fraud_loss += float(summary.get("fraud_loss_usd", 0.0))
205211

206212
count = len(results)
213+
avg_final_score = _open_interval_score(total_score / count)
207214
return {
208215
"episodes": count,
209-
"avg_final_score": round(total_score / count, 4),
216+
"avg_final_score": avg_final_score,
210217
"avg_total_reward": round(total_reward / count, 4),
211218
"avg_closed_cases": round(closed_cases / count, 4),
212219
"avg_reopened_cases": round(reopened_cases / count, 4),

inference.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,8 @@
3939
"sla_queue_juggle": 5.4,
4040
"fraud_stockout_cascade": 7.6,
4141
}
42+
SCORE_MIN = 1e-9
43+
SCORE_MAX = 1.0 - 1e-9
4244

4345
SYSTEM_PROMPT = (
4446
"You are operating a customer-ops command center. Return ONLY a JSON object with keys: "
@@ -84,6 +86,10 @@ def _log_end(success: bool, steps: int, score: float, rewards: List[float]) -> N
8486
)
8587

8688

89+
def _open_interval_score(value: float) -> float:
    """Clamp *value* into the open interval bounded by SCORE_MIN and SCORE_MAX.

    Args:
        value: Raw score to clamp.

    Returns:
        ``SCORE_MIN`` when *value* is NaN or not greater than ``SCORE_MIN``,
        ``SCORE_MAX`` when *value* exceeds ``SCORE_MAX``, otherwise *value*
        unchanged.
    """
    # Every comparison against NaN is False, so testing "not greater than the
    # floor" sends NaN to SCORE_MIN. A plain max(min(...)) chain would instead
    # turn NaN into SCORE_MAX, which would then be logged as a successful run.
    if not value > SCORE_MIN:
        return SCORE_MIN
    return min(value, SCORE_MAX)
91+
92+
8793
def _parse_action(text: str) -> Dict[str, Any]:
8894
try:
8995
return json.loads(text)
@@ -260,7 +266,7 @@ def _run_task(client: OpenAI, task: str) -> None:
260266
_log_step(step=step, action=action_str, reward=reward, done=done, error=error)
261267

262268
score = sum(rewards) / MAX_TOTAL_REWARD[task] if MAX_TOTAL_REWARD[task] > 0 else 0.0
263-
score = max(0.0, min(1.0, score))
269+
score = _open_interval_score(score)
264270
success = score >= 0.4
265271
finally:
266272
_log_end(success=success, steps=steps_taken, score=score, rewards=rewards)

scripts/meta_review_eval.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,8 @@
2222
"sla_queue_juggle": 5.4,
2323
"fraud_stockout_cascade": 7.6,
2424
}
25+
SCORE_MIN = 1e-9
26+
SCORE_MAX = 1.0 - 1e-9
2527

2628

2729
@dataclass
@@ -36,6 +38,10 @@ class EpisodeStats:
3638
PolicyFn = Callable[[ShopopsObservation], ShopopsAction]
3739

3840

41+
def _open_interval_score(value: float) -> float:
    """Clamp *value* into the open interval bounded by SCORE_MIN and SCORE_MAX.

    Args:
        value: Raw score to clamp.

    Returns:
        ``SCORE_MIN`` when *value* is NaN or not greater than ``SCORE_MIN``,
        ``SCORE_MAX`` when *value* exceeds ``SCORE_MAX``, otherwise *value*
        unchanged.
    """
    # Every comparison against NaN is False, so testing "not greater than the
    # floor" sends NaN to SCORE_MIN. A plain max(min(...)) chain would instead
    # turn NaN into SCORE_MAX and report an undefined score as near-perfect.
    if not value > SCORE_MIN:
        return SCORE_MIN
    return min(value, SCORE_MAX)
43+
44+
3945
def _run_policy(task: str, seed: int, policy: PolicyFn) -> EpisodeStats:
4046
env = ShopopsEnvironment(debug_mode=True)
4147
obs = env.reset(seed=seed, task=task)
@@ -52,7 +58,7 @@ def _run_policy(task: str, seed: int, policy: PolicyFn) -> EpisodeStats:
5258
return EpisodeStats(
5359
total_reward=round(total_reward, 4),
5460
normalized_reward=round(
55-
max(0.0, min(1.0, total_reward / NORMALIZATION_CAPS[task])),
61+
_open_interval_score(total_reward / NORMALIZATION_CAPS[task]),
5662
4,
5763
),
5864
final_score=float(summary.get("final_score", 0.0)),

server/shopOps_environment.py

Lines changed: 38 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -55,6 +55,7 @@
5555
INVALID_LIMIT = 4
5656
SCORE_MIN = 1e-9
5757
SCORE_MAX = 1.0 - 1e-9
58+
STEP_REWARD_MIN = 0.01
5859
TASK_ALIASES = {
5960
"easy": "refund_policy_recovery",
6061
"medium": "sla_queue_juggle",
@@ -336,7 +337,7 @@ def step(
336337

337338
if self._is_done():
338339
return self._build_observation(
339-
reward=0.0,
340+
reward=STEP_REWARD_MIN,
340341
done=True,
341342
info={"already_done": True, "termination_reason": self._termination_reason()},
342343
)
@@ -349,7 +350,7 @@ def step(
349350
self._state.step_count += 1
350351
self._advance_events()
351352
self._update_sla_breaches()
352-
reward = -0.25
353+
reward = STEP_REWARD_MIN
353354
self._cumulative_reward += reward
354355
self._latest_tool_result = ToolResult(
355356
action_type=action.action_type,
@@ -364,8 +365,8 @@ def step(
364365
"last_action_error": validation_error,
365366
"reward_breakdown": {
366367
"information_gain": 0.0,
367-
"workflow_progress": -0.15,
368-
"business_outcome": -0.1,
368+
"workflow_progress": STEP_REWARD_MIN,
369+
"business_outcome": STEP_REWARD_MIN,
369370
},
370371
}
371372
if done:
@@ -793,9 +794,9 @@ def _inspect_order(self, case: CaseInternal, action: ShopopsAction) -> ActionOut
793794
del action
794795
if "order" in case.completed_checks:
795796
return ActionOutcome(
796-
reward=-0.03,
797+
reward=0.02,
797798
summary="Order details were already inspected.",
798-
details={"reward_breakdown": {"information_gain": -0.03}},
799+
details={"reward_breakdown": {"information_gain": 0.02}},
799800
)
800801
case.completed_checks.add("order")
801802
case.order_summary = case.order_details_text
@@ -812,9 +813,9 @@ def _inspect_policy(self, case: CaseInternal, action: ShopopsAction) -> ActionOu
812813
del action
813814
if "policy" in case.completed_checks:
814815
return ActionOutcome(
815-
reward=-0.03,
816+
reward=0.02,
816817
summary="Policy details were already inspected.",
817-
details={"reward_breakdown": {"information_gain": -0.03}},
818+
details={"reward_breakdown": {"information_gain": 0.02}},
818819
)
819820
case.completed_checks.add("policy")
820821
case.policy_summary = case.policy_details_text or "No special policy guidance for this case."
@@ -832,9 +833,9 @@ def _inspect_inventory(self, case: CaseInternal, action: ShopopsAction) -> Actio
832833
del action
833834
if "inventory" in case.completed_checks:
834835
return ActionOutcome(
835-
reward=-0.03,
836+
reward=0.02,
836837
summary="Inventory was already inspected.",
837-
details={"reward_breakdown": {"information_gain": -0.03}},
838+
details={"reward_breakdown": {"information_gain": 0.02}},
838839
)
839840
case.completed_checks.add("inventory")
840841
sku = case.replacement_sku or "none"
@@ -853,9 +854,9 @@ def _inspect_customer_history(self, case: CaseInternal, action: ShopopsAction) -
853854
del action
854855
if "history" in case.completed_checks:
855856
return ActionOutcome(
856-
reward=-0.03,
857+
reward=0.02,
857858
summary="Customer history was already inspected.",
858-
details={"reward_breakdown": {"information_gain": -0.03}},
859+
details={"reward_breakdown": {"information_gain": 0.02}},
859860
)
860861
case.completed_checks.add("history")
861862
case.history_summary = case.history_details_text or "No significant customer history was found."
@@ -873,21 +874,21 @@ def _request_evidence(self, case: CaseInternal, action: ShopopsAction) -> Action
873874
del action
874875
if not case.needs_evidence:
875876
return ActionOutcome(
876-
reward=-0.05,
877+
reward=0.01,
877878
summary="This case does not require customer evidence.",
878-
details={"reward_breakdown": {"workflow_progress": -0.05}},
879+
details={"reward_breakdown": {"workflow_progress": 0.01}},
879880
)
880881
if case.evidence_status == EvidenceStatus.REQUESTED:
881882
return ActionOutcome(
882-
reward=-0.03,
883+
reward=0.02,
883884
summary="Evidence request is already pending.",
884-
details={"reward_breakdown": {"workflow_progress": -0.03}},
885+
details={"reward_breakdown": {"workflow_progress": 0.02}},
885886
)
886887
if case.evidence_status in {EvidenceStatus.RECEIVED, EvidenceStatus.INSUFFICIENT}:
887888
return ActionOutcome(
888-
reward=-0.02,
889+
reward=0.02,
889890
summary="Evidence result is already available.",
890-
details={"reward_breakdown": {"workflow_progress": -0.02}},
891+
details={"reward_breakdown": {"workflow_progress": 0.02}},
891892
)
892893
case.evidence_status = EvidenceStatus.REQUESTED
893894
case.status = CaseStatus.WAITING_CUSTOMER
@@ -912,21 +913,21 @@ def _contact_carrier(self, case: CaseInternal, action: ShopopsAction) -> ActionO
912913
del action
913914
if not case.needs_carrier_contact:
914915
return ActionOutcome(
915-
reward=-0.05,
916+
reward=0.01,
916917
summary="Carrier contact is not needed for this case.",
917-
details={"reward_breakdown": {"workflow_progress": -0.05}},
918+
details={"reward_breakdown": {"workflow_progress": 0.01}},
918919
)
919920
if case.carrier_status == CarrierStatus.INVESTIGATING:
920921
return ActionOutcome(
921-
reward=-0.03,
922+
reward=0.02,
922923
summary="Carrier investigation is already pending.",
923-
details={"reward_breakdown": {"workflow_progress": -0.03}},
924+
details={"reward_breakdown": {"workflow_progress": 0.02}},
924925
)
925926
if case.carrier_status in {CarrierStatus.APPROVED, CarrierStatus.DENIED}:
926927
return ActionOutcome(
927-
reward=-0.02,
928+
reward=0.02,
928929
summary="Carrier result is already available.",
929-
details={"reward_breakdown": {"workflow_progress": -0.02}},
930+
details={"reward_breakdown": {"workflow_progress": 0.02}},
930931
)
931932
case.carrier_status = CarrierStatus.INVESTIGATING
932933
case.status = CaseStatus.WAITING_CARRIER
@@ -956,13 +957,13 @@ def _issue_refund(self, case: CaseInternal, action: ShopopsAction) -> ActionOutc
956957
fit = self._refund_fit(case, amount)
957958
workflow = self._check_coverage(case)
958959
business = 0.22 + fit
959-
workflow_reward = 0.08 if workflow >= 1.0 else max(-0.08, 0.08 * (workflow - 1.0))
960+
workflow_reward = 0.08 if workflow >= 1.0 else max(0.01, 0.08 * workflow)
960961
if case.fraud_signal == FraudSignal.HIGH and case.evidence_status in {
961962
EvidenceStatus.NOT_REQUESTED,
962963
EvidenceStatus.REQUESTED,
963964
}:
964965
business -= 0.18
965-
reward = max(-0.2, business + workflow_reward)
966+
reward = max(STEP_REWARD_MIN, business + workflow_reward)
966967
case.resolution_summary = f"Refund of ${amount:.2f} prepared."
967968
return ActionOutcome(
968969
reward=reward,
@@ -982,9 +983,9 @@ def _ship_replacement(self, case: CaseInternal, action: ShopopsAction) -> Action
982983
if units <= 0:
983984
self._stockouts += 1
984985
return ActionOutcome(
985-
reward=-0.2,
986+
reward=STEP_REWARD_MIN,
986987
summary="Replacement failed because inventory is exhausted.",
987-
details={"reward_breakdown": {"business_outcome": -0.2}},
988+
details={"reward_breakdown": {"business_outcome": STEP_REWARD_MIN}},
988989
)
989990
self._inventory[sku] = units - 1
990991
case.resolution_action = ActionType.SHIP_REPLACEMENT
@@ -995,7 +996,7 @@ def _ship_replacement(self, case: CaseInternal, action: ShopopsAction) -> Action
995996
workflow = self._check_coverage(case)
996997
expedite_bonus = 0.08 if action.expedite == case.preferred_expedite else -0.04
997998
resolution_bonus = 0.26 if case.preferred_resolution == ActionType.SHIP_REPLACEMENT else -0.12
998-
reward = max(-0.2, resolution_bonus + expedite_bonus + 0.06 * workflow)
999+
reward = max(STEP_REWARD_MIN, resolution_bonus + expedite_bonus + 0.06 * workflow)
9991000
case.resolution_summary = (
10001001
f"Replacement for {sku} queued{' with expedite' if action.expedite else ''}."
10011002
)
@@ -1038,9 +1039,9 @@ def _add_internal_note(self, case: CaseInternal, action: ShopopsAction) -> Actio
10381039
note_code = action.note_code or "general_note"
10391040
if note_code in case.notes:
10401041
return ActionOutcome(
1041-
reward=-0.02,
1042+
reward=0.02,
10421043
summary="That note already exists on the case.",
1043-
details={"reward_breakdown": {"workflow_progress": -0.02}},
1044+
details={"reward_breakdown": {"workflow_progress": 0.02}},
10441045
)
10451046
case.notes.append(note_code)
10461047
reward = 0.05 if case.requires_note else 0.01
@@ -1070,12 +1071,12 @@ def _close_case(self, case: CaseInternal, action: ShopopsAction) -> ActionOutcom
10701071
)
10711072
case.resolution_summary = case.resolution_summary or "Case closed."
10721073
return ActionOutcome(
1073-
reward=max(-0.25, reward),
1074+
reward=max(STEP_REWARD_MIN, reward),
10741075
summary=f"Case {case.case_id} closed.",
10751076
details={
10761077
"reward_breakdown": {
10771078
"workflow_progress": 0.12,
1078-
"business_outcome": max(-0.37, reward - 0.12),
1079+
"business_outcome": max(STEP_REWARD_MIN, reward - 0.12),
10791080
},
10801081
"closure_quality": round(quality, 4),
10811082
"remaining_blockers": blockers,
@@ -1087,15 +1088,15 @@ def _switch_case(self, case: CaseInternal, action: ShopopsAction) -> ActionOutco
10871088
target = self._case_by_id(action.case_id or "")
10881089
if target is None:
10891090
return ActionOutcome(
1090-
reward=-0.1,
1091+
reward=STEP_REWARD_MIN,
10911092
summary="Cannot switch because the target case does not exist.",
1092-
details={"reward_breakdown": {"workflow_progress": -0.1}},
1093+
details={"reward_breakdown": {"workflow_progress": STEP_REWARD_MIN}},
10931094
)
10941095
if target.case_id == self._active_case_id:
10951096
return ActionOutcome(
1096-
reward=-0.02,
1097+
reward=0.02,
10971098
summary="The target case is already active.",
1098-
details={"reward_breakdown": {"workflow_progress": -0.02}},
1099+
details={"reward_breakdown": {"workflow_progress": 0.02}},
10991100
)
11001101
current = self._active_case()
11011102
self._active_case_id = target.case_id

tests/test_metrics_baselines.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@ def test_eval_aggregate_metrics() -> None:
2727
assert "avg_final_score" in summary
2828
assert "avg_total_reward" in summary
2929
assert "avg_closed_cases" in summary
30+
assert 0.0 < summary["avg_final_score"] < 1.0
3031

3132

3233
def test_baseline_scores_are_monotonic_by_difficulty_seed_1() -> None:

tests/test_validation_resources_rewards.py

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
ShopopsAction,
77
)
88
from shopOps.server.shopOps_environment import ShopopsEnvironment
9+
from shopOps.eval import baseline_policy, TASKS
910

1011

1112
def test_validation_missing_refund_amount() -> None:
@@ -69,3 +70,21 @@ def test_escalation_requires_reason() -> None:
6970

7071
obs = env.step(ShopopsAction(action_type=ActionType.ESCALATE_RISK))
7172
assert obs.metadata["last_action_error"] == "escalation_reason_required"
73+
74+
75+
def test_all_returned_rewards_stay_in_open_interval() -> None:
    """Check that every reward returned by ``step`` lies strictly inside (0, 1).

    For each entry in ``TASKS`` the baseline policy is rolled out from seed 1
    until the environment reports ``done``. A single ``CLOSE_CASE`` action is
    then sent to a freshly reset environment and its reward is checked too.
    """
    # Cap the rollout length so an episode that never reports ``done`` fails
    # this test instead of hanging the whole suite.
    max_steps = 10_000
    for task in TASKS:
        env = ShopopsEnvironment(debug_mode=True)
        obs = env.reset(seed=1, task=task)
        for _ in range(max_steps):
            obs = env.step(baseline_policy(obs))
            assert obs.reward is not None
            assert 0.0 < obs.reward < 1.0
            if obs.done:
                break
        else:
            raise AssertionError(
                f"episode for task {task!r} did not finish within {max_steps} steps"
            )

        # Closing a case as the very first action of a fresh episode.
        # NOTE(review): the variable name suggests this action is expected to
        # be rejected by validation -- confirm against the environment.
        env = ShopopsEnvironment(debug_mode=True)
        env.reset(seed=1, task=task)
        invalid = env.step(ShopopsAction(action_type=ActionType.CLOSE_CASE))
        assert invalid.reward is not None
        assert 0.0 < invalid.reward < 1.0

0 commit comments

Comments
 (0)