
Commit 113c522

fix: clamp grader scores to open interval (0, 1) to pass Phase 2 validation
The OpenEnv validator requires task scores to be strictly between 0 and 1.
ScoreGrader was returning exactly 0.0 (empty trajectory or all-negative
rewards) and exactly 1.0 (perfect agent), causing all three tasks to fail
the score-range check and the "at least 3 tasks with graders" check.

Changed clamping bounds from [0.0, 1.0] to [_SCORE_MIN, _SCORE_MAX],
where _SCORE_MIN = 1e-9 and _SCORE_MAX = 1 - 1e-9.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 3dd8ba2 commit 113c522
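The failure mode and the fix can be sketched in a few lines. Note that the OpenEnv validator's actual check is not part of this commit, so `passes_score_range_check` below is an assumed stand-in for its "strictly between 0 and 1" rule:

```python
# Sketch of the bug described in the commit message. The names _SCORE_MIN and
# _SCORE_MAX come from the diff; passes_score_range_check is a hypothetical
# stand-in for the validator's score-range rule.
_SCORE_MIN = 1e-9
_SCORE_MAX = 1.0 - 1e-9


def passes_score_range_check(score: float) -> bool:
    # Assumed validator rule: score must lie in the open interval (0, 1).
    return 0.0 < score < 1.0


def old_clamp(score: float) -> float:
    # Pre-fix behaviour: closed interval, so exact 0.0 and 1.0 leak through.
    return max(0.0, min(1.0, score))


def new_clamp(score: float) -> float:
    # Post-fix behaviour: nudged bounds keep the result strictly inside (0, 1).
    return max(_SCORE_MIN, min(_SCORE_MAX, score))


print(passes_score_range_check(old_clamp(1.0)))  # False: a perfect agent fails
print(passes_score_range_check(new_clamp(1.0)))  # True: capped at 1 - 1e-9
```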

1 file changed: graders.py (15 additions, 8 deletions)
--- a/graders.py
+++ b/graders.py
@@ -8,7 +8,8 @@
 Graders for the ShopOps OpenEnv tasks.
 
 Each grader receives the full episode trajectory (list of step dicts, each
-containing at least a "reward" key) and returns a normalised score in [0.0, 1.0].
+containing at least a "reward" key) and returns a normalised score strictly
+in (0.0, 1.0) — i.e. scores are always > 0 and < 1 as required by the spec.
 """
 
 from __future__ import annotations
@@ -19,6 +20,10 @@
 # Used as the theoretical maximum total reward (1.0 per step × 20 steps).
 _MAX_STEPS = 20
 
+# Scores must be strictly between 0 and 1 (exclusive) per the OpenEnv spec.
+_SCORE_MIN = 1e-9
+_SCORE_MAX = 1.0 - 1e-9
+
 
 class ScoreGrader:
     """
@@ -31,14 +36,16 @@ class ScoreGrader:
     * -1.0 for invalid actions
 
     The grader sums all rewards and divides by the theoretical maximum
-    (_MAX_STEPS × 1.0 = 20.0), then clamps the result to [0.0, 1.0]:
+    (_MAX_STEPS × 1.0 = 20.0), then clamps the result to the open interval
+    (_SCORE_MIN, _SCORE_MAX) so the returned value is always strictly between
+    0 and 1 (never exactly 0.0 or 1.0):
 
-        score = clamp(sum(rewards) / _MAX_STEPS, 0.0, 1.0)
+        score = clamp(sum(rewards) / _MAX_STEPS, _SCORE_MIN, _SCORE_MAX)
 
     This means:
-    * A perfect agent that scores 1.0 every step → score = 1.0
+    * A perfect agent that scores 1.0 every step → score ≈ 1.0 (capped at _SCORE_MAX)
     * An agent that always rejects correctly → score ≈ 0.45–0.75 (task-dependent)
-    * An agent that triggers the invalid limit → score = 0.0 (clamped)
+    * An agent that triggers the invalid limit → score ≈ 0.0 (floored at _SCORE_MIN)
 
     The grader is deterministic: identical trajectories always yield the same score.
     """
@@ -52,11 +59,11 @@ def grade(self, trajectory: List[Dict[str, Any]]) -> float:
             key whose value is a float (or None, treated as 0.0).
 
         Returns:
-            Normalised score in [0.0, 1.0].
+            Normalised score strictly in (0.0, 1.0).
         """
         if not trajectory:
-            return 0.0
+            return _SCORE_MIN
 
         total_reward = sum(float(step.get("reward") or 0.0) for step in trajectory)
         score = total_reward / _MAX_STEPS
-        return float(max(0.0, min(1.0, score)))
+        return float(max(_SCORE_MIN, min(_SCORE_MAX, score)))
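The patched grading logic can be exercised standalone to confirm the boundary cases named in the docstring. This sketch trims the class wrapper but otherwise reuses the lines shown in the diff:

```python
from typing import Any, Dict, List

# Constants as introduced by this commit.
_MAX_STEPS = 20
_SCORE_MIN = 1e-9
_SCORE_MAX = 1.0 - 1e-9


def grade(trajectory: List[Dict[str, Any]]) -> float:
    """Patched grade() from the diff above, with the class wrapper omitted."""
    if not trajectory:
        return _SCORE_MIN
    total_reward = sum(float(step.get("reward") or 0.0) for step in trajectory)
    score = total_reward / _MAX_STEPS
    return float(max(_SCORE_MIN, min(_SCORE_MAX, score)))


# Boundary cases that previously returned exactly 0.0 or 1.0:
perfect = [{"reward": 1.0}] * _MAX_STEPS
assert 0.0 < grade(perfect) < 1.0            # capped at _SCORE_MAX
assert 0.0 < grade([]) < 1.0                 # empty trajectory → _SCORE_MIN
assert 0.0 < grade([{"reward": -1.0}]) < 1.0  # negative total is floored
```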
