
Commit 3dd8ba2

Fixes for final submission
1 parent a1cf313 commit 3dd8ba2

9 files changed

Lines changed: 256 additions & 61 deletions


.openenvignore

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+venv/
+.venv/
+__pycache__/
+outputs/
+.pytest_cache/
+.mypy_cache/
+.ruff_cache/
+*.pyc

Dockerfile

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
+# Standalone Dockerfile for ShopOps OpenEnv environment.
+# Uses python:3.11-slim so docker build works without access to internal base images.
+#
+# Build:
+#   docker build -t shopops-env:latest .
+#
+# Run:
+#   docker run -p 8000:8000 shopops-env:latest
+
+FROM python:3.11-slim
+
+WORKDIR /app
+
+# Install curl (needed for HEALTHCHECK)
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends curl && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install Python dependencies
+COPY server/requirements.txt /tmp/requirements.txt
+RUN pip install --no-cache-dir -r /tmp/requirements.txt
+
+# Copy the full project so server.app:app and models.py are importable
+COPY . /app
+
+ENV PYTHONPATH="/app:$PYTHONPATH"
+ENV PORT=8000
+
+EXPOSE 8000
+
+HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
+    CMD curl -f http://localhost:${PORT}/health || exit 1
+
+CMD ["sh", "-c", "uvicorn server.app:app --host 0.0.0.0 --port ${PORT}"]

README.md

Lines changed: 18 additions & 11 deletions
@@ -1,6 +1,6 @@
 ---
 title: ShopOps Environment Server
-emoji: 🎥
+emoji: 🛒
 colorFrom: indigo
 colorTo: purple
 sdk: docker
@@ -160,8 +160,8 @@ Run these before submitting:
 Confirm your Space responds:
 `curl -s -o /dev/null -w "%{http_code}" -X POST "$PING_URL/reset"` → `200`
 
-2. **Docker build**
-   `docker build -t shopops-env:latest -f server/Dockerfile .`
+2. **Docker build**
+   `docker build -t shopops-env:latest .`
 
 3. **OpenEnv validate**
    `openenv validate`
@@ -202,16 +202,19 @@ Rule-based baseline policy on test split (total-seeds=200 → 40 test episodes).
 
 ## Model Benchmarks (Inference Script)
 
-Inference-based benchmarks using `inference.py` against the local server, `MAX_STEPS=20`, 10 seeds.
+Inference-based benchmarks using `inference.py` against the local server, `MAX_STEPS=20`, `SEED=42`.
+`inference.py` runs all three tiers sequentially and emits one `[START]`/`[STEP]+`/`[END]` block per tier.
 
-| Model | Avg Score | Success Rate | Avg Steps | Seeds |
-| --- | --- | --- | --- | --- |
-| gpt-4o | 0.2825 | 100.0% | 20.0 | 10 |
-| gpt-4.1 | 0.2825 | 100.0% | 20.0 | 10 |
-| gpt-4.1-mini | 0.2825 | 100.0% | 20.0 | 10 |
-| gpt-4o-mini | 0.2825 | 100.0% | 20.0 | 10 |
+Score formula: `max(0, min(1, sum(rewards) / MAX_STEPS))` — normalises cumulative reward
+against the theoretical ceiling of 1.0 per step × 20 steps.
 
-Score is computed as average reward per step (`sum(rewards) / MAX_STEPS`), since the HTTP API does not expose `episode_summary`.
+| Model | Tier | Score |
+| --- | --- | --- |
+| Qwen2.5-72B-Instruct | easy | TBD |
+| Qwen2.5-72B-Instruct | medium | TBD |
+| Qwen2.5-72B-Instruct | hard | TBD |
+
+Re-run benchmarks after setting env vars (see **Reproduce Benchmarks** below).
 
 ### Reproduce Benchmarks
 
@@ -250,6 +253,10 @@ The script prints a markdown table that matches the benchmark table above.
 ## Building the Docker Image
 
 ```bash
+# Standalone build (uses root Dockerfile, no internal base image required)
+docker build -t shopops-env:latest .
+
+# Or explicitly with the in-repo Dockerfile:
 docker build -t shopops-env:latest -f server/Dockerfile .
 ```
 

121 Bytes
Binary file not shown.

__pycache__/client.cpython-314.pyc

56 Bytes
Binary file not shown.

graders.py

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Graders for the ShopOps OpenEnv tasks.
+
+Each grader receives the full episode trajectory (list of step dicts, each
+containing at least a "reward" key) and returns a normalised score in [0.0, 1.0].
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, List
+
+# Must match MAX_CASES in shopOps_environment.py.
+# Used as the theoretical maximum total reward (1.0 per step × 20 steps).
+_MAX_STEPS = 20
+
+
+class ScoreGrader:
+    """
+    Trajectory-quality grader for ShopOps.
+
+    Scoring formula
+    ---------------
+    Per-step rewards are in the range [-1.0, 1.0]:
+      * +0.0 – 1.0 for valid actions (weighted correctness + efficiency + priority)
+      * -1.0 for invalid actions
+
+    The grader sums all rewards and divides by the theoretical maximum
+    (_MAX_STEPS × 1.0 = 20.0), then clamps the result to [0.0, 1.0]:
+
+        score = clamp(sum(rewards) / _MAX_STEPS, 0.0, 1.0)
+
+    This means:
+      * A perfect agent that scores 1.0 every step → score = 1.0
+      * An agent that always rejects correctly → score ≈ 0.45–0.75 (task-dependent)
+      * An agent that triggers the invalid limit → score = 0.0 (clamped)
+
+    The grader is deterministic: identical trajectories always yield the same score.
+    """
+
+    def grade(self, trajectory: List[Dict[str, Any]]) -> float:
+        """
+        Score a completed episode.
+
+        Args:
+            trajectory: List of step dicts. Each dict must contain a "reward"
+                key whose value is a float (or None, treated as 0.0).
+
+        Returns:
+            Normalised score in [0.0, 1.0].
+        """
+        if not trajectory:
+            return 0.0
+
+        total_reward = sum(float(step.get("reward") or 0.0) for step in trajectory)
+        score = total_reward / _MAX_STEPS
+        return float(max(0.0, min(1.0, score)))
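As a quick sanity check, the grading formula above can be exercised standalone. This sketch mirrors the `ScoreGrader.grade` logic added in this commit as a plain function; the sample trajectories are hypothetical, not real server output:

```python
from typing import Any, Dict, List

_MAX_STEPS = 20  # must match the module constant in graders.py


def grade(trajectory: List[Dict[str, Any]]) -> float:
    """Sum per-step rewards, divide by the 20-point ceiling, clamp to [0, 1]."""
    if not trajectory:
        return 0.0
    # None rewards are treated as 0.0, matching the committed grader.
    total = sum(float(step.get("reward") or 0.0) for step in trajectory)
    return float(max(0.0, min(1.0, total / _MAX_STEPS)))


# Hypothetical episode: two good actions, one invalid (-1.0), one missing reward.
steps = [{"reward": 1.0}, {"reward": 0.8}, {"reward": -1.0}, {"reward": None}]
print(round(grade(steps), 6))  # 0.04
print(grade([]))               # 0.0
```

Note that the clamp also caps pathological inputs: a trajectory whose rewards sum above 20 still scores exactly 1.0.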

inference.py

Lines changed: 96 additions & 45 deletions
@@ -1,3 +1,20 @@
+"""
+ShopOps Inference Script
+========================
+Runs the LLM agent against all three difficulty tiers (easy, medium, hard)
+and emits strict [START] / [STEP] / [END] logs to stdout.
+
+Required environment variables:
+    API_BASE_URL – LLM API endpoint (default: https://router.huggingface.co/v1)
+    MODEL_NAME – model identifier (default: Qwen/Qwen2.5-72B-Instruct)
+    HF_TOKEN – Hugging Face / API key (required)
+
+Optional:
+    ENV_URL – environment server URL (default: http://localhost:8000)
+    MAX_STEPS – max steps per episode (default: 20)
+    SEED – random seed for reproducibility (default: 42)
+"""
+
 import json
 import os
 import re
@@ -12,29 +29,47 @@
 HF_TOKEN = os.getenv("HF_TOKEN")
 ENV_URL = os.getenv("ENV_URL", "http://localhost:8000")
 
-TASK_NAME = os.getenv("TASK_NAME", "shopops")
-BENCHMARK = os.getenv("BENCHMARK", "shopops")
+BENCHMARK = "shopops"
 MAX_STEPS = int(os.getenv("MAX_STEPS", "20"))
-
-REQUIRED_VARS = {
-    "API_BASE_URL": API_BASE_URL,
-    "MODEL_NAME": MODEL_NAME,
-    "HF_TOKEN": HF_TOKEN,
-}
+SEED = int(os.getenv("SEED", "42"))
+TIERS = ["easy", "medium", "hard"]
+
+# Max theoretical reward per step is 1.0 (correctness=1, efficiency=1, priority=1).
+# Normalise cumulative reward against this ceiling so score stays in [0, 1].
+MAX_REWARD_PER_EPISODE = float(MAX_STEPS)
+
+_SYSTEM_PROMPT = (
+    "You are an e-commerce support agent. Analyse the case and return ONLY a valid JSON object "
+    "with exactly these four keys: action_type, refund_amount_usd, replacement_expedite, escalation_reason.\n\n"
+    "action_type choices:\n"
+    "  refund – set refund_amount_usd to a positive float <= order value\n"
+    "  replace – set replacement_expedite to true/false\n"
+    "  escalate – set escalation_reason to one of: suspected_fraud | high_value | policy_exception | safety_issue\n"
+    "  reject – no extra fields needed (set others to null/false)\n\n"
+    "Decision rules:\n"
+    "  fraud_signal=high → escalate, suspected_fraud\n"
+    "  fraud_signal=medium → reject\n"
+    "  refund_request + return window closed → reject\n"
+    "  delivery lost → replace\n"
+    "  delivery delayed → refund 20% of order value\n"
+    "  delivery in_transit → escalate, policy_exception\n"
+    "  wrong_item with evidence → replace\n"
+    "  wrong_item gold/platinum, few refunds → replace\n"
+    "  default → reject\n"
+)
 
 
 def _require_env() -> None:
-    missing = [key for key, value in REQUIRED_VARS.items() if not value]
-    if missing:
-        print("Missing required env vars: " + ", ".join(missing))
+    if not HF_TOKEN:
+        print("Missing required env var: HF_TOKEN", flush=True)
         sys.exit(2)
 
 
 def _parse_action(text: str) -> Dict[str, Any]:
     try:
         return json.loads(text)
     except json.JSONDecodeError:
-        match = re.search(r"\{.*\}", text, re.DOTALL)
+        match = re.search(r"\{.*?\}", text, re.DOTALL)
         if match:
             return json.loads(match.group(0))
         raise
@@ -70,50 +105,62 @@ def _log_end(success: bool, steps: int, score: float, rewards: List[float]) -> N
     )
 
 
-def main() -> None:
-    _require_env()
-    client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
-
-    seed = int(os.getenv("SEED", "42"))
-    _log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
+def _get_action(client: OpenAI, obs: Dict[str, Any]) -> Dict[str, Any]:
+    """Call the LLM to decide an action; fall back to reject on any error."""
+    user_msg = (
+        f"Case: {json.dumps(obs.get('case', {}))}\n"
+        f"Resources: {json.dumps(obs.get('resources', {}))}\n"
+        f"Tier: {obs.get('tier', 'unknown')}\n\n"
+        "Return ONLY the JSON object."
+    )
+    try:
+        response = client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=[
+                {"role": "system", "content": _SYSTEM_PROMPT},
+                {"role": "user", "content": user_msg},
+            ],
+            temperature=0.0,
+            max_tokens=150,
+        )
+        text = (response.choices[0].message.content or "").strip()
+        return _parse_action(text)
+    except Exception as exc:
+        print(f"[DEBUG] LLM call failed: {exc}", flush=True)
+        return _safe_action()
+
+
+def _run_tier(client: OpenAI, tier: str) -> None:
+    """Run one full episode for the given tier, emitting START / STEP / END logs."""
+    _log_start(task=tier, env=BENCHMARK, model=MODEL_NAME)
 
     rewards: List[float] = []
     steps_taken = 0
     success = False
     score = 0.0
 
     try:
-        reset_resp = requests.post(f"{ENV_URL}/reset", json={"seed": seed})
+        reset_resp = requests.post(
+            f"{ENV_URL}/reset",
+            json={"seed": SEED, "tier": tier},
+            timeout=30,
+        )
         reset_resp.raise_for_status()
         payload = reset_resp.json()
-        obs = payload["observation"]
+        obs = payload.get("observation", {})
        episode_id = obs.get("episode_id", "unknown")
-
-        step = 1
         done = payload.get("done", False)
 
+        step = 1
         while not done and step <= MAX_STEPS:
-            prompt = (
-                "You are an e-commerce ops agent. Return ONLY JSON with keys: "
-                "action_type, refund_amount_usd, replacement_expedite, escalation_reason. "
-                f"Observation: {json.dumps(obs)}"
-            )
-
-            try:
-                response = client.responses.create(
-                    model=MODEL_NAME,
-                    input=prompt,
-                    text={"format": {"type": "json_object"}},
-                )
-                action = _parse_action(response.output_text)
-            except Exception as exc:
-                action = _safe_action()
+            action = _get_action(client, obs)
 
             step_resp = requests.post(
                 f"{ENV_URL}/step",
                 json={"action": action, "episode_id": episode_id},
+                timeout=30,
             )
-            step_payload = {}
+            error: Optional[str] = None
             if step_resp.status_code == 200:
                 step_payload = step_resp.json()
                 reward = float(step_payload.get("reward") or 0.0)
@@ -123,6 +170,7 @@ def main() -> None:
                     .get("metadata", {})
                     .get("validation_error")
                 )
+                obs = step_payload.get("observation", obs)
             else:
                 try:
                     err_payload = step_resp.json()
@@ -134,27 +182,30 @@ def main() -> None:
 
             rewards.append(reward)
             steps_taken = step
-
             _log_step(
                 step=step,
                 action=json.dumps(action, separators=(",", ":")),
                 reward=reward,
                 done=done,
                 error=error,
             )
-
-            if step_payload:
-                obs = step_payload["observation"]
             step += 1
 
-        # HTTP API does not include episode_summary, so compute a normalized score.
-        # This keeps score within [0, 1] for logging.
-        score = sum(rewards) / float(MAX_STEPS) if MAX_STEPS > 0 else 0.0
+        # Normalise: max reward per step = 1.0, so dividing by MAX_STEPS maps [0, 20] → [0, 1].
+        # Negative rewards are clamped to 0.
+        score = sum(rewards) / MAX_REWARD_PER_EPISODE if MAX_REWARD_PER_EPISODE > 0 else 0.0
         score = max(0.0, min(1.0, score))
         success = score > 0.0
     finally:
         _log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
 
 
+def main() -> None:
+    _require_env()
+    client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
+    for tier in TIERS:
+        _run_tier(client, tier)
+
+
 if __name__ == "__main__":
     main()
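The commit swaps the greedy `\{.*\}` in `_parse_action` for the non-greedy `\{.*?\}` when recovering JSON from chatty model output. This standalone sketch mirrors that fallback (the sample model replies are hypothetical); note that the non-greedy pattern stops at the first closing brace, which is safe for the flat four-key action object but would truncate nested JSON:

```python
import json
import re
from typing import Any, Dict


def parse_action(text: str) -> Dict[str, Any]:
    """Parse a JSON action; fall back to the first brace-delimited span."""
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Non-greedy: matches up to the first "}". Fine for flat objects like
        # the four-key action dict; nested JSON would be cut short.
        match = re.search(r"\{.*?\}", text, re.DOTALL)
        if match:
            return json.loads(match.group(0))
        raise


# Hypothetical reply where the model wrapped the JSON in prose:
raw = 'Sure! My decision: {"action_type": "reject", "refund_amount_usd": null} Hope that helps.'
print(parse_action(raw)["action_type"])  # reject
```

If it did need to tolerate nested objects, a brace-counting scan would be the usual fix; for this flat schema the regex fallback keeps the script simple.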
