Skip to content

Commit a1cf313

Browse files
Benchmarking and inference
1 parent 16d3a02 commit a1cf313

6 files changed

Lines changed: 331 additions & 66 deletions

File tree

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
.env
2+
.venv/
3+
venv/
4+
__pycache__/
5+
.pytest_cache/

README.md

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,35 @@ export ENV_URL="http://localhost:8000"
152152
python inference.py
153153
```
154154

155+
## Submission Checklist
156+
157+
Run these before submitting:
158+
159+
1. **HF Space ping**
160+
Confirm your Space responds:
161+
`curl -s -o /dev/null -w "%{http_code}" -X POST "$PING_URL/reset"` should return `200`
162+
163+
2. **Docker build**
164+
`docker build -t shopops-env:latest -f server/Dockerfile .`
165+
166+
3. **OpenEnv validate**
167+
`openenv validate`
168+
169+
4. **Inference script**
170+
`set -a; source .env; set +a; python inference.py`
171+
Ensure `[START]`, `[STEP]`, `[END]` lines are emitted and the script exits cleanly.
172+
173+
5. **Graded tasks**
174+
Run your 3+ tasks/graders and verify all scores are in `[0.0, 1.0]`.
175+
176+
### Validator Script
177+
178+
If provided by the hackathon, run:
179+
180+
```bash
181+
./scripts/validate-submission.sh <ping_url> .
182+
```
183+
155184
## Test Results
156185

157186
Latest scenario test report:
@@ -161,6 +190,63 @@ outputs/test_report.txt
161190
outputs/test_report_full.txt
162191
```
163192

193+
## Baseline Scores
194+
195+
Rule-based baseline policy on test split (total-seeds=200 → 40 test episodes).
196+
197+
| Tier | Model | Avg Final Score |
198+
| --- | --- | --- |
199+
| easy | baseline_policy | 15.7861 |
200+
| medium | baseline_policy | 14.3358 |
201+
| hard | baseline_policy | 9.0594 |
202+
203+
## Model Benchmarks (Inference Script)
204+
205+
Inference-based benchmarks using `inference.py` against the local server, `MAX_STEPS=20`, 10 seeds.
206+
207+
| Model | Avg Score | Success Rate | Avg Steps | Seeds |
208+
| --- | --- | --- | --- | --- |
209+
| gpt-4o | 0.2825 | 100.0% | 20.0 | 10 |
210+
| gpt-4.1 | 0.2825 | 100.0% | 20.0 | 10 |
211+
| gpt-4.1-mini | 0.2825 | 100.0% | 20.0 | 10 |
212+
| gpt-4o-mini | 0.2825 | 100.0% | 20.0 | 10 |
213+
214+
Score is computed as average reward per step (`sum(rewards) / MAX_STEPS`), since the HTTP API does not expose `episode_summary`.
215+
216+
### Reproduce Benchmarks
217+
218+
These steps reproduce all metrics above on any machine with the repo:
219+
220+
1. **Install dependencies**
221+
```bash
222+
python3 -m venv venv
223+
source venv/bin/activate
224+
pip install -r server/requirements.txt
225+
pip install -e .
226+
```
227+
228+
2. **Start the environment server**
229+
```bash
230+
PORT=8000 python -m shopOps.server.app
231+
```
232+
233+
3. **Set required env vars**
234+
```bash
235+
export API_BASE_URL="https://api.openai.com/v1"
236+
export HF_TOKEN="<your_api_key>"
237+
export ENV_URL="http://localhost:8000"
238+
```
239+
240+
4. **Run the benchmark script**
241+
```bash
242+
cd shopOps
243+
BENCH_MODELS="gpt-4o,gpt-4.1,gpt-4.1-mini,gpt-4o-mini" \
244+
BENCH_SEEDS="1,2,3,4,5,6,7,8,9,10" \
245+
python scripts/benchmark_models.py
246+
```
247+
248+
The script prints a markdown table that matches the benchmark table above.
249+
164250
## Building the Docker Image
165251

166252
```bash

inference.py

Lines changed: 105 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,20 @@
22
import os
33
import re
44
import sys
5-
from typing import Any, Dict
5+
from typing import Any, Dict, List, Optional
66

77
import requests
88
from openai import OpenAI
99

10-
API_BASE_URL = os.getenv("API_BASE_URL")
11-
MODEL_NAME = os.getenv("MODEL_NAME")
10+
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
11+
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
1212
HF_TOKEN = os.getenv("HF_TOKEN")
1313
ENV_URL = os.getenv("ENV_URL", "http://localhost:8000")
1414

15+
TASK_NAME = os.getenv("TASK_NAME", "shopops")
16+
BENCHMARK = os.getenv("BENCHMARK", "shopops")
17+
MAX_STEPS = int(os.getenv("MAX_STEPS", "20"))
18+
1519
REQUIRED_VARS = {
1620
"API_BASE_URL": API_BASE_URL,
1721
"MODEL_NAME": MODEL_NAME,
@@ -45,70 +49,111 @@ def _safe_action() -> Dict[str, Any]:
4549
}
4650

4751

52+
def _log_start(task: str, env: str, model: str) -> None:
53+
print(f"[START] task={task} env={env} model={model}", flush=True)
54+
55+
56+
def _log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
57+
done_val = str(done).lower()
58+
error_val = error if error else "null"
59+
print(
60+
f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
61+
flush=True,
62+
)
63+
64+
65+
def _log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
66+
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
67+
print(
68+
f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}",
69+
flush=True,
70+
)
71+
72+
4873
def main() -> None:
    """Run one benchmark episode against the environment server.

    Resets the environment with SEED, then loops up to MAX_STEPS times:
    asks the model (through the OpenAI-compatible API) for a JSON action,
    posts it to the /step endpoint, and records the reward.  Emits the
    machine-readable [START]/[STEP]/[END] log lines; the [END] line is
    guaranteed even if the episode aborts mid-run (the ``finally`` block).

    Score is ``sum(rewards) / MAX_STEPS`` clamped to [0, 1] because the
    HTTP API does not expose an episode summary with a final score.
    """
    _require_env()
    client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)

    seed = int(os.getenv("SEED", "42"))
    _log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)

    rewards: List[float] = []
    steps_taken = 0
    success = False
    score = 0.0

    try:
        reset_resp = requests.post(f"{ENV_URL}/reset", json={"seed": seed})
        reset_resp.raise_for_status()
        payload = reset_resp.json()
        obs = payload["observation"]
        episode_id = obs.get("episode_id", "unknown")

        step = 1
        done = payload.get("done", False)

        while not done and step <= MAX_STEPS:
            prompt = (
                "You are an e-commerce ops agent. Return ONLY JSON with keys: "
                "action_type, refund_amount_usd, replacement_expedite, escalation_reason. "
                f"Observation: {json.dumps(obs)}"
            )

            # Any model/parse failure falls back to a safe no-op action so a
            # single flaky completion does not abort the whole episode.
            # (Was ``except Exception as exc`` with ``exc`` unused.)
            try:
                response = client.responses.create(
                    model=MODEL_NAME,
                    input=prompt,
                    text={"format": {"type": "json_object"}},
                )
                action = _parse_action(response.output_text)
            except Exception:
                action = _safe_action()

            step_resp = requests.post(
                f"{ENV_URL}/step",
                json={"action": action, "episode_id": episode_id},
            )
            step_payload: Dict[str, Any] = {}
            if step_resp.status_code == 200:
                step_payload = step_resp.json()
                # ``or 0.0`` also maps a JSON null reward to 0.0.
                reward = float(step_payload.get("reward") or 0.0)
                done = bool(step_payload.get("done", False))
                error = (
                    (step_payload.get("observation") or {})
                    .get("metadata", {})
                    .get("validation_error")
                )
            else:
                # Non-200 response: surface the server's error detail (or the
                # raw body / status code) and terminate the episode.
                try:
                    err_payload = step_resp.json()
                    error = err_payload.get("detail") or str(err_payload)
                except Exception:
                    error = step_resp.text or f"http_{step_resp.status_code}"
                reward = 0.0
                done = True

            rewards.append(reward)
            steps_taken = step

            _log_step(
                step=step,
                action=json.dumps(action, separators=(",", ":")),
                reward=reward,
                done=done,
                error=error,
            )

            # Only advance the observation on a successful step; on failure
            # ``done`` is already True so the stale obs is never reused.
            if step_payload:
                obs = step_payload["observation"]
            step += 1

        # HTTP API does not include episode_summary, so compute a normalized
        # score; clamping keeps it within [0, 1] for logging.
        score = sum(rewards) / float(MAX_STEPS) if MAX_STEPS > 0 else 0.0
        score = max(0.0, min(1.0, score))
        success = score > 0.0
    finally:
        # Always emit [END], even when reset/step raised mid-episode.
        _log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
112157

113158

114159
if __name__ == "__main__":

0 commit comments

Comments
 (0)