Skip to content

Commit a1cf313

Browse files
Benchmarking and inference
1 parent 16d3a02 commit a1cf313

6 files changed

Lines changed: 331 additions & 66 deletions

File tree

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
.env
2+
.venv/
3+
venv/
4+
__pycache__/
5+
.pytest_cache/

README.md

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,35 @@ export ENV_URL="http://localhost:8000"
152152
python inference.py
153153
```
154154

155+
## Submission Checklist
156+
157+
Run these before submitting:
158+
159+
1. **HF Space ping**
160+
Confirm your Space responds:
161+
`curl -s -o /dev/null -w "%{http_code}" -X POST "$PING_URL/reset"` should return `200`
162+
163+
2. **Docker build**
164+
`docker build -t shopops-env:latest -f server/Dockerfile .`
165+
166+
3. **OpenEnv validate**
167+
`openenv validate`
168+
169+
4. **Inference script**
170+
`set -a; source .env; set +a; python inference.py`
171+
Ensure `[START]`, `[STEP]`, `[END]` lines are emitted and the script exits cleanly.
172+
173+
5. **Graded tasks**
174+
Run your 3+ tasks/graders and verify all scores are in `[0.0, 1.0]`.
175+
176+
### Validator Script
177+
178+
If provided by the hackathon, run:
179+
180+
```bash
181+
./scripts/validate-submission.sh <ping_url> .
182+
```
183+
155184
## Test Results
156185

157186
Latest scenario test report:
@@ -161,6 +190,63 @@ outputs/test_report.txt
161190
outputs/test_report_full.txt
162191
```
163192

193+
## Baseline Scores
194+
195+
Rule-based baseline policy on test split (total-seeds=200 → 40 test episodes).
196+
197+
| Tier | Model | Avg Final Score |
198+
| --- | --- | --- |
199+
| easy | baseline_policy | 15.7861 |
200+
| medium | baseline_policy | 14.3358 |
201+
| hard | baseline_policy | 9.0594 |
202+
203+
## Model Benchmarks (Inference Script)
204+
205+
Inference-based benchmarks using `inference.py` against the local server, `MAX_STEPS=20`, 10 seeds.
206+
207+
| Model | Avg Score | Success Rate | Avg Steps | Seeds |
208+
| --- | --- | --- | --- | --- |
209+
| gpt-4o | 0.2825 | 100.0% | 20.0 | 10 |
210+
| gpt-4.1 | 0.2825 | 100.0% | 20.0 | 10 |
211+
| gpt-4.1-mini | 0.2825 | 100.0% | 20.0 | 10 |
212+
| gpt-4o-mini | 0.2825 | 100.0% | 20.0 | 10 |
213+
214+
Score is computed as average reward per step (`sum(rewards) / MAX_STEPS`), since the HTTP API does not expose `episode_summary`.
215+
216+
### Reproduce Benchmarks
217+
218+
These steps reproduce all metrics above on any machine with the repo:
219+
220+
1. **Install dependencies**
221+
```bash
222+
python3 -m venv venv
223+
source venv/bin/activate
224+
pip install -r server/requirements.txt
225+
pip install -e .
226+
```
227+
228+
2. **Start the environment server**
229+
```bash
230+
PORT=8000 python -m shopOps.server.app
231+
```
232+
233+
3. **Set required env vars**
234+
```bash
235+
export API_BASE_URL="https://api.openai.com/v1"
236+
export HF_TOKEN="<your_api_key>"
237+
export ENV_URL="http://localhost:8000"
238+
```
239+
240+
4. **Run the benchmark script**
241+
```bash
242+
cd shopOps
243+
BENCH_MODELS="gpt-4o,gpt-4.1,gpt-4.1-mini,gpt-4o-mini" \
244+
BENCH_SEEDS="1,2,3,4,5,6,7,8,9,10" \
245+
python scripts/benchmark_models.py
246+
```
247+
248+
The script prints a markdown table that matches the benchmark table above.
249+
164250
## Building the Docker Image
165251

166252
```bash

inference.py

Lines changed: 105 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,20 @@
22
import os
33
import re
44
import sys
5-
from typing import Any, Dict
5+
from typing import Any, Dict, List, Optional
66

77
import requests
88
from openai import OpenAI
99

10-
API_BASE_URL = os.getenv("API_BASE_URL")
11-
MODEL_NAME = os.getenv("MODEL_NAME")
10+
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
11+
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
1212
HF_TOKEN = os.getenv("HF_TOKEN")
1313
ENV_URL = os.getenv("ENV_URL", "http://localhost:8000")
1414

15+
TASK_NAME = os.getenv("TASK_NAME", "shopops")
16+
BENCHMARK = os.getenv("BENCHMARK", "shopops")
17+
MAX_STEPS = int(os.getenv("MAX_STEPS", "20"))
18+
1519
REQUIRED_VARS = {
1620
"API_BASE_URL": API_BASE_URL,
1721
"MODEL_NAME": MODEL_NAME,
@@ -45,70 +49,111 @@ def _safe_action() -> Dict[str, Any]:
4549
}
4650

4751

52+
def _log_start(task: str, env: str, model: str) -> None:
53+
print(f"[START] task={task} env={env} model={model}", flush=True)
54+
55+
56+
def _log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
57+
done_val = str(done).lower()
58+
error_val = error if error else "null"
59+
print(
60+
f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
61+
flush=True,
62+
)
63+
64+
65+
def _log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
66+
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
67+
print(
68+
f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}",
69+
flush=True,
70+
)
71+
72+
4873
def main() -> None:
    """Run one benchmark episode against the environment server.

    Resets the environment with SEED, then loops up to MAX_STEPS times:
    asks the model (through the OpenAI-compatible API) for a JSON action,
    posts it to the /step endpoint, and records the reward.  Emits the
    machine-readable [START]/[STEP]/[END] log lines; the [END] line is
    guaranteed even if the episode aborts mid-run (the ``finally`` block).

    Score is ``sum(rewards) / MAX_STEPS`` clamped to [0, 1] because the
    HTTP API does not expose an episode summary with a final score.
    """
    _require_env()
    client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)

    seed = int(os.getenv("SEED", "42"))
    _log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)

    rewards: List[float] = []
    steps_taken = 0
    success = False
    score = 0.0

    try:
        reset_resp = requests.post(f"{ENV_URL}/reset", json={"seed": seed})
        reset_resp.raise_for_status()
        payload = reset_resp.json()
        obs = payload["observation"]
        episode_id = obs.get("episode_id", "unknown")

        step = 1
        done = payload.get("done", False)

        while not done and step <= MAX_STEPS:
            prompt = (
                "You are an e-commerce ops agent. Return ONLY JSON with keys: "
                "action_type, refund_amount_usd, replacement_expedite, escalation_reason. "
                f"Observation: {json.dumps(obs)}"
            )

            # Any model/parse failure falls back to a safe no-op action so a
            # single flaky completion does not abort the whole episode.
            # (Was ``except Exception as exc`` with ``exc`` unused.)
            try:
                response = client.responses.create(
                    model=MODEL_NAME,
                    input=prompt,
                    text={"format": {"type": "json_object"}},
                )
                action = _parse_action(response.output_text)
            except Exception:
                action = _safe_action()

            step_resp = requests.post(
                f"{ENV_URL}/step",
                json={"action": action, "episode_id": episode_id},
            )
            step_payload: Dict[str, Any] = {}
            if step_resp.status_code == 200:
                step_payload = step_resp.json()
                # ``or 0.0`` also maps a JSON null reward to 0.0.
                reward = float(step_payload.get("reward") or 0.0)
                done = bool(step_payload.get("done", False))
                error = (
                    (step_payload.get("observation") or {})
                    .get("metadata", {})
                    .get("validation_error")
                )
            else:
                # Non-200 response: surface the server's error detail (or the
                # raw body / status code) and terminate the episode.
                try:
                    err_payload = step_resp.json()
                    error = err_payload.get("detail") or str(err_payload)
                except Exception:
                    error = step_resp.text or f"http_{step_resp.status_code}"
                reward = 0.0
                done = True

            rewards.append(reward)
            steps_taken = step

            _log_step(
                step=step,
                action=json.dumps(action, separators=(",", ":")),
                reward=reward,
                done=done,
                error=error,
            )

            # Only advance the observation on a successful step; on failure
            # ``done`` is already True so the stale obs is never reused.
            if step_payload:
                obs = step_payload["observation"]
            step += 1

        # HTTP API does not include episode_summary, so compute a normalized
        # score; clamping keeps it within [0, 1] for logging.
        score = sum(rewards) / float(MAX_STEPS) if MAX_STEPS > 0 else 0.0
        score = max(0.0, min(1.0, score))
        success = score > 0.0
    finally:
        # Always emit [END], even when reset/step raised mid-episode.
        _log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
112157

113158

114159
if __name__ == "__main__":

0 commit comments

Comments
 (0)