1+ """
2+ ShopOps Inference Script
3+ ========================
4+ Runs the LLM agent against all three difficulty tiers (easy, medium, hard)
5+ and emits strict [START] / [STEP] / [END] logs to stdout.
6+
7+ Required environment variables:
8+ API_BASE_URL – LLM API endpoint (default: https://router.huggingface.co/v1)
9+ MODEL_NAME – model identifier (default: Qwen/Qwen2.5-72B-Instruct)
10+ HF_TOKEN – Hugging Face / API key (required)
11+
12+ Optional:
13+ ENV_URL – environment server URL (default: http://localhost:8000)
14+ MAX_STEPS – max steps per episode (default: 20)
15+ SEED – random seed for reproducibility (default: 42)
16+ """
17+
118import json
219import os
320import re
# --- Configuration (resolved once at import time from the environment) ---
HF_TOKEN = os.getenv("HF_TOKEN")  # required; checked by _require_env()
ENV_URL = os.getenv("ENV_URL", "http://localhost:8000")  # environment server

BENCHMARK = "shopops"  # fixed benchmark identifier reported in [START] logs
MAX_STEPS = int(os.getenv("MAX_STEPS", "20"))  # hard cap on steps per episode
SEED = int(os.getenv("SEED", "42"))  # reset seed for reproducible episodes
TIERS = ["easy", "medium", "hard"]  # one episode is run per difficulty tier

# Each step can score at most 1.0 (correctness=1, efficiency=1, priority=1),
# so cumulative reward is bounded by MAX_STEPS; dividing by this ceiling
# keeps the reported score inside [0, 1].
MAX_REWARD_PER_EPISODE = float(MAX_STEPS)

# System prompt sent with every LLM call; spells out the exact JSON schema
# and the deterministic decision rules the agent should follow.
_SYSTEM_PROMPT = (
    "You are an e-commerce support agent. Analyse the case and return ONLY a valid JSON object "
    "with exactly these four keys: action_type, refund_amount_usd, replacement_expedite, escalation_reason.\n\n"
    "action_type choices:\n"
    "  refund – set refund_amount_usd to a positive float <= order value\n"
    "  replace – set replacement_expedite to true/false\n"
    "  escalate – set escalation_reason to one of: suspected_fraud | high_value | policy_exception | safety_issue\n"
    "  reject – no extra fields needed (set others to null/false)\n\n"
    "Decision rules:\n"
    "  fraud_signal=high → escalate, suspected_fraud\n"
    "  fraud_signal=medium → reject\n"
    "  refund_request + return window closed → reject\n"
    "  delivery lost → replace\n"
    "  delivery delayed → refund 20% of order value\n"
    "  delivery in_transit → escalate, policy_exception\n"
    "  wrong_item with evidence → replace\n"
    "  wrong_item gold/platinum, few refunds → replace\n"
    "  default → reject\n"
)
2460
2561
def _require_env() -> None:
    """Abort with exit code 2 unless the mandatory HF_TOKEN is set."""
    if HF_TOKEN:
        return
    print("Missing required env var: HF_TOKEN", flush=True)
    sys.exit(2)
3166
3267
def _parse_action(text: str) -> Dict[str, Any]:
    """Parse the model's reply into an action dict.

    Tries strict JSON first.  If the reply wraps the object in prose,
    decodes the first JSON object starting at the first ``{`` using
    ``JSONDecoder.raw_decode``, which — unlike the previous non-greedy
    regex ``\\{.*?\\}`` — correctly handles nested braces and ``}``
    characters inside string values, and ignores trailing text.

    Args:
        text: Raw model output, ideally a bare JSON object.

    Returns:
        The decoded action dictionary.

    Raises:
        json.JSONDecodeError: If no JSON object can be decoded from *text*.
    """
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        start = text.find("{")
        if start == -1:
            raise
        # raw_decode stops at the end of the first complete JSON value,
        # so surrounding prose before/after the object is tolerated.
        decoded, _ = json.JSONDecoder().raw_decode(text, start)
        return decoded
@@ -70,50 +105,62 @@ def _log_end(success: bool, steps: int, score: float, rewards: List[float]) -> N
70105 )
71106
72107
73- def main () -> None :
74- _require_env ()
75- client = OpenAI (base_url = API_BASE_URL , api_key = HF_TOKEN )
76-
77- seed = int (os .getenv ("SEED" , "42" ))
78- _log_start (task = TASK_NAME , env = BENCHMARK , model = MODEL_NAME )
def _get_action(client: OpenAI, obs: Dict[str, Any]) -> Dict[str, Any]:
    """Call the LLM to decide an action; fall back to reject on any error."""
    case_json = json.dumps(obs.get("case", {}))
    resources_json = json.dumps(obs.get("resources", {}))
    tier = obs.get("tier", "unknown")
    user_msg = (
        f"Case: {case_json}\n"
        f"Resources: {resources_json}\n"
        f"Tier: {tier}\n\n"
        "Return ONLY the JSON object."
    )
    messages = [
        {"role": "system", "content": _SYSTEM_PROMPT},
        {"role": "user", "content": user_msg},
    ]
    try:
        # Deterministic decoding: the decision rules are fixed, so no sampling.
        reply = client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            temperature=0.0,
            max_tokens=150,
        )
        content = reply.choices[0].message.content or ""
        return _parse_action(content.strip())
    except Exception as exc:
        # Any transport/API/parse failure degrades to the safe fallback
        # action rather than aborting the episode.
        print(f"[DEBUG] LLM call failed: {exc}", flush=True)
        return _safe_action()
131+
132+
133+ def _run_tier (client : OpenAI , tier : str ) -> None :
134+ """Run one full episode for the given tier, emitting START / STEP / END logs."""
135+ _log_start (task = tier , env = BENCHMARK , model = MODEL_NAME )
79136
80137 rewards : List [float ] = []
81138 steps_taken = 0
82139 success = False
83140 score = 0.0
84141
85142 try :
86- reset_resp = requests .post (f"{ ENV_URL } /reset" , json = {"seed" : seed })
143+ reset_resp = requests .post (
144+ f"{ ENV_URL } /reset" ,
145+ json = {"seed" : SEED , "tier" : tier },
146+ timeout = 30 ,
147+ )
87148 reset_resp .raise_for_status ()
88149 payload = reset_resp .json ()
89- obs = payload [ "observation" ]
150+ obs = payload . get ( "observation" , {})
90151 episode_id = obs .get ("episode_id" , "unknown" )
91-
92- step = 1
93152 done = payload .get ("done" , False )
94153
154+ step = 1
95155 while not done and step <= MAX_STEPS :
96- prompt = (
97- "You are an e-commerce ops agent. Return ONLY JSON with keys: "
98- "action_type, refund_amount_usd, replacement_expedite, escalation_reason. "
99- f"Observation: { json .dumps (obs )} "
100- )
101-
102- try :
103- response = client .responses .create (
104- model = MODEL_NAME ,
105- input = prompt ,
106- text = {"format" : {"type" : "json_object" }},
107- )
108- action = _parse_action (response .output_text )
109- except Exception as exc :
110- action = _safe_action ()
156+ action = _get_action (client , obs )
111157
112158 step_resp = requests .post (
113159 f"{ ENV_URL } /step" ,
114160 json = {"action" : action , "episode_id" : episode_id },
161+ timeout = 30 ,
115162 )
116- step_payload = {}
163+ error : Optional [ str ] = None
117164 if step_resp .status_code == 200 :
118165 step_payload = step_resp .json ()
119166 reward = float (step_payload .get ("reward" ) or 0.0 )
@@ -123,6 +170,7 @@ def main() -> None:
123170 .get ("metadata" , {})
124171 .get ("validation_error" )
125172 )
173+ obs = step_payload .get ("observation" , obs )
126174 else :
127175 try :
128176 err_payload = step_resp .json ()
@@ -134,27 +182,30 @@ def main() -> None:
134182
135183 rewards .append (reward )
136184 steps_taken = step
137-
138185 _log_step (
139186 step = step ,
140187 action = json .dumps (action , separators = ("," , ":" )),
141188 reward = reward ,
142189 done = done ,
143190 error = error ,
144191 )
145-
146- if step_payload :
147- obs = step_payload ["observation" ]
148192 step += 1
149193
150- # HTTP API does not include episode_summary , so compute a normalized score .
151- # This keeps score within [0, 1] for logging .
152- score = sum (rewards ) / float ( MAX_STEPS ) if MAX_STEPS > 0 else 0.0
194+ # Normalise: max reward per step = 1.0 , so dividing by MAX_STEPS maps [0, 20] → [0, 1] .
195+ # Negative rewards are clamped to 0 .
196+ score = sum (rewards ) / MAX_REWARD_PER_EPISODE if MAX_REWARD_PER_EPISODE > 0 else 0.0
153197 score = max (0.0 , min (1.0 , score ))
154198 success = score > 0.0
155199 finally :
156200 _log_end (success = success , steps = steps_taken , score = score , rewards = rewards )
157201
158202
def main() -> None:
    """Validate the environment, build the LLM client, and run every tier."""
    _require_env()
    llm_client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
    for difficulty in TIERS:
        _run_tier(llm_client, difficulty)
209+
# Run all tiers when executed as a script; importable without side effects.
if __name__ == "__main__":
    main()
0 commit comments