-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathopenenv.yaml
More file actions
106 lines (100 loc) · 3.23 KB
/
openenv.yaml
File metadata and controls
106 lines (100 loc) · 3.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# OpenEnv environment manifest for the Frontier SWE dependent-type-checker task.
spec_version: 1
name: frontier-swe-type-checker
# NOTE(review): "space" presumably denotes a hosted interactive environment — confirm against the hub schema.
type: space
runtime: fastapi
# Import path of the ASGI application object served by the FastAPI runtime.
app: frontier_swe_env.server.app:app
port: 8000
# Quoted so the version is parsed as a string, not a float.
version: "0.1.0"
# Human-readable task summary. Folded scalar (>): newlines collapse to spaces.
# Fix: continuation lines re-indented under the key — the flattened form was invalid YAML.
description: >
  Frontier SWE — Dependent Type Checker. An OpenEnv-shaped FastAPI service
  hosting a Martin-Löf-style dependently-typed language type checker
  implementation task. Agents implement a Rust binary at
  /app/type-checker/target/release/type-checker that exits 0 iff every
  top-level command in supplied .sexp files type-checks. Scored on
  correctness gates (accept-rate >= 0.99, reject-rate >= 0.95) followed by
  geometric-mean speedup vs the reference implementation.
# Source repository and the task's location within it.
# Fix: child keys re-indented — the flattened form was invalid YAML.
repo:
  source: https://github.com/3xcaffeine/frontier-swe-openenv
  task_directory: tasks/dependent-type-checker
# Runtime configuration for the task container.
# Fix: child keys re-indented — the flattened form was invalid YAML.
environment:
  task_name: dependent-type-checker
  # Agent's working directory inside the container.
  workspace_dir: /app/type-checker
  build_command: cd /app/type-checker && cargo build --release
  # All *_s / *_timeout values are in seconds.
  episode_timeout_s: 3600
  max_attempts_per_subtask: 3
  l1_score_mode: reward_json_score
  l1_timeout_s: 600
  # Verifier writes its result here; the score field below is read from it.
  reward_json_path: /logs/verifier/reward.json
  reward_json_score_field: score
  # Raw-score range used for normalization; higher is better per the flag below.
  reward_json_score_anchors: [0.0, 2.0]
  reward_json_score_higher_is_better: true
  task_domain: programming languages / type theory
  cpus: 8
  memory_mb: 32768
# Layered scoring rubric: shell gates, structured L1 reward, two LLM-judged
# review layers, and a final weighted aggregation.
# Fix: nested mappings and sequences re-indented — the flattened form was invalid YAML.
rubric:
  type: composite
  layers:
    # Layer 0: cheap environment sanity checks before any scoring.
    - name: gate_checks
      kind: shell
      script: /app/gate_checks.sh
      # Script reports pass count as GATE_SCORE=N/M on stdout.
      output: GATE_SCORE=N/M
      gates:
        - workspace + Cargo.toml present
        - rustc + cargo on PATH
        - cargo build --release succeeds
    # Layer 1: machine-verified test score read from the verifier's reward.json.
    - name: l1_tests
      kind: structured_reward
      score_mode: reward_json_score
      reward_json_path: /logs/verifier/reward.json
      score_field: score
      anchors: [0.0, 2.0]
      higher_is_better: true
      # Either signal forces the layer to a hard fail regardless of score.
      hard_fail_signals:
        - additional_data.reason set (correctness gate fail / source scan match)
        - score == 0.0
    # Layer 2: LLM code review; grader endpoint configured via env vars.
    - name: l2_code_review
      kind: llm_judge
      model_env: FSWE_GRADER_MODEL
      api_url_env: FSWE_GRADER_API_URL
      api_key_env: FSWE_GRADER_API_KEY
      dimensions:
        [completeness, correctness, robustness, forward_compatibility]
    # Layer 3: LLM review of the submitted plan.
    - name: l3_plan_review
      kind: llm_judge
      model_env: FSWE_GRADER_MODEL
    # Final blend of all layer scores into the episode reward.
    - name: episode_aggregator
      kind: weighted_blend
      output_field: observation.episode_reward
# Tools exposed to the agent during an episode.
# Fix: list items and nested parameters re-indented — the flattened form was invalid YAML.
tools:
  - name: submit_plan
    description: Propose a subtask plan for the episode (PLANNING -> EXECUTING).
    parameters:
      - name: subtasks
        type: list[dict]
        required: true
  - name: submit_subtask
    description: Submit the current subtask for L1 + L2 scoring.
    parameters:
      - name: subtask_id
        type: str
        required: true
  # No parameters: read-only status snapshot.
  - name: get_status
    description: Return the current episode status snapshot (phase, scores, time remaining).
  # No parameters: commits the current subtask score and moves on.
  - name: advance
    description: Freeze the current subtask score and advance to the next subtask.
# Field paths surfaced as metrics, split by observation vs reward namespaces.
# Fix: nested sequences re-indented — the flattened form was invalid YAML.
metrics:
  observation:
    - observation.phase
    - observation.current_subtask
    - observation.frozen_scores
    - observation.time_remaining_s
    - observation.plan_score
    - observation.subtask_feedback
    - observation.episode_reward
  reward:
    - reward.gate_score
    - reward.l1_test_score
    - reward.l1_blended
    - reward.l2_code_review
    - reward.l3_plan_review
    - reward.episode_reward