# openenv.yaml (main branch) -- dakshdoesdev/sre-enginnerllm on GitHub

# Package identity and human-readable summary for this manifest.
name: sre-engineer-llm
# Quoted so the version is always read as a string.
version: "3.1.0"
# Folded scalar: single line breaks fold to spaces, blank lines separate
# paragraphs.
description: >
  Tier-escalating SRE training environment. Three tiers escalate
  three different dimensions: Basic (compute), Advanced (horizon),
  Max (realism).

  Basic ships 12 templates x 6 entries each (1 base + 5 procgen
  variants) = 72 deterministic scenarios over a 4-service
  topology, with a hardened scripted-optimal ceiling of <=0.80.

  Advanced runs as a Python orchestrator that chains Basic
  episodes with persistent horizon state (unresolved alerts,
  pending deploys, tech-debt counter, horizon-decay reward). It
  does not simulate a 15-20 service topology faithfully -- the
  YAMLs declare a richer action set as a design spec only.

  Max runs as a Python state-machine simulator over a 22-node
  service graph using the same 11-action interface as Basic. The
  docker-compose stack under sre_gym/max/families/ references
  stub images that are not published; it is shipped as design
  spec, not as runnable infrastructure.

  Training is not committed to this repo. See notebooks/01 for
  the GRPO pipeline that needs to be executed externally (Colab
  A100). The README baseline tables show frontier-LLM
  measurements only; the trained-model row is intentionally
  absent until a real run is committed.
author: Daksh Verma
license: Apache-2.0

# Contract for the environment described by this manifest: type names,
# episode limits, and scenario inventory.
environment:
  action_type: UnifiedIncidentAction
  observation_type: UnifiedIncidentObservation
  state_type: UnifiedIncidentState
  max_steps: 13
  difficulties:
    - easy
    - medium
    - hard
  reward_type: dense
  # GRPO/TRL training contract: parallel env instances per training step.
  # Required for the OpenEnv batched-rollout pattern documented at
  # https://huggingface.co/docs/trl/openenv.
  max_concurrent_envs: 64
  # 12 templates x 6 entries (1 base + 5 procgen)
  scenario_count: 72
  scenario_templates: 12
  # 5 variants per template; 6 entries total each
  procgen_variants_per_template: 5
  deterministic_seeded: true
  # the runnable surface served by this Space
  tier: basic
  # see docs/ARCHITECTURE.md for full design
  tier_escalation_dimension: compute

# Per-tier summary: how each tier runs, which dimension it escalates,
# and how many scenarios it carries.
tiers:
  basic:
    runnable: true
    # real /reset + /step routes against the live env
    runnable_kind: live_environment
    escalation_dimension: compute
    persona: 'ML student / Kaggle, $30 of HF credits'
    scenario_count: 72
    notes: '12 base templates + 5 procgen variants each = 72 deterministic scenarios.'
  advanced:
    runnable: true
    # chained Basic episodes with horizon state
    runnable_kind: python_orchestrator
    escalation_dimension: horizon
    persona: 'seed/Series A startup, $300-500 budget'
    scenario_count: 3
    docs: docs/ADVANCED_TIER.md
    notes: >
      Advanced runs each scenario as a sequence of Basic episodes
      glued together by the horizon-state object (unresolved
      alerts, pending deploys, tech-debt counter, horizon-decay
      reward). It is NOT a simulator of a 15-20 service topology;
      the wider action universe declared in the YAMLs (~28 actions
      / scenario) is design spec only and is not implemented in
      the env.
  max:
    runnable: true
    # graph state machine, not real cluster
    runnable_kind: python_simulator
    escalation_dimension: realism
    persona: 'enterprise SRE platform, 8x A100/H100'
    # one specced family
    scenario_count: 1
    # includes one alias (payment_webhook_storm)
    chaos_pattern_count: 12
    docs: docs/MAX_TIER.md
    notes: >
      Max runs as an in-memory 22-node graph mutator. Reuses the
      Basic 11-action interface; correct_action across patterns is
      heavily skewed toward rollback_deploy (11/12) so the patterns
      are separable on observation alone -- not a hidden-information
      benchmark. The compose file under sre_gym/max/families/
      references stub images that are NOT published; do not attempt
      `docker compose up` expecting it to pull successfully.

# Training status and artifact bookkeeping. `status: pending` together with
# an empty `current_artifacts` list records that no trained run is committed.
training:
  status: pending
  pipeline: notebooks/01_basic_train_grpo_unsloth.ipynb
  comparison: notebooks/02_basic_eval_comparison.ipynb
  expected_artifacts:
    - eval/results/comparison_hero.png
    # The "HF Hub adapter" annotation is a comment, not part of the value,
    # so the entry parses as a clean repo id.
    - dakshdoesdev/sre-gym-qwen25-3b-grpo   # HF Hub adapter
  current_artifacts: []                   # nothing committed until a real run lands
  trajectory_corpus:
    seed_combined_jsonl_rows: 21
    templates_with_teacher_data: 6        # of 12 -- the 6 round-2 templates have none yet
    # Templates that still have no teacher trajectories.
    templates_without_teacher_data:
      - auth_token_expiry
      - dep_degradation
      - memory_leak_oom
      - migration_lock
      - network_partition
      - rate_limit_retry_storm

# Hosting metadata: the Hugging Face Space id and the GitHub repository,
# plus `sdk` / `hardware` values (presumably the Space's runtime settings --
# confirm against the Space configuration).
huggingface:
  space_id: Madhav189/sre-env             # current authoritative HF Space
  github_repo: dakshdoesdev/sre-enginnerllm  # repo name has a known typo (enginner)
  sdk: docker
  hardware: cpu-basic