-
Notifications
You must be signed in to change notification settings - Fork 181
Expand file tree
/
Copy pathai_engine_config.yaml
More file actions
72 lines (58 loc) · 2.38 KB
/
ai_engine_config.yaml
File metadata and controls
72 lines (58 loc) · 2.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
---
# Mico AI Engine Configuration
# Unified management of all AI engine configuration items

# Logging configuration
logging:
  log_level: "info"
  enable_console_logging: true
  enable_file_logging: true

# Server configuration
server:
  host: "0.0.0.0"
  port: 8001

# Application information configuration
app:
  title: "Mico AI Engine"
  service_name: "mico-ai-engine"
  description: "Mico AI Engine, responsible for model inference and management"
  version: "0.0.1"

# Model configurations
models:
  # Key is quoted because it contains ':' (model name + quantization tag);
  # quoting preserves the exact key string while keeping parsers unambiguous.
  "MiMo-VL-Miloco-7B:Q4_0":
    # Basic settings
    model_path: "/models/MiMo-VL-Miloco-7B/MiMo-VL-Miloco-7B_Q4_0.gguf"  # Model path
    mmproj_path: "/models/MiMo-VL-Miloco-7B/mmproj-MiMo-VL-Miloco-7B_BF16.gguf"  # VIT model path
    # Cache settings
    cache_seq_num: 5  # Maximum number of sequences for the dynamic prompt cache [recommended: camera count + 1]
    # Model parameters
    parallel_seq_num: 12  # Parallel sequence count [recommended: rule count + 2]
    total_context_num: 16384  # Maximum context tokens summed over all sequences; affects VRAM size [recommended: rule count * 1000 + 3000]
    context_per_seq: 4096  # Maximum effective context tokens per sequence; multi-turn chat may use this
    chunk_size: 256  # Model sequence chunk length; affects VRAM size [recommended >= 256]
    device: "cuda"  # Model device [cuda/cpu]
    # Inference parameters
    max_tokens: 512  # Maximum tokens to generate
    temperature: -1  # Sampling temperature [-1 = greedy sampling]
    # Business hardcoded configuration
    business:
      # Task labels (prompt config keys) and corresponding priorities
      task_labels: ["trigger_rule_condition", "vision_understanding"]
      task_priorities: [5, 10]
  "Qwen3-8b:Q4_0":
    model_path: "/models/Qwen3-8B/Qwen3-8B-Q4_K_M.gguf"
    parallel_seq_num: 2
    total_context_num: 12288
    context_per_seq: 6144
    chunk_size: 1024
    device: "cuda"
    max_tokens: 512
    temperature: -1
    business: {}

# Server concurrency control
server_concurrency:
  max_queue_size: 100  # Maximum queue size
  queue_wait_timeout: 1  # Task wait timeout (seconds)
  abandon_low_priority: true  # Abandon low-priority tasks when high-priority tasks arrive

# Automatically adjust model configuration to optimize VRAM usage
# [Only supports the default models: MiMo-VL-Miloco-7B:Q4_0 / Qwen3-8b:Q4_0]
# [May invalidate the following configurations: cache_seq_num/parallel_seq_num/total_context_num/context_per_seq/chunk_size]
auto_opt_vram: true