-
Notifications
You must be signed in to change notification settings - Fork 181
Expand file tree
/
Copy pathai_engine_config.yaml
More file actions
72 lines (58 loc) · 2.38 KB
/
ai_engine_config.yaml
File metadata and controls
72 lines (58 loc) · 2.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
---
# Mico AI Engine Configuration
# Unified management of all AI engine configuration items

# Logging configuration
logging:
  log_level: "info"
  enable_console_logging: true
  enable_file_logging: true

# Server configuration
server:
  host: "0.0.0.0"
  port: 8001

# Application information configuration
app:
  title: "Mico AI Engine"
  service_name: "mico-ai-engine"
  description: "Mico AI Engine, responsible for model inference and management"
  version: "0.0.1"

# Model configurations
models:
  # Key is quoted because it contains ':' (model name + quantization tag);
  # quoting preserves the exact key string while keeping parsers unambiguous.
  "MiMo-VL-Miloco-7B:Q4_0":
    # Basic settings
    model_path: "/models/MiMo-VL-Miloco-7B/MiMo-VL-Miloco-7B_Q4_0.gguf"  # Model path
    mmproj_path: "/models/MiMo-VL-Miloco-7B/mmproj-MiMo-VL-Miloco-7B_BF16.gguf"  # VIT model path
    # Cache settings
    cache_seq_num: 5  # Maximum number of sequences for the dynamic prompt cache [recommended: camera count + 1]
    # Model parameters
    parallel_seq_num: 12  # Parallel sequence count [recommended: rule count + 2]
    total_context_num: 16384  # Maximum context tokens summed over all sequences; affects VRAM size [recommended: rule count * 1000 + 3000]
    context_per_seq: 4096  # Maximum effective context tokens per sequence; multi-turn chat may use this
    chunk_size: 256  # Model sequence chunk length; affects VRAM size [recommended >= 256]
    device: "cuda"  # Model device [cuda/cpu]
    # Inference parameters
    max_tokens: 512  # Maximum tokens to generate
    temperature: -1  # Sampling temperature [-1 = greedy sampling]
    # Business hardcoded configuration
    business:
      # Task labels (prompt config keys) and corresponding priorities
      task_labels: ["trigger_rule_condition", "vision_understanding"]
      task_priorities: [5, 10]
  "Qwen3-8b:Q4_0":
    model_path: "/models/Qwen3-8B/Qwen3-8B-Q4_K_M.gguf"
    parallel_seq_num: 2
    total_context_num: 12288
    context_per_seq: 6144
    chunk_size: 1024
    device: "cuda"
    max_tokens: 512
    temperature: -1
    business: {}

# Server concurrency control
server_concurrency:
  max_queue_size: 100  # Maximum queue size
  queue_wait_timeout: 1  # Task wait timeout (seconds)
  abandon_low_priority: true  # Abandon low-priority tasks when high-priority tasks arrive

# Automatically adjust model configuration to optimize VRAM usage
# [Only supports the default models: MiMo-VL-Miloco-7B:Q4_0 / Qwen3-8b:Q4_0]
# [May invalidate the following configurations: cache_seq_num/parallel_seq_num/total_context_num/context_per_seq/chunk_size]
auto_opt_vram: true