-
Notifications
You must be signed in to change notification settings - Fork 124
Expand file tree
/
Copy pathvoice_assistant.py
More file actions
226 lines (180 loc) · 7.55 KB
/
voice_assistant.py
File metadata and controls
226 lines (180 loc) · 7.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
import argparse
import threading
from io import StringIO
from contextlib import redirect_stderr
from pathlib import Path
import sys
from hailo_platform import VDevice
from hailo_platform.genai import LLM
# Locate the repository root by walking up from this file until the
# config manager module is found; make it importable when present.
repo_root = next(
    (
        parent
        for parent in Path(__file__).resolve().parents
        if (parent / "hailo_apps" / "config" / "config_manager.py").exists()
    ),
    None,
)
if repo_root is not None:
    sys.path.insert(0, str(repo_root))
from hailo_apps.python.core.common.defines import LLM_PROMPT_PREFIX, SHARED_VDEVICE_GROUP_ID, HAILO10H_ARCH, VOICE_ASSISTANT_APP, VOICE_ASSISTANT_MODEL_NAME
from hailo_apps.python.core.common.core import resolve_hef_path
from hailo_apps.python.core.common.hailo_logger import add_logging_cli_args, init_logging, level_from_args
from hailo_apps.python.gen_ai_apps.gen_ai_utils.voice_processing.interaction import VoiceInteractionManager
from hailo_apps.python.gen_ai_apps.gen_ai_utils.voice_processing.vad import add_vad_args
from hailo_apps.python.gen_ai_apps.gen_ai_utils.voice_processing.speech_to_text import SpeechToTextProcessor
from hailo_apps.python.gen_ai_apps.gen_ai_utils.voice_processing.text_to_speech import (
TextToSpeechProcessor,
PiperModelNotFoundError,
)
from hailo_apps.python.gen_ai_apps.gen_ai_utils.llm_utils import streaming
class VoiceAssistantApp:
    """
    Manages the main application logic for the voice assistant.

    Builds the pipeline using common AI components: a shared Hailo VDevice,
    speech-to-text, an on-device LLM, and (optionally) text-to-speech.
    """

    def __init__(self, debug=False, no_tts=False):
        """Initialize all AI components (VDevice, STT, LLM, optional TTS).

        Args:
            debug: Show the raw LLM token stream during generation.
            no_tts: Skip text-to-speech initialization for lower resource usage.

        Raises:
            RuntimeError: If the LLM HEF model path cannot be resolved.
        """
        self.debug = debug
        self.no_tts = no_tts
        self.abort_event = threading.Event()
        print("Initializing AI components... (This might take a moment)")
        # Suppress noisy ALSA messages during initialization
        with redirect_stderr(StringIO()):
            # 1. VDevice — shared group id so other apps can co-use the device
            params = VDevice.create_params()
            params.group_id = SHARED_VDEVICE_GROUP_ID
            self.vdevice = VDevice(params)
            # 2. Speech to Text
            self.s2t = SpeechToTextProcessor(self.vdevice)
            # 3. LLM
            # USER CONFIGURATION: You can change the LLM model here.
            # By default, it uses the default model for voice_assistant app.
            # To use a custom HEF, provide the absolute path to your .hef file.
            model_path = resolve_hef_path(
                hef_path=VOICE_ASSISTANT_MODEL_NAME,
                app_name=VOICE_ASSISTANT_APP,
                arch=HAILO10H_ARCH
            )
            if model_path is None:
                raise RuntimeError("Failed to resolve HEF path for LLM model. Please ensure the model is available.")
            self.llm = LLM(self.vdevice, str(model_path))
            # 4. TTS (optional)
            self.tts = None
            if not no_tts:
                try:
                    self.tts = TextToSpeechProcessor()
                except PiperModelNotFoundError:
                    # Warning handled by TextToSpeechProcessor
                    self.tts = None
        # Set later by main() to enable the TTS handshake.
        # (Fix: the original assigned this twice in a row.)
        self.interaction = None
        print("✅ AI components ready!")

    def on_processing_start(self):
        """Cancel any in-flight generation/speech when new user audio arrives."""
        # on_abort() already sets the abort flag and interrupts TTS;
        # the original redundantly called tts.interrupt() a second time here.
        self.on_abort()

    def on_abort(self):
        """Abort current generation and speech."""
        self.abort_event.set()
        if self.tts:
            self.tts.interrupt()

    def on_audio_ready(self, audio):
        """Handle a completed user utterance: transcribe, stream an LLM
        response, and speak it via TTS when enabled.

        Args:
            audio: Captured audio handed over by the interaction manager.
        """
        self.abort_event.clear()
        # 1. Transcribe
        user_text = self.s2t.transcribe(audio)
        if not user_text:
            print("No speech detected.")
            return
        print(f"\nYou: {user_text}")
        print("\nLLM response:\n")
        # 2. Prepare TTS
        current_gen_id = None
        # Use a mutable container to track state inside callback
        state = {
            'sentence_buffer': "",
            'first_chunk_sent': False
        }
        if self.tts:
            self.tts.clear_interruption()
            current_gen_id = self.tts.get_current_gen_id()
        # 3. Generate Response
        prompt_text = LLM_PROMPT_PREFIX + user_text
        # Format prompt as a list of messages for the LLM
        formatted_prompt = [{'role': 'user', 'content': prompt_text}]

        def tts_callback(chunk: str):
            # Accumulate streamed tokens; hand complete chunks to TTS.
            if self.tts:
                state['sentence_buffer'] += chunk
                # Chunk and queue speech using the centralized method
                state['sentence_buffer'] = self.tts.chunk_and_queue(
                    state['sentence_buffer'], current_gen_id, not state['first_chunk_sent']
                )
                if not state['first_chunk_sent'] and not self.tts.speech_queue.empty():
                    state['first_chunk_sent'] = True

        # Use streaming utility to handle generation, printing, and TTS callback
        # Note: simple voice assistant might not use tools, so filtered output should be clean text
        streaming.generate_and_stream_response(
            llm=self.llm,
            prompt=formatted_prompt,
            prefix="",  # No prefix for this app
            show_raw_stream=self.debug,  # Show raw stream in debug mode
            token_callback=tts_callback,
            abort_callback=self.abort_event.is_set
        )
        # 4. Send remaining text (fix: comment was duplicated in the original)
        if self.tts and state['sentence_buffer'].strip():
            self.tts.queue_text(state['sentence_buffer'].strip(), current_gen_id)
        print()  # New line after streaming
        # 5. Handshake: Wait for TTS to finish, then restart listening
        if self.interaction:
            try:
                self.interaction.restart_after_tts()
            except Exception:
                # Best-effort: if interaction somehow fails or isn't set up,
                # return to the caller rather than crash the assistant loop.
                pass

    def on_clear_context(self):
        """Clear the LLM conversation history."""
        self.llm.clear_context()
        print("Context cleared.")

    def close(self):
        """Release TTS and LLM resources on shutdown."""
        if self.tts:
            self.tts.stop()
        # Clean up LLM resources
        try:
            self.llm.release()
        except Exception:
            pass
        # We rely on process cleanup for VDevice release or Python's GC,
        # matching original pattern.
        # Ideally self.vdevice.release() but it's shared.
def main():
    """Entry point: parse CLI options, build the assistant, and run the
    voice interaction loop until shutdown."""
    parser = argparse.ArgumentParser(
        description='A simple, voice-controlled AI assistant for your terminal.')
    add_logging_cli_args(parser)
    parser.add_argument('--no-tts', action='store_true',
                        help='Disable text-to-speech output for lower resource usage.')
    # Add VAD arguments
    add_vad_args(parser)
    args = parser.parse_args()

    # Initialize logging
    init_logging(level=level_from_args(args))

    # Check if debug mode is enabled (for logging purposes)
    is_debug = getattr(args, 'debug', False)

    if args.no_tts:
        print("TTS disabled: Running in low-resource mode.")

    # Build the application, then wire it to the interaction manager.
    app = VoiceAssistantApp(debug=is_debug, no_tts=args.no_tts)
    manager = VoiceInteractionManager(
        title="Voice Assistant",
        on_audio_ready=app.on_audio_ready,
        on_processing_start=app.on_processing_start,
        on_clear_context=app.on_clear_context,
        on_shutdown=app.close,
        on_abort=app.on_abort,
        debug=is_debug,
        vad_enabled=args.vad,
        vad_aggressiveness=args.vad_aggressiveness,
        vad_energy_threshold=args.vad_energy_threshold,
        tts=app.tts,  # Pass TTS for automatic inhibition and handshake
    )
    # Inject interaction into app for handshake control
    app.interaction = manager
    manager.run()
# Run the assistant only when executed as a script, not on import.
if __name__ == "__main__":
    main()