-
Notifications
You must be signed in to change notification settings - Fork 124
Expand file tree
/
Copy pathvoice_assistant.py
More file actions
226 lines (180 loc) · 7.55 KB
/
voice_assistant.py
File metadata and controls
226 lines (180 loc) · 7.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
import argparse
import threading
from io import StringIO
from contextlib import redirect_stderr
from pathlib import Path
import sys
from hailo_platform import VDevice
from hailo_platform.genai import LLM
# Locate the repository root by walking up from this file until the
# config manager module is found; make it importable when present.
repo_root = next(
    (
        parent
        for parent in Path(__file__).resolve().parents
        if (parent / "hailo_apps" / "config" / "config_manager.py").exists()
    ),
    None,
)
if repo_root is not None:
    sys.path.insert(0, str(repo_root))
from hailo_apps.python.core.common.defines import LLM_PROMPT_PREFIX, SHARED_VDEVICE_GROUP_ID, HAILO10H_ARCH, VOICE_ASSISTANT_APP, VOICE_ASSISTANT_MODEL_NAME
from hailo_apps.python.core.common.core import resolve_hef_path
from hailo_apps.python.core.common.hailo_logger import add_logging_cli_args, init_logging, level_from_args
from hailo_apps.python.gen_ai_apps.gen_ai_utils.voice_processing.interaction import VoiceInteractionManager
from hailo_apps.python.gen_ai_apps.gen_ai_utils.voice_processing.vad import add_vad_args
from hailo_apps.python.gen_ai_apps.gen_ai_utils.voice_processing.speech_to_text import SpeechToTextProcessor
from hailo_apps.python.gen_ai_apps.gen_ai_utils.voice_processing.text_to_speech import (
TextToSpeechProcessor,
PiperModelNotFoundError,
)
from hailo_apps.python.gen_ai_apps.gen_ai_utils.llm_utils import streaming
class VoiceAssistantApp:
    """
    Manages the main application logic for the voice assistant.

    Builds the pipeline using common AI components: a shared Hailo VDevice,
    speech-to-text, an on-device LLM, and (optionally) text-to-speech.
    """

    def __init__(self, debug=False, no_tts=False):
        """Initialize all AI components (VDevice, STT, LLM, optional TTS).

        Args:
            debug: Show the raw LLM token stream during generation.
            no_tts: Skip text-to-speech initialization for lower resource usage.

        Raises:
            RuntimeError: If the LLM HEF model path cannot be resolved.
        """
        self.debug = debug
        self.no_tts = no_tts
        self.abort_event = threading.Event()
        print("Initializing AI components... (This might take a moment)")
        # Suppress noisy ALSA messages during initialization
        with redirect_stderr(StringIO()):
            # 1. VDevice — shared group id so other apps can co-use the device
            params = VDevice.create_params()
            params.group_id = SHARED_VDEVICE_GROUP_ID
            self.vdevice = VDevice(params)
            # 2. Speech to Text
            self.s2t = SpeechToTextProcessor(self.vdevice)
            # 3. LLM
            # USER CONFIGURATION: You can change the LLM model here.
            # By default, it uses the default model for voice_assistant app.
            # To use a custom HEF, provide the absolute path to your .hef file.
            model_path = resolve_hef_path(
                hef_path=VOICE_ASSISTANT_MODEL_NAME,
                app_name=VOICE_ASSISTANT_APP,
                arch=HAILO10H_ARCH
            )
            if model_path is None:
                raise RuntimeError("Failed to resolve HEF path for LLM model. Please ensure the model is available.")
            self.llm = LLM(self.vdevice, str(model_path))
            # 4. TTS (optional)
            self.tts = None
            if not no_tts:
                try:
                    self.tts = TextToSpeechProcessor()
                except PiperModelNotFoundError:
                    # Warning handled by TextToSpeechProcessor
                    self.tts = None
        # Set later by main() to enable the TTS handshake.
        # (Fix: the original assigned this twice in a row.)
        self.interaction = None
        print("✅ AI components ready!")

    def on_processing_start(self):
        """Cancel any in-flight generation/speech when new user audio arrives."""
        # on_abort() already sets the abort flag and interrupts TTS;
        # the original redundantly called tts.interrupt() a second time here.
        self.on_abort()

    def on_abort(self):
        """Abort current generation and speech."""
        self.abort_event.set()
        if self.tts:
            self.tts.interrupt()

    def on_audio_ready(self, audio):
        """Handle a completed user utterance: transcribe, stream an LLM
        response, and speak it via TTS when enabled.

        Args:
            audio: Captured audio handed over by the interaction manager.
        """
        self.abort_event.clear()
        # 1. Transcribe
        user_text = self.s2t.transcribe(audio)
        if not user_text:
            print("No speech detected.")
            return
        print(f"\nYou: {user_text}")
        print("\nLLM response:\n")
        # 2. Prepare TTS
        current_gen_id = None
        # Use a mutable container to track state inside callback
        state = {
            'sentence_buffer': "",
            'first_chunk_sent': False
        }
        if self.tts:
            self.tts.clear_interruption()
            current_gen_id = self.tts.get_current_gen_id()
        # 3. Generate Response
        prompt_text = LLM_PROMPT_PREFIX + user_text
        # Format prompt as a list of messages for the LLM
        formatted_prompt = [{'role': 'user', 'content': prompt_text}]

        def tts_callback(chunk: str):
            # Accumulate streamed tokens; hand complete chunks to TTS.
            if self.tts:
                state['sentence_buffer'] += chunk
                # Chunk and queue speech using the centralized method
                state['sentence_buffer'] = self.tts.chunk_and_queue(
                    state['sentence_buffer'], current_gen_id, not state['first_chunk_sent']
                )
                if not state['first_chunk_sent'] and not self.tts.speech_queue.empty():
                    state['first_chunk_sent'] = True

        # Use streaming utility to handle generation, printing, and TTS callback
        # Note: simple voice assistant might not use tools, so filtered output should be clean text
        streaming.generate_and_stream_response(
            llm=self.llm,
            prompt=formatted_prompt,
            prefix="",  # No prefix for this app
            show_raw_stream=self.debug,  # Show raw stream in debug mode
            token_callback=tts_callback,
            abort_callback=self.abort_event.is_set
        )
        # 4. Send remaining text (fix: comment was duplicated in the original)
        if self.tts and state['sentence_buffer'].strip():
            self.tts.queue_text(state['sentence_buffer'].strip(), current_gen_id)
        print()  # New line after streaming
        # 5. Handshake: Wait for TTS to finish, then restart listening
        if self.interaction:
            try:
                self.interaction.restart_after_tts()
            except Exception:
                # Best-effort: if interaction somehow fails or isn't set up,
                # return to the caller rather than crash the assistant loop.
                pass

    def on_clear_context(self):
        """Clear the LLM conversation history."""
        self.llm.clear_context()
        print("Context cleared.")

    def close(self):
        """Release TTS and LLM resources on shutdown."""
        if self.tts:
            self.tts.stop()
        # Clean up LLM resources
        try:
            self.llm.release()
        except Exception:
            pass
        # We rely on process cleanup for VDevice release or Python's GC,
        # matching original pattern.
        # Ideally self.vdevice.release() but it's shared.
def main():
    """Entry point: parse CLI options, build the assistant, and run the
    voice interaction loop until shutdown."""
    parser = argparse.ArgumentParser(
        description='A simple, voice-controlled AI assistant for your terminal.')
    add_logging_cli_args(parser)
    parser.add_argument('--no-tts', action='store_true',
                        help='Disable text-to-speech output for lower resource usage.')
    # Add VAD arguments
    add_vad_args(parser)
    args = parser.parse_args()

    # Initialize logging
    init_logging(level=level_from_args(args))

    # Check if debug mode is enabled (for logging purposes)
    is_debug = getattr(args, 'debug', False)

    if args.no_tts:
        print("TTS disabled: Running in low-resource mode.")

    # Build the application, then wire it to the interaction manager.
    app = VoiceAssistantApp(debug=is_debug, no_tts=args.no_tts)
    manager = VoiceInteractionManager(
        title="Voice Assistant",
        on_audio_ready=app.on_audio_ready,
        on_processing_start=app.on_processing_start,
        on_clear_context=app.on_clear_context,
        on_shutdown=app.close,
        on_abort=app.on_abort,
        debug=is_debug,
        vad_enabled=args.vad,
        vad_aggressiveness=args.vad_aggressiveness,
        vad_energy_threshold=args.vad_energy_threshold,
        tts=app.tts,  # Pass TTS for automatic inhibition and handshake
    )
    # Inject interaction into app for handshake control
    app.interaction = manager
    manager.run()
# Run the assistant only when executed as a script, not on import.
if __name__ == "__main__":
    main()