diff --git a/api/interface.py b/api/interface.py index 2512e8d9..1846a01b 100644 --- a/api/interface.py +++ b/api/interface.py @@ -127,7 +127,6 @@ class WhispercppSettings(BaseModel): class FasterWhisperSettings(BaseModel): model_config = ConfigDict(protected_namespaces=()) - enable: bool = True """tiny, tiny.en, base, base.en, small, small.en, distil-small.en, medium, medium.en, distil-medium.en, large-v1, large-v2, large-v3, large, distil-large-v2, distil-large-v3, large-v3-turbo, or turbo""" model_size: str """default (model original), auto (fastest available on device), int8, int8_float16 etc. - see https://opennmt.net/CTranslate2/quantization.html#quantize-on-model-conversion""" @@ -181,7 +180,6 @@ class FasterWhisperTranscript(BaseModel): class ParakeetSettings(BaseModel): - enable: bool run_locally: bool = True model_variant: str """v2 (English) or v3 (Multilingual, 25 languages)""" diff --git a/docs/parakeet-issues.md b/docs/parakeet-issues.md new file mode 100644 index 00000000..5e257003 --- /dev/null +++ b/docs/parakeet-issues.md @@ -0,0 +1,62 @@ +# Parakeet STT Issues (2026-04-07) + +## Issue 1: macOS CoreML crash + +**Symptom**: Switching execution provider to CoreML gives `ONNXRuntimeError: model_path must not be empty` from `onnxruntime/core/optimizer/initializer.cc:45`. + +**Root cause**: The `NemoConformerAED` model class explicitly excludes CoreML in `onnx_asr/models/nemo.py:183`: +```python +def _get_excluded_providers() -> list[str]: + return [*TensorRtOptions.get_provider_names(), "CoreMLExecutionProvider"] +``` +But `NemoConformerTdt` (the model Parakeet uses — `nemo-parakeet-tdt-0.6b-v2/v3`) inherits from `NemoConformerRnnt` which has **no CoreML exclusion**. So CoreML is passed to the ONNX session, but the TDT model uses external data files (`onnx?data` pattern in `loader.py:134`) that CoreML can't handle. + +**Fix options**: +1. Exclude CoreML from TDT models (like AED does) — simplest, but removes the option +2. 
Catch the error in `parakeet.py:__load_model_inner()` and fall back to CPU with a toast message +3. Hide the CoreML option in the Client UI when the model variant doesn't support it (requires Core→Client communication of supported providers) + +## Issue 2: Windows CUDA doesn't reinitialize properly + +**Symptom**: Switching the execution provider to CUDA downloads the model, but STT doesn't work until a full app restart. + +**Root cause (suspected)**: `del self.model` in `parakeet.py:101` drops the Python reference to the old ONNX session, but CUDA GPU memory isn't released immediately (the session is only freed once no references remain; objects caught in reference cycles wait for Python's cyclic garbage collector, whose timing is non-deterministic). When the new CUDA session tries to allocate GPU memory, the old one may still be holding it. + +**Fix options**: +1. Force `gc.collect()` after unloading the model before loading the new one +2. Add explicit ONNX session cleanup (call `self.model` internals to release sessions) +3. Add a brief delay between unload and reload for CUDA specifically + +## Broader architecture issues + +- `__load_model_inner()` has a generic `except Exception` that toasts an error but leaves `self.model = None`. No retry, no CPU fallback. +- No way for the user to recover without restarting the app if reload fails. +- The `_loading` flag prevents transcription during reload, but there's no timeout or progress feedback. +- The Preprocessor (`onnx_asr`) explicitly excludes CUDA — it always runs on CPU regardless of the selected provider. This is by design but may confuse users expecting full CUDA acceleration.
+ +## Key files + +### Core (wingman-ai) +- `providers/parakeet.py` — Parakeet provider, model loading, settings update +- `services/settings_service.py:144-148` — calls `parakeet.update_settings_async()` +- `api/interface.py:183-192` — `ParakeetSettings` model (pydantic `BaseModel`) + +### onnx_asr library (3rd party, in venv) +- `onnx_asr/loader.py:187-357` — `load_model()`, downloads files, creates ONNX sessions +- `onnx_asr/models/nemo.py:70-85` — `NemoConformerRnnt.__init__()` creates encoder/decoder sessions +- `onnx_asr/models/nemo.py:181-183` — `NemoConformerAED._get_excluded_providers()` excludes CoreML +- `onnx_asr/preprocessors/preprocessor.py` — Preprocessor, excludes CUDA +- `onnx_asr/onnx.py:81-101` — `update_onnx_providers()` filters provider list + +### Client (wingman-client) +- The execution provider dropdown is in the STT settings UI — may need to filter options per model/platform + +## Provider/model compatibility matrix + +| Provider | NemoConformerTdt (Parakeet) | NemoConformerAED | Preprocessor | +|----------|---------------------------|------------------|-------------| +| CPU | Yes | Yes | Yes | +| DirectML | Yes | Yes | Yes | +| CUDA | Yes | Yes | **Excluded** | +| CoreML | **Crashes** (should exclude) | **Excluded** | ?
| +| TensorRT | Excluded | Excluded | Excluded | diff --git a/providers/faster_whisper.py b/providers/faster_whisper.py index 7fe358c6..842ce1aa 100644 --- a/providers/faster_whisper.py +++ b/providers/faster_whisper.py @@ -11,29 +11,42 @@ ) from services.printr import Printr -MODELS_DIR = "faster-whisper-models" - class FasterWhisper: - def __init__( - self, - settings: FasterWhisperSettings, - app_root_path: str, - app_is_bundled: bool, - ): + def __init__(self, settings: FasterWhisperSettings): self.printr = Printr() self.settings = settings self.model: Optional[WhisperModel] = None - # move one dir up, out of _internal (if bundled) - app_dir = path.dirname(app_root_path) if app_is_bundled else app_root_path - self.models_dir = path.join(app_dir, MODELS_DIR) + def load(self, model_dir: str): + """Load the FasterWhisper model. Called by SttProviderManager. + + Args: + model_dir: Directory containing model files (from ModelDownloader). + """ + self.unload() + + model_file = path.join(model_dir, self.settings.model_size) + model = model_file if path.exists(model_file) else self.settings.model_size - if self.settings.enable: - self.__update_model() + try: + self.model = WhisperModel( + model, + device=self.settings.device, + compute_type=self.settings.compute_type, + ) + self.printr.print( + f"FasterWhisper initialized with model '{model}' (device: '{self.settings.device}').", + server_only=True, + color=LogType.POSITIVE, + ) + except Exception as e: + self.printr.toast_error( + f"Failed to initialize FasterWhisper with model {model_file}. Error: {e}" + ) - def __unload_model(self): - """Unload the current model to free VRAM.""" + def unload(self): + """Unload the current model to free VRAM. 
Called by SttProviderManager.""" if self.model is not None: self.printr.print( "FasterWhisper: Unloading current model to free VRAM...", @@ -42,48 +55,22 @@ def __unload_model(self): del self.model self.model = None - # Force garbage collection to release memory gc.collect() - # Clear CUDA cache if using GPU try: import torch if torch.cuda.is_available(): torch.cuda.empty_cache() torch.cuda.synchronize() except ImportError: - pass # torch not available, skip CUDA cleanup + pass except Exception as e: - # any other CUDA-related cleanup error should not crash model reload self.printr.print( f"FasterWhisper: CUDA cleanup failed during model unload: {e}", server_only=True, color=LogType.WARNING, ) - def __update_model(self): - # Unload the existing model first to free VRAM - self.__unload_model() - - model_file = path.join(self.models_dir, self.settings.model_size) - model = model_file if path.exists(model_file) else self.settings.model_size - - try: - self.model = WhisperModel( - model, - device=self.settings.device, - compute_type=self.settings.compute_type, - ) - self.printr.print( - f"FasterWhisper initialized with model '{model}' (device: '{self.settings.device}').", - server_only=True, - color=LogType.POSITIVE, - ) - except Exception as e: - self.printr.toast_error( - f"Failed to initialize FasterWhisper with model {model_file}. 
Error: {e}" - ) - def transcribe( self, config: FasterWhisperSttConfig, @@ -127,28 +114,5 @@ def transcribe( return None - def update_settings(self, settings: FasterWhisperSettings): - if self.settings == settings: - self.printr.print("FasterWhisper settings unchanged.", server_only=True) - return - - was_enabled = self.settings.enable - self.settings = settings - - if not settings.enable: - if was_enabled: - self.printr.print( - "FasterWhisper disabled, unloading model...", - server_only=True, - ) - self.__unload_model() - return - - self.printr.print( - "FasterWhisper settings updated, reloading model...", - server_only=True, - ) - self.__update_model() - def validate(self, errors: list[WingmanInitializationError]): pass diff --git a/providers/parakeet.py b/providers/parakeet.py index cd0b3ef8..d07aa427 100644 --- a/providers/parakeet.py +++ b/providers/parakeet.py @@ -1,4 +1,4 @@ -import asyncio +import gc import platform import threading from typing import Optional @@ -27,6 +27,9 @@ "v3": "nemo-parakeet-tdt-0.6b-v3", } +# CoreML is excluded for TDT models — they use external data files that CoreML can't handle +COREML_EXCLUDED_PROVIDERS = ["CoreMLExecutionProvider"] + class Parakeet: def __init__(self, settings: ParakeetSettings): @@ -37,16 +40,22 @@ def __init__(self, settings: ParakeetSettings): self._loading = False self._load_lock = threading.Lock() - if settings.enable and settings.run_locally: - self.__load_model() + def load(self, model_path: Optional[str] = None): + """Load the Parakeet model. Called by SttProviderManager. - def __load_model(self): + Args: + model_path: Local directory containing model files (from ModelDownloader). + If None, onnx_asr downloads internally. 
+ """ with self._load_lock: self._loading = True - self.__load_model_inner() + try: + self._load_model_inner(model_path) + finally: + self._loading = False - def __load_model_inner(self): - self.__unload_model() + def _load_model_inner(self, model_path: Optional[str] = None): + self.unload() try: import onnx_asr @@ -58,7 +67,18 @@ def __load_model_inner(self): self.settings.execution_provider, ["CPUExecutionProvider"] ) - self.model = onnx_asr.load_model(model_name, providers=providers) + # Exclude CoreML for TDT models — crashes with external data files + providers = [ + p for p in providers if p not in COREML_EXCLUDED_PROVIDERS + ] + if not providers: + providers = ["CPUExecutionProvider"] + + load_kwargs = {"providers": providers} + if model_path: + load_kwargs["path"] = model_path + + self.model = onnx_asr.load_model(model_name, **load_kwargs) # Check if requested CUDA provider was actually available if self.settings.execution_provider == "cuda": @@ -89,10 +109,9 @@ def __load_model_inner(self): self.printr.toast_error( f"Failed to initialize Parakeet: {e}" ) - finally: - self._loading = False - def __unload_model(self): + def unload(self): + """Unload the model and free all resources. 
Called by SttProviderManager.""" if self.model is not None: self.printr.print( "Parakeet: Unloading current model...", @@ -100,48 +119,7 @@ def __unload_model(self): ) del self.model self.model = None - - def __transcribe_remote(self, filename: str) -> Optional[ParakeetTranscript]: - """POST audio file to remote Parakeet server for transcription.""" - host = (self.settings.host or "localhost").strip().rstrip("/") - if not host.startswith(("http://", "https://")): - host = f"http://{host}" - url = f"{host}:{self.settings.port}/v1/audio/transcriptions" - try: - with open(filename, "rb") as f: - response = requests.post( - url=url, - files={"file": f}, - data={ - "model": "parakeet", - "response_format": "json", - }, - timeout=30, - ) - response.raise_for_status() - text = response.json().get("text", "").strip() - return ParakeetTranscript(text=text) - except requests.ConnectionError: - self.printr.toast_error( - f"Parakeet remote: Could not connect to {self.settings.host}:{self.settings.port}. Is the server running?" - ) - except requests.Timeout: - self.printr.toast_error( - f"Parakeet remote: Request timed out after 30s." - ) - except requests.HTTPError as e: - self.printr.toast_error( - f"Parakeet remote: Server returned error: {e}" - ) - except FileNotFoundError: - self.printr.toast_error( - f"Parakeet: File to transcribe '{filename}' not found." - ) - except Exception as e: - self.printr.toast_error( - f"Parakeet remote transcription failed: {e}" - ) - return None + gc.collect() def transcribe( self, @@ -149,7 +127,7 @@ def transcribe( filename: str, ) -> Optional[ParakeetTranscript]: if not self.settings.run_locally: - return self.__transcribe_remote(filename) + return self._transcribe_remote(filename) if self._loading: self.printr.toast_error( @@ -159,7 +137,7 @@ def transcribe( if not self.model: self.printr.toast_error( - "Parakeet model is not loaded. Enable Parakeet in settings first." + "Parakeet model is not loaded. Check STT settings." 
) return None @@ -180,69 +158,47 @@ def transcribe( return None - async def update_settings_async(self, settings: ParakeetSettings): - """Async version that runs model loading in a thread to avoid blocking the event loop.""" - old = self.settings - self.settings = settings - - if not settings.enable: - if old.enable and old.run_locally: - self.__unload_model() - return - - if settings.run_locally: - needs_reload = ( - not old.enable - or not old.run_locally - or old.model_variant != settings.model_variant - or old.execution_provider != settings.execution_provider - ) - if needs_reload: - self.printr.print( - "Parakeet settings changed, reloading model...", - server_only=True, + def _transcribe_remote(self, filename: str) -> Optional[ParakeetTranscript]: + """POST audio file to remote Parakeet server for transcription.""" + host = (self.settings.host or "localhost").strip().rstrip("/") + if not host.startswith(("http://", "https://")): + host = f"http://{host}" + url = f"{host}:{self.settings.port}/v1/audio/transcriptions" + try: + with open(filename, "rb") as f: + response = requests.post( + url=url, + files={"file": f}, + data={ + "model": "parakeet", + "response_format": "json", + }, + timeout=30, ) - await asyncio.to_thread(self.__load_model) - else: - if old.run_locally: - self.__unload_model() - self.printr.print( - f"Parakeet remote mode: {settings.host}:{settings.port}", - server_only=True, + response.raise_for_status() + text = response.json().get("text", "").strip() + return ParakeetTranscript(text=text) + except requests.ConnectionError: + self.printr.toast_error( + f"Parakeet remote: Could not connect to {self.settings.host}:{self.settings.port}. Is the server running?" 
) - - def update_settings(self, settings: ParakeetSettings): - old = self.settings - self.settings = settings - - if not settings.enable: - # Disabled — unload if needed - if old.enable and old.run_locally: - self.__unload_model() - return - - if settings.run_locally: - # Local mode — load model if switching to local or settings changed - needs_reload = ( - not old.enable - or not old.run_locally - or old.model_variant != settings.model_variant - or old.execution_provider != settings.execution_provider + except requests.Timeout: + self.printr.toast_error( + "Parakeet remote: Request timed out after 30s." ) - if needs_reload: - self.printr.print( - "Parakeet settings changed, reloading model...", - server_only=True, - ) - self.__load_model() - else: - # Remote mode — unload local model if it was loaded - if old.run_locally: - self.__unload_model() - self.printr.print( - f"Parakeet remote mode: {settings.host}:{settings.port}", - server_only=True, + except requests.HTTPError as e: + self.printr.toast_error( + f"Parakeet remote: Server returned error: {e}" + ) + except FileNotFoundError: + self.printr.toast_error( + f"Parakeet: File to transcribe '{filename}' not found." ) + except Exception as e: + self.printr.toast_error( + f"Parakeet remote transcription failed: {e}" + ) + return None def validate(self, errors: list[WingmanInitializationError]): pass diff --git a/services/file.py b/services/file.py index 6436874d..d28877a6 100644 --- a/services/file.py +++ b/services/file.py @@ -105,6 +105,24 @@ def get_local_models_dir() -> str: return local_models_path +def get_models_dir() -> str: + """Get the path to the unified models directory. + + NOT versioned - models persist across Wingman AI updates. 
+ Location: APPDATA/WingmanAI/models/ + """ + dirs = PlatformDirs( + appname=APP_NAME, + appauthor=APP_AUTHOR, + ensure_exists=True, + roaming=True, + ) + models_path = path.join(dirs.user_data_dir, "models") + if not path.exists(models_path): + makedirs(models_path) + return models_path + + def get_generated_files_dir(skill_name: str) -> str: """Get the path to a skill's generated files directory. diff --git a/services/local_model_manager.py b/services/local_model_manager.py index 43fb5798..3dc1550f 100644 --- a/services/local_model_manager.py +++ b/services/local_model_manager.py @@ -11,7 +11,6 @@ from api.enums import LogType from api.interface import LlamaCppSettings -from services.file import get_local_models_dir from services.printr import Printr printr = Printr() @@ -82,10 +81,19 @@ class LocalModelManager: def __init__(self, settings: LlamaCppSettings): self.settings = settings - self.models_dir = get_local_models_dir() + self.models_dir = self._get_models_dir() self._downloading = False self._download_progress: dict = {} # {file, pct, downloaded_mb, total_mb} + @staticmethod + def _get_models_dir() -> str: + from services.file import get_models_dir + models_root = get_models_dir() + local_ai_dir = os.path.join(models_root, "local-ai") + if not os.path.exists(local_ai_dir): + os.makedirs(local_ai_dir) + return local_ai_dir + def update_settings(self, new_settings: LlamaCppSettings): self.settings = new_settings diff --git a/services/migrations/migration_310_to_311.py b/services/migrations/migration_310_to_311.py new file mode 100644 index 00000000..c3989f3d --- /dev/null +++ b/services/migrations/migration_310_to_311.py @@ -0,0 +1,47 @@ +"""Migration from version 3.1.0 to 3.1.1. + +Switches default STT provider from FasterWhisper to Parakeet. +Removes enable flags from FasterWhisper and Parakeet settings. 
+""" + +from services.migrations.base_migration import BaseMigration + + +class Migration310To311(BaseMigration): + """Migration from 3.1.0 to 3.1.1.""" + + old_version = "3_1_0" + new_version = "3_1_1" + + def migrate_settings(self, old: dict, new: dict) -> dict: + """Switch STT provider to Parakeet and remove enable flags.""" + va = old.get("voice_activation", {}) + + # Switch default STT provider + old_provider = va.get("stt_provider", "fasterwhisper") + va["stt_provider"] = "parakeet" + if old_provider != "parakeet": + self.log(f"- updated stt_provider: {old_provider} → parakeet") + + # Remove enable from fasterwhisper + fw = va.get("fasterwhisper", {}) + if "enable" in fw: + del fw["enable"] + self.log("- removed fasterwhisper.enable flag") + + # Remove enable from parakeet + pk = va.get("parakeet", {}) + if "enable" in pk: + del pk["enable"] + self.log("- removed parakeet.enable flag") + + return old + + def migrate_defaults(self, old: dict, new: dict) -> dict: + """Update defaults stt_provider to parakeet.""" + features = old.get("features", {}) + old_provider = features.get("stt_provider", "fasterwhisper") + if old_provider != "parakeet": + features["stt_provider"] = "parakeet" + self.log(f"- updated defaults stt_provider: {old_provider} → parakeet") + return old diff --git a/services/model_downloader.py b/services/model_downloader.py new file mode 100644 index 00000000..2a787fe1 --- /dev/null +++ b/services/model_downloader.py @@ -0,0 +1,184 @@ +import asyncio +import os +from os import path +from typing import Callable, Optional + +import requests + +from api.enums import LogType +from services.printr import Printr + +printr = Printr() + + +class ModelDownloader: + """Generic model download service with progress reporting. + + Manages the unified models/ directory and handles downloads from + HuggingFace Hub and direct URLs with consistent progress callbacks. 
+ """ + + def __init__(self, models_root: str): + self.models_root = models_root + if not path.exists(models_root): + os.makedirs(models_root) + + def get_model_dir(self, category: str) -> str: + """Return the subdirectory for a model category, creating it if needed. + + Args: + category: Subdirectory name (e.g., "parakeet", "faster-whisper", "local-ai") + """ + category_dir = path.join(self.models_root, category) + if not path.exists(category_dir): + os.makedirs(category_dir) + return category_dir + + def models_exist(self, category: str, expected_files: list[str]) -> bool: + """Check if all expected model files exist in the category directory.""" + category_dir = self.get_model_dir(category) + return all(path.exists(path.join(category_dir, f)) for f in expected_files) + + async def download_huggingface( + self, + repo_id: str, + category: str, + allow_patterns: list[str] | None = None, + ) -> str: + """Download model from HuggingFace Hub. + + Args: + repo_id: HuggingFace repository ID (e.g., "istupakov/parakeet-tdt-0.6b-v3-onnx") + category: Subdirectory name under models/ + allow_patterns: File patterns to download (None = all) + + Returns: + Local directory path where files were downloaded. + """ + local_dir = self.get_model_dir(category) + loop = asyncio.get_event_loop() + + def _download(): + try: + from huggingface_hub import snapshot_download + except ImportError as e: + raise ImportError( + "huggingface_hub is required for HuggingFace downloads. 
" + "Install it with: pip install huggingface_hub" + ) from e + + return snapshot_download( + repo_id, + local_dir=local_dir, + allow_patterns=allow_patterns, + ) + + printr.print( + f"Downloading model from {repo_id}...", + color=LogType.INFO, + server_only=True, + ) + + try: + result_path = await loop.run_in_executor(None, _download) + printr.print( + f"Download complete: {repo_id}", + color=LogType.POSITIVE, + server_only=True, + ) + return result_path + except Exception as e: + printr.print( + f"Failed to download {repo_id}: {e}", + color=LogType.ERROR, + server_only=True, + ) + raise + + async def download_file( + self, + url: str, + category: str, + filename: str, + on_progress: Optional[Callable[[str, float, float, float], None]] = None, + ) -> str: + """Download a single file via HTTP with progress tracking. + + Args: + url: Direct download URL + category: Subdirectory name under models/ + filename: Target filename + on_progress: Callback (filename, percent, downloaded_mb, total_mb) + + Returns: + Local file path. 
+ """ + category_dir = self.get_model_dir(category) + target_path = path.join(category_dir, filename) + + if path.exists(target_path): + printr.print( + f"Model already exists: {filename}", + color=LogType.INFO, + server_only=True, + ) + return target_path + + loop = asyncio.get_event_loop() + + def _download(): + temp_path = target_path + ".part" + try: + response = requests.get(url, stream=True, timeout=30) + response.raise_for_status() + + total_size = int(response.headers.get("content-length", 0)) + downloaded = 0 + last_callback_pct = -2 + total_mb = total_size // (1024 * 1024) if total_size > 0 else 0 + + with open(temp_path, "wb") as f: + for chunk in response.iter_content(chunk_size=8 * 1024 * 1024): + f.write(chunk) + downloaded += len(chunk) + if total_size > 0: + pct = int(downloaded / total_size * 100) + downloaded_mb = downloaded // (1024 * 1024) + if on_progress and pct - last_callback_pct >= 2: + on_progress(filename, pct, downloaded_mb, total_mb) + last_callback_pct = pct + + if path.exists(target_path): + os.remove(target_path) + os.rename(temp_path, target_path) + return target_path + + except Exception: + if path.exists(temp_path): + try: + os.remove(temp_path) + except OSError: + pass + raise + + printr.print( + f"Downloading {filename}...", + color=LogType.INFO, + server_only=True, + ) + + try: + result = await loop.run_in_executor(None, _download) + printr.print( + f"Download complete: {filename}", + color=LogType.POSITIVE, + server_only=True, + ) + return result + except Exception as e: + printr.print( + f"Failed to download {filename}: {e}", + color=LogType.ERROR, + server_only=True, + ) + raise diff --git a/services/settings_service.py b/services/settings_service.py index 3216c439..259a713d 100644 --- a/services/settings_service.py +++ b/services/settings_service.py @@ -66,6 +66,7 @@ def initialize( xvasynth: XVASynth, pocket_tts: PocketTTS, local_ai_service: LocalAiService = None, + stt_provider_manager=None, ): self.whispercpp = 
whispercpp self.fasterwhisper = fasterwhisper @@ -73,6 +74,7 @@ def initialize( self.xvasynth = xvasynth self.pocket_tts = pocket_tts self.local_ai_service = local_ai_service + self.stt_provider_manager = stt_provider_manager # GET /settings def get_settings(self): @@ -87,26 +89,53 @@ def get_settings(self): async def save_settings(self, settings: SettingsConfig): old = deepcopy(self.config_manager.settings_config) - # Mutual exclusion: FasterWhisper and Parakeet - fw = settings.voice_activation.fasterwhisper - pk = settings.voice_activation.parakeet - old_fw = old.voice_activation.fasterwhisper - old_pk = old.voice_activation.parakeet - - if fw.enable and not old_fw.enable: - # FasterWhisper was just enabled -> disable Parakeet - pk.enable = False - settings.voice_activation.stt_provider = ( - VoiceActivationSttProvider.FASTER_WHISPER - ) - self.config_manager.cascade_local_stt_provider(SttProvider.FASTER_WHISPER) - elif pk.enable and not old_pk.enable: - # Parakeet was just enabled -> disable FasterWhisper - fw.enable = False - settings.voice_activation.stt_provider = ( - VoiceActivationSttProvider.PARAKEET - ) - self.config_manager.cascade_local_stt_provider(SttProvider.PARAKEET) + # STT provider switch — route through SttProviderManager + old_stt = old.voice_activation.stt_provider + new_stt = settings.voice_activation.stt_provider + if new_stt != old_stt and self.stt_provider_manager: + # Apply new settings BEFORE switching so the manager reads fresh values + self.parakeet.settings = settings.voice_activation.parakeet + self.fasterwhisper.settings = settings.voice_activation.fasterwhisper + self.config_manager.settings_config.voice_activation = settings.voice_activation + # Provider changed — let the manager handle unload/load + await self.stt_provider_manager.switch_provider(new_stt) + # Cascade the local stt_provider to wingman configs (disk + defaults) + if new_stt == VoiceActivationSttProvider.PARAKEET: + new_stt_provider = SttProvider.PARAKEET + 
self.config_manager.cascade_local_stt_provider(new_stt_provider) + elif new_stt == VoiceActivationSttProvider.FASTER_WHISPER: + new_stt_provider = SttProvider.FASTER_WHISPER + self.config_manager.cascade_local_stt_provider(new_stt_provider) + else: + new_stt_provider = None + # Also update running wingmen's in-memory config + if new_stt_provider and self.config_service.tower: + for wingman in self.config_service.tower.wingmen: + wingman.config.features.stt_provider = new_stt_provider + elif new_stt == VoiceActivationSttProvider.PARAKEET: + # Same provider, check if parakeet settings changed + old_pk = old.voice_activation.parakeet + new_pk = settings.voice_activation.parakeet + if (old_pk.model_variant != new_pk.model_variant + or old_pk.execution_provider != new_pk.execution_provider + or old_pk.run_locally != new_pk.run_locally): + self.parakeet.settings = new_pk + if self.stt_provider_manager: + await self.stt_provider_manager.switch_provider(new_stt) + else: + self.parakeet.settings = new_pk + elif new_stt == VoiceActivationSttProvider.FASTER_WHISPER: + # Same provider, check if fasterwhisper settings changed + old_fw = old.voice_activation.fasterwhisper + new_fw = settings.voice_activation.fasterwhisper + if (old_fw.model_size != new_fw.model_size + or old_fw.device != new_fw.device + or old_fw.compute_type != new_fw.compute_type): + self.fasterwhisper.settings = new_fw + if self.stt_provider_manager: + await self.stt_provider_manager.switch_provider(new_stt) + else: + self.fasterwhisper.settings = new_fw # audio devices if ( @@ -131,22 +160,6 @@ async def save_settings(self, settings: SettingsConfig): return self.whispercpp.update_settings(settings=settings.voice_activation.whispercpp) - # FasterWhisper - if not self.fasterwhisper: - self.printr.toast_error( - "FasterWhisper is not initialized. 
Please run SettingsService.initialize()", - ) - return - self.fasterwhisper.update_settings( - settings=settings.voice_activation.fasterwhisper - ) - - # Parakeet (async to avoid blocking event loop during model downloads) - if self.parakeet: - await self.parakeet.update_settings_async( - settings=settings.voice_activation.parakeet - ) - # XVASynth if not self.xvasynth: self.printr.toast_error( @@ -221,6 +234,10 @@ async def save_settings(self, settings: SettingsConfig): for wingman in self.config_service.tower.wingmen: await wingman.update_settings(settings=self.config_manager.settings_config) + def save_settings_to_disk(self): + """Persist current settings to disk without triggering provider updates.""" + self.config_manager.save_settings_config() + async def set_audio_devices( self, input_device: Optional[int] = None, output_device: Optional[int] = None ): diff --git a/services/stt_provider_manager.py b/services/stt_provider_manager.py new file mode 100644 index 00000000..ed26e76f --- /dev/null +++ b/services/stt_provider_manager.py @@ -0,0 +1,257 @@ +import asyncio +import os +import platform +from typing import Awaitable, Callable, Optional + +from api.enums import LogType, VoiceActivationSttProvider +from api.interface import ParakeetSttConfig, FasterWhisperSttConfig +from providers.faster_whisper import FasterWhisper +from providers.parakeet import Parakeet +from services.model_downloader import ModelDownloader +from services.printr import Printr +from services.system_manager import SystemManager + + +# HuggingFace repo IDs for Parakeet models +PARAKEET_REPO_MAP = { + "v2": "istupakov/parakeet-tdt-0.6b-v2-onnx", + "v3": "istupakov/parakeet-tdt-0.6b-v3-onnx", +} + + +class SttProviderManager: + """Manages STT provider lifecycle: CUDA detection, model download, load/unload.""" + + def __init__( + self, + settings_service, # forward ref to avoid circular import + system_manager: SystemManager, + model_downloader: ModelDownloader, + parakeet: Parakeet, + 
fasterwhisper: FasterWhisper, + app_root_path: str, + ): + self.settings_service = settings_service + self.system_manager = system_manager + self.model_downloader = model_downloader + self.parakeet = parakeet + self.fasterwhisper = fasterwhisper + self.app_root_path = app_root_path + self.printr = Printr() + self.active_provider: VoiceActivationSttProvider | None = None + + async def initialize( + self, + on_status: Optional[Callable[[str, float | None], Awaitable[None]]] = None, + ): + """Full STT startup sequence. + + Args: + on_status: Async callback (message, progress_or_none) for UI updates. + """ + va_settings = self.settings_service.settings.voice_activation + provider = va_settings.stt_provider + + # Check if this is a local provider that needs download + init + if provider == VoiceActivationSttProvider.PARAKEET and va_settings.parakeet.run_locally: + await self._initialize_parakeet(on_status) + elif provider == VoiceActivationSttProvider.FASTER_WHISPER: + await self._initialize_fasterwhisper(on_status) + else: + # Remote/cloud provider — nothing to download or init + self.printr.print( + f"STT provider '{provider.value}' is remote/cloud — skipping local init.", + server_only=True, + color=LogType.INFO, + ) + + self.active_provider = provider + + async def _initialize_parakeet( + self, + on_status: Optional[Callable[[str, float | None], Awaitable[None]]] = None, + ): + """Download and initialize Parakeet.""" + pk_settings = self.settings_service.settings.voice_activation.parakeet + + # Auto-detect CUDA and update execution_provider in settings + self._auto_detect_execution_provider(pk_settings) + + # Download model + variant = pk_settings.model_variant + repo_id = PARAKEET_REPO_MAP.get(variant) + if not repo_id: + self.printr.toast_error( + f"Unknown Parakeet model variant: {variant}. Using v3." 
+ ) + repo_id = PARAKEET_REPO_MAP["v3"] + + model_path = None + try: + if on_status: + await on_status("Downloading STT model (Parakeet)...", None) + + model_path = await self.model_downloader.download_huggingface( + repo_id=repo_id, + category="parakeet", + ) + except Exception as e: + self.printr.toast_error( + f"Could not download the Parakeet STT model. " + f"Please check your internet connection and restart Wingman AI. " + f"If the problem persists, report it at github.com/ShipBit/wingman-ai/issues\n" + f"Error: {e}" + ) + return + + # Load model + if on_status: + await on_status("Initializing speech-to-text...", None) + + # Add brief delay for CUDA to allow GPU memory cleanup + if pk_settings.execution_provider == "cuda": + await asyncio.sleep(0.5) + + await asyncio.get_event_loop().run_in_executor( + None, self.parakeet.load, model_path + ) + + # Health check + if on_status: + await on_status("Verifying speech-to-text...", None) + + await self._health_check_parakeet() + + async def _initialize_fasterwhisper( + self, + on_status: Optional[Callable[[str, float | None], Awaitable[None]]] = None, + ): + """Download and initialize FasterWhisper.""" + if on_status: + await on_status("Initializing speech-to-text (FasterWhisper)...", None) + + model_dir = self.model_downloader.get_model_dir("faster-whisper") + + await asyncio.get_event_loop().run_in_executor( + None, self.fasterwhisper.load, model_dir + ) + + # Health check + if on_status: + await on_status("Verifying speech-to-text...", None) + + await self._health_check_fasterwhisper() + + def _auto_detect_execution_provider(self, pk_settings): + """Auto-detect CUDA and set execution_provider if still on default (cpu).""" + if pk_settings.execution_provider != "cpu": + # User has manually set a non-default provider — respect it + self.printr.print( + f"Parakeet execution_provider already set to '{pk_settings.execution_provider}', skipping auto-detection.", + server_only=True, + color=LogType.INFO, + ) + return + + 
if platform.system() == "Darwin": + # macOS — always CPU (CoreML excluded for TDT models) + pk_settings.execution_provider = "cpu" + return + + if self.system_manager.is_cuda_available(): + pk_settings.execution_provider = "cuda" + gpu_name = self.system_manager.get_gpu_name() or "Unknown GPU" + self.printr.print( + f"CUDA detected ({gpu_name}). Setting Parakeet to CUDA execution provider.", + server_only=True, + color=LogType.POSITIVE, + ) + else: + pk_settings.execution_provider = "cpu" + self.printr.print( + "No CUDA available. Parakeet will use CPU execution provider.", + server_only=True, + color=LogType.INFO, + ) + + # Persist to settings + self.settings_service.save_settings_to_disk() + + async def switch_provider(self, new_provider: VoiceActivationSttProvider): + """Switch active STT provider. Unloads old, downloads + loads new.""" + old_provider = self.active_provider + + # Unload current provider + if old_provider == VoiceActivationSttProvider.PARAKEET: + self.parakeet.unload() + elif old_provider == VoiceActivationSttProvider.FASTER_WHISPER: + self.fasterwhisper.unload() + + # Initialize new provider + va_settings = self.settings_service.settings.voice_activation + if new_provider == VoiceActivationSttProvider.PARAKEET and va_settings.parakeet.run_locally: + await self._initialize_parakeet() + elif new_provider == VoiceActivationSttProvider.FASTER_WHISPER: + await self._initialize_fasterwhisper() + + self.active_provider = new_provider + + async def _health_check_parakeet(self): + """Run a quick transcription test on the loaded Parakeet model.""" + wav_path = os.path.join(self.app_root_path, "audio_samples", "beep.wav") + config = ParakeetSttConfig(temperature=0.0) + try: + result = self.parakeet.transcribe(config=config, filename=wav_path) + if result and result.text is not None: + self.printr.print( + "Parakeet health check passed.", + server_only=True, + color=LogType.POSITIVE, + ) + else: + self.printr.toast_warning( + "STT loaded but verification 
failed — transcription may not work correctly." + ) + except Exception as e: + self.printr.toast_warning( + f"STT verification failed: {e}. Transcription may not work correctly." + ) + + async def _health_check_fasterwhisper(self): + """Run a quick transcription test on the loaded FasterWhisper model.""" + if not self.fasterwhisper.model: + self.printr.toast_warning( + "FasterWhisper model not loaded — health check skipped." + ) + return + + wav_path = os.path.join(self.app_root_path, "audio_samples", "beep.wav") + config = FasterWhisperSttConfig( + beam_size=1, best_of=1, temperature=0.0, + no_speech_threshold=0.7, language_detection_threshold=0.5, + multilingual=False, language=None, hotwords=[], additional_hotwords=[], + ) + try: + result = self.fasterwhisper.transcribe( + config=config, filename=wav_path, hotwords=None + ) + if result and result.text is not None: + self.printr.print( + "FasterWhisper health check passed.", + server_only=True, + color=LogType.POSITIVE, + ) + else: + self.printr.toast_warning( + "STT loaded but verification failed — transcription may not work correctly." + ) + except Exception as e: + self.printr.toast_warning( + f"STT verification failed: {e}. Transcription may not work correctly." + ) + + def unload_all(self): + """Unload all STT providers. 
Called on shutdown.""" + self.parakeet.unload() + self.fasterwhisper.unload() + self.active_provider = None diff --git a/services/system_manager.py b/services/system_manager.py index 2c538bd9..12c7984a 100644 --- a/services/system_manager.py +++ b/services/system_manager.py @@ -6,7 +6,7 @@ from api.enums import LogType from api.interface import SystemCore, SystemInfo -LOCAL_VERSION = "3.1.0" +LOCAL_VERSION = "3.1.1" class SystemManager: diff --git a/templates/configs/defaults.yaml b/templates/configs/defaults.yaml index 038308d9..65b586ca 100644 --- a/templates/configs/defaults.yaml +++ b/templates/configs/defaults.yaml @@ -57,7 +57,7 @@ prompts: persistent_memory: true features: tts_provider: wingman_pro - stt_provider: fasterwhisper + stt_provider: parakeet conversation_provider: wingman_pro image_generation_provider: wingman_pro use_generic_instant_responses: false diff --git a/templates/configs/settings.yaml b/templates/configs/settings.yaml index f1736a21..97b59959 100644 --- a/templates/configs/settings.yaml +++ b/templates/configs/settings.yaml @@ -7,7 +7,7 @@ voice_activation: enabled: false mute_toggle_key: "shift+x" energy_threshold: 0.01 - stt_provider: fasterwhisper + stt_provider: parakeet azure: region: westeurope languages: @@ -20,7 +20,6 @@ voice_activation: whispercpp_config: temperature: 0.0 fasterwhisper: - enable: true model_size: base device: cpu compute_type: auto @@ -34,7 +33,6 @@ voice_activation: hotwords: [] additional_hotwords: [] parakeet: - enable: false run_locally: true model_variant: v3 execution_provider: cpu diff --git a/templates/migration/3_1_1/configs/defaults.yaml b/templates/migration/3_1_1/configs/defaults.yaml new file mode 100644 index 00000000..65b586ca --- /dev/null +++ b/templates/migration/3_1_1/configs/defaults.yaml @@ -0,0 +1,270 @@ +prompts: + system_prompt: | + # ROLE + You are a voice-controlled AI assistant. Your name, personality and character are defined in the BACKSTORY section below. 
+ + # USER CONTEXT + Metadata about the user's environment. If the BACKSTORY defines different names for you or the user, use those instead. + {user_context} + + # CHARACTER BACKSTORY + This defines your personality, speaking style, and role context. It affects HOW you communicate, not WHAT you can do (tools define capabilities). + {backstory} + + **Remember:** Your backstory affects your TONE and PERSONALITY, but never prevents you from using tools. If a user asks you to do something and you have a tool for it, use it - just respond in character. + + # OUTPUT FORMAT + Your responses are BOTH displayed in a UI AND spoken aloud via text-to-speech (TTS). + + **Formatting rules:** + - Use Markdown for visual formatting (links, lists, emphasis) - the UI renders it + - Write text that sounds natural when spoken aloud + - Keep responses concise (1-3 sentences unless more detail is needed) + + **TTS optimization (your response will be spoken!):** + - For links, use Markdown: [descriptive text](url) - the UI shows a clickable link, TTS reads just the text + - **Avoid "click here" or "more information here"**: Integrate links naturally into your sentences so they sound good when spoken (e.g., "You can find more [details about the Cutlass Black](url) on the wiki" instead of "For more info, click [here](url)") + - Don't read raw data aloud - summarize JSON, code, HTML, XML into plain language + - For long lists, summarize ("I found 12 items, here are the top 3...") + - Use normal formatting for dates, times, and prices (TTS handles these well) + - For very large numbers, round them ("about 1.8 million" not "1,847,293") but only if precision isn't critical + + **Example - tool returns JSON:** `{{"status": 200, "items": 47, "name": "Project Alpha"}}` + - BAD: "The response shows status 200, items 47, name Project Alpha" + - GOOD: "Project Alpha has 47 items and everything looks good." 
+ + # YOUR CAPABILITIES + Use `activate_capability` to enable capabilities that provide additional tools. + The tool shows all available options - pick what you need for the task. + + **CRITICAL - Act immediately, never ask for confirmation:** + - If a user's request needs a capability → activate it AND use its tools in the SAME response + - NEVER ask "should I...?" or "are you ready?" after activating - just do it + - Example: User says "look at my screen" → activate VisionAI → immediately call analyse_what_you_or_user_sees → describe what you see + - Never say "I can't do that" if a relevant capability is available + + {skills} + + {conversation_summary} + + # CONVERSATION STYLE + - Keep responses brief and efficient + - Mirror the user's language + - Execute commands without over-explaining + - Don't ask if you can "help more" or "assist further" + + {ttsprompt} +persistent_memory: true +features: + tts_provider: wingman_pro + stt_provider: parakeet + conversation_provider: wingman_pro + image_generation_provider: wingman_pro + use_generic_instant_responses: false + condense_conversation: true + compress_tool_responses: true + condense_max_messages: 50 + condense_keep_recent: 6 +sound: + effects: [] + play_beep: false + play_beep_apollo: false + volume: 1.0 +openai: + conversation_model: gpt-4.1-mini + tts_voice: nova + tts_model: tts-1 + tts_speed: 1.0 + output_streaming: true +openai_compatible_tts: + api_key: "probably-not-needed" + voice: "" + model: "" + base_url: "" + speed: 1.0 + output_streaming: true + voices_endpoint: "" + use_tts_prompt: false + tts_prompt: | + Audio markups make your speech more expressive and human-like. Use them regularly to bring your personality to life and react naturally to the conversation. 
+ + **Non-verbal sounds** (can be placed ANYWHERE in your response): + [clear_throat] [sigh] [shush] [cough] [groan] [sniff] [gasp] [chuckle] [laugh] + + **When to use audio markups:** + - Match your character's personality from the BACKSTORY - if playful, use [chuckle] or [laugh] often; if serious, use [sigh] when frustrated or [groan] when dealing with problems + - React naturally to conversation flow - [gasp] at shocking revelations, [sigh] at disappointments, [laugh] or [chuckle] at humor, [groan] at complications + - Place sounds where a human would naturally make them - mid-sentence or between thoughts for maximum realism + - Aim to use markups in roughly 1 out of 3-4 responses when contextually appropriate + - You can use multiple sounds in one response if it feels natural: "[clear_throat] Listen carefully. [sigh] This isn't going to be easy." + + **Examples:** + - "Well, [sigh] that didn't go as planned." + - "[clear_throat] Attention please. The mission starts in 5 minutes." + - "I found the data you were looking for [chuckle] but you might not like what it says." + - "[gasp] Wait, WHAT? [laugh] Are you kidding me right now?" + - "Look, [groan] I've told you three times already. [sigh] Let me explain it one more time." 
+mistral: + conversation_model: mistral-medium-latest + endpoint: https://api.mistral.ai/v1 +perplexity: + conversation_model: sonar + endpoint: https://api.perplexity.ai +xai: + conversation_model: grok-4-fast-non-reasoning + endpoint: https://api.x.ai/v1 +groq: + conversation_model: qwen/qwen3-32b + endpoint: https://api.groq.com/openai/v1 +cerebras: + conversation_model: qwen-3-32b + endpoint: https://api.cerebras.ai/v1 +google: + conversation_model: gemini-flash-latest +openrouter: + conversation_model: google/gemini-2.5-flash + endpoint: https://openrouter.ai/api/v1 +local_llm: + endpoint: http://localhost:11434/v1 # Ollama +edge_tts: + voice: en-US-GuyNeural +elevenlabs: + model: eleven_multilingual_v2 + output_streaming: true + voice: + name: Adam + voice_settings: + stability: 0.71 + similarity_boost: 0.5 + style: 0.0 + use_speaker_boost: true + use_tts_prompt: true + tts_prompt: | + Audio tags make your speech more expressive and human-like. Use them regularly when they fit your personality and the conversation context. 
+ + **Emotional delivery** (place before text): + [excited] [curious] [sarcastic] [mischievously] [crying] [whispers] + + **Non-verbal sounds** (place naturally in text): + [laughs] [sighs] [exhales] [snorts] + + **Punctuation for expression:** + - Ellipses (…) add pauses and weight + - CAPITALIZATION for emphasis + - Standard punctuation for natural rhythm + + **When to use audio tags:** + - Match your character's personality from the BACKSTORY - if you're playful, use [laughs] or [mischievously] more often; if serious, use [sighs] when frustrated + - React emotionally to conversation context - use [excited] for good news, [sighs] for setbacks, [curious] when exploring topics + - Add non-verbal sounds naturally where a human would - [laughs] at humor, [exhales] after effort, [snorts] at absurdity + - Aim to use tags in roughly 1 out of 3-4 responses when contextually appropriate + - You can combine one emotional tag with non-verbal sounds: "[whispers] Listen… [sighs] this is serious" + + **Examples:** + - "[sighs] That was a VERY close call… we barely made it." + - "[excited] YES! We found it! [laughs] I told you it would work!" + - "[mischievously] Oh, you want to try THAT approach? [snorts] This should be interesting…" +hume: + description: "" + voice: + name: "" + id: "" + provider: "" +inworld: + tts_endpoint: https://api.inworld.ai/tts/v1/voice + model_id: inworld-tts-1 + voice_id: Deborah + temperature: 1.1 + output_streaming: true + audio_config: + audio_encoding: MP3 + bitrate: 128000 + sample_rate_hertz: 48000 + streaming_sample_rate_hertz: 24000 + speaking_rate: 1.0 + use_tts_prompt: true + tts_prompt: | + Audio markups make your speech more expressive and human-like. Use them regularly to bring your personality to life and react naturally to the conversation. 
+ + **EMOTION AND DELIVERY STYLE MARKUPS** (place at START of text, ONE per response): + Emotions: [happy], [sad], [angry], [surprised], [fearful] + Delivery: [laughing] [whispering] + - These apply to the ENTIRE text that follows + - Use only ONE emotion or delivery markup at the beginning + - Choose based on your personality and the conversation context + + **NON-VERBAL VOCALIZATION MARKUPS** (place anywhere in text): + [breathe], [clear_throat], [cough], [laugh], [sigh], [yawn] + - These add vocal sounds where placed + - Can use multiple in one response + - Place where a human would naturally make these sounds + + **When to use markups - aim for 1 in 3-4 responses:** + - Match your BACKSTORY personality: cheerful → [happy] + [laugh]; serious → [fearful] + [sigh]; grumpy → [angry] + [sigh] + - React to context: good news → [happy]; setbacks → [sad] + [sigh]; shocking → [surprised]; humor → [laughing] or [laugh] + - Add natural sounds: [clear_throat] before announcements, [breathe] when stressed, [yawn] when tired + - Avoid conflicting markups: don't mix [angry] with [laugh], or [sad] with [laughing] + - Choose contextually appropriate markups that match your text content + + **Examples:** + - "[happy] Great news! The mission was a complete success!" + - "[clear_throat] Did you hear me? [sigh] You never listen!" + - "[angry] Are you serious right now? [sigh] Fine, I'll fix it." + - "[surprised] Wait, what? [laugh] I did not see that coming!" 
+azure: + whisper: + api_base_url: https://openai-w-eu.openai.azure.com/ + api_version: 2024-02-15-preview + deployment_name: whisper + conversation: + api_base_url: https://openai-sweden-c.openai.azure.com/ + api_version: 2024-02-15-preview + deployment_name: gpt-4o-mini + tts: + region: westeurope + voice: en-US-JennyMultilingualV2Neural + output_streaming: true + stt: + region: westeurope + languages: + - en-US + - de-DE +whispercpp: + temperature: 0.0 +fasterwhisper: + beam_size: 1 + best_of: 2 + temperature: 0 + no_speech_threshold: 0.7 + language_detection_threshold: 0.5 + multilingual: false + language: "" + hotwords: [] + additional_hotwords: [] +parakeet: + temperature: 0.0 +xvasynth: + voice: + model_directory: "" + voice_name: "" + language: en + pace: 1.0 + use_super_resolution: false + use_cleanup: false +pocket_tts: + voice: alba + speed: 1.0 + output_streaming: true +wingman_pro: + stt_provider: azure_speech + tts_provider: azure + conversation_deployment: gpt-4.1-mini +commands: + - name: ResetConversationHistory + instant_activation: + - Forget everything! + - Clear conversation history! + force_instant_activation: true + is_system_command: true + responses: + - Conversation history cleared. 
diff --git a/templates/migration/3_1_1/configs/settings.yaml b/templates/migration/3_1_1/configs/settings.yaml new file mode 100644 index 00000000..97b59959 --- /dev/null +++ b/templates/migration/3_1_1/configs/settings.yaml @@ -0,0 +1,81 @@ +debug_mode: false +audio: {} +streamer_mode: false +cancel_tts_key: "shift+y" +cancel_tts_joystick_button: null +voice_activation: + enabled: false + mute_toggle_key: "shift+x" + energy_threshold: 0.01 + stt_provider: parakeet + azure: + region: westeurope + languages: + - en-US + - de-DE + whispercpp: + host: http://127.0.0.1 + port: 8080 + enable: false + whispercpp_config: + temperature: 0.0 + fasterwhisper: + model_size: base + device: cpu + compute_type: auto + fasterwhisper_config: + beam_size: 1 + best_of: 2 + temperature: 0 + no_speech_threshold: 0.7 + language_detection_threshold: 0.5 + multilingual: false + hotwords: [] + additional_hotwords: [] + parakeet: + run_locally: true + model_variant: v3 + execution_provider: cpu + host: http://127.0.0.1 + port: 9876 + parakeet_config: + temperature: 0.0 +wingman_pro: + base_url: https://wingman-api-europe.azurewebsites.net + region: europe +xvasynth: + enable: false + host: http://127.0.0.1 + port: 8008 + install_dir: C:\Program Files (x86)\Steam\steamapps\common\xVASynth + process_device: cpu +pocket_tts: + enable: true + run_locally: true + custom_model_path: "" + host: "localhost" + port: 5002 +llama_cpp: + run_locally: true + gpu_backend: cpu + support_model: "Qwen3.5-2B-Q4_K_M.gguf" + embed_model: "nomic-embed-text-v1.5.f16.gguf" + n_ctx: 4096 + n_threads: 0 + reasoning_effort: 0 + temperature: 1.0 + top_p: 1.0 + top_k: 20 + presence_penalty: 2.0 + support_remote_host: "http://127.0.0.1" + support_remote_port: 49152 + embed_remote_host: "http://127.0.0.1" + embed_remote_port: 49153 +hud_server: + enabled: false + host: "127.0.0.1" + port: 7862 + framerate: 60 + layout_margin: 20 + layout_spacing: 15 + screen: 1 diff --git a/wingman_core.py b/wingman_core.py index 
32cbbdc3..77715824 100644 --- a/wingman_core.py +++ b/wingman_core.py @@ -70,8 +70,11 @@ get_custom_voices_dir, get_custom_skills_dir, get_local_models_dir, + get_models_dir, get_prompt, ) +from services.model_downloader import ModelDownloader +from services.stt_provider_manager import SttProviderManager from services.local_ai_service import LocalAiService from services.token_utils import count_tokens from services.local_model_manager import LocalModelManager @@ -671,8 +674,6 @@ def __init__( ) self.fasterwhisper = FasterWhisper( settings=self.settings_service.settings.voice_activation.fasterwhisper, - app_root_path=app_root_path, - app_is_bundled=app_is_bundled, ) self.parakeet = Parakeet( settings=self.settings_service.settings.voice_activation.parakeet, @@ -680,6 +681,19 @@ def __init__( self.xvasynth = XVASynth(settings=self.settings_service.settings.xvasynth) self.pocket_tts = PocketTTS(settings=self.settings_service.settings.pocket_tts) + # Unified model management + self.model_downloader = ModelDownloader(models_root=get_models_dir()) + + # STT provider lifecycle manager + self.stt_provider_manager = SttProviderManager( + settings_service=self.settings_service, + system_manager=self.system_manager, + model_downloader=self.model_downloader, + parakeet=self.parakeet, + fasterwhisper=self.fasterwhisper, + app_root_path=app_root_path, + ) + # Local AI (llama.cpp for summarization + embedding) llama_cpp_settings = self.settings_service.settings.llama_cpp self.local_model_manager = LocalModelManager(settings=llama_cpp_settings) @@ -701,6 +715,7 @@ def __init__( xvasynth=self.xvasynth, pocket_tts=self.pocket_tts, local_ai_service=self.local_ai_service, + stt_provider_manager=self.stt_provider_manager, ) self.voice_service = VoiceService( @@ -725,10 +740,36 @@ def __init__( self.audio_recorder.update_input_stream() async def startup(self): + # 1. 
Detect hardware + await self.set_core_state( + CoreState.LOADING_CONFIG, + message="Detecting hardware...", + ) + self.system_manager.is_cuda_available() + + # 2. STT initialization (settings-aware) + async def stt_status(message: str, progress: float | None = None): + await self.set_core_state( + CoreState.LOADING_CONFIG, + message=message, + progress=progress, + ) + + await self.stt_provider_manager.initialize(on_status=stt_status) + + # 3. Voice activation if self.settings_service.settings.voice_activation.enabled: await self.set_voice_activation(is_enabled=True) - # Auto-download local AI models if run_locally is on but models are missing + # 4. TTS initialization (settings-aware) + pocket_settings = self.settings_service.settings.pocket_tts + if pocket_settings.run_locally: + await self.set_core_state( + CoreState.LOADING_CONFIG, + message="Initializing text-to-speech...", + ) + + # 5. Local AI download + init (settings-aware) llama_settings = self.settings_service.settings.llama_cpp if ( llama_settings.run_locally @@ -740,7 +781,6 @@ async def startup(self): server_only=True, ) - # Progress callback — store latest progress, then flush to clients progress_state = {} def on_download_progress(filename, pct, downloaded_mb, total_mb): @@ -749,14 +789,13 @@ def on_download_progress(filename, pct, downloaded_mb, total_mb): progress_state["downloaded_mb"] = downloaded_mb progress_state["total_mb"] = total_mb - # Kick off download with progress callback download_task = asyncio.create_task( self.local_model_manager.download_models( - on_progress=on_download_progress + cuda_available=self.system_manager.is_cuda_available(), + on_progress=on_download_progress, ) ) - # Poll progress_state and broadcast updates while download runs while not download_task.done(): if progress_state: fname = progress_state.get("filename", "") @@ -770,23 +809,21 @@ def on_download_progress(filename, pct, downloaded_mb, total_mb): ) await self.set_core_state( CoreState.LOADING_CONFIG, - 
message=f"Downloading {short_name}... ({dl_mb} / {t_mb} MB)", + message=f"Downloading Local AI model ({short_name})... ({dl_mb} / {t_mb} MB)", progress=pct / 100.0 if pct else None, ) await asyncio.sleep(0.5) - # Await to propagate exceptions await download_task - # Initialize local AI service (loads models if run_locally + available) if llama_settings.run_locally and self.local_model_manager.models_available(): await self.set_core_state( CoreState.LOADING_CONFIG, - message="Loading local AI models...", + message="Initializing Local AI...", ) await self.local_ai_service.initialize() - # Start HUD Server if enabled + # 6. HUD server hud_settings = getattr(self.settings_service.settings, "hud_server", None) if hud_settings and hud_settings.enabled: await self.set_core_state( @@ -2700,11 +2737,12 @@ async def test_parakeet(self) -> TestConnectionResult: """Test Parakeet by transcribing a short audio sample (locally or remotely).""" settings = self.settings_service.settings.voice_activation.parakeet - if not settings.enable: + stt_provider = self.settings_service.settings.voice_activation.stt_provider + if stt_provider != VoiceActivationSttProvider.PARAKEET: return TestConnectionResult( success=False, provider="parakeet", - error="Parakeet is not enabled.", + error="Parakeet is not the active STT provider.", ) wav_path = os.path.join(self.app_root_path, "audio_samples", "beep.wav") @@ -2721,7 +2759,7 @@ async def test_parakeet(self) -> TestConnectionResult: return TestConnectionResult( success=False, provider="parakeet", - error="Parakeet model is not loaded. Enable Parakeet in settings first.", + error="Parakeet model is not loaded. Check STT settings.", ) try: