ShipBit · Shackless · Apr 7, 2026 · Apr 7, 2026 · Apr 7, 2026 · Apr 7, 2026
diff --git a/api/interface.py b/api/interface.py
@@ -127,7 +127,6 @@ class WhispercppSettings(BaseModel):
 
 class FasterWhisperSettings(BaseModel):
     model_config = ConfigDict(protected_namespaces=())
-    enable: bool = True
     """tiny, tiny.en, base, base.en, small, small.en, distil-small.en, medium, medium.en, distil-medium.en, large-v1, large-v2, large-v3, large, distil-large-v2, distil-large-v3, large-v3-turbo, or turbo"""
     model_size: str
     """default (model original), auto (fastest available on device), int8, int8_float16 etc. - see https://opennmt.net/CTranslate2/quantization.html#quantize-on-model-conversion"""
@@ -181,7 +180,6 @@ class FasterWhisperTranscript(BaseModel):
 
 
 class ParakeetSettings(BaseModel):
-    enable: bool
     run_locally: bool = True
     model_variant: str
     """v2 (English) or v3 (Multilingual, 25 languages)"""

diff --git a/docs/parakeet-issues.md b/docs/parakeet-issues.md
@@ -0,0 +1,62 @@
+# Parakeet STT Issues (2026-04-07)
+
+## Issue 1: macOS CoreML crash
+
+**Symptom**: Switching the execution provider to CoreML fails with `ONNXRuntimeError: model_path must not be empty` from `onnxruntime/core/optimizer/initializer.cc:45`.
+
+**Root cause**: The `NemoConformerAED` model class explicitly excludes CoreML in `onnx_asr/models/nemo.py:183`:
+```python
+def _get_excluded_providers() -> list[str]:
+    return [*TensorRtOptions.get_provider_names(), "CoreMLExecutionProvider"]
+```
+But `NemoConformerTdt` (the model Parakeet uses — `nemo-parakeet-tdt-0.6b-v2/v3`) inherits from `NemoConformerRnnt` which has **no CoreML exclusion**. So CoreML is passed to the ONNX session, but the TDT model uses external data files (`onnx?data` pattern in `loader.py:134`) that CoreML can't handle.
+
+**Fix options**:
+1. Exclude CoreML from TDT models (like AED does) — simplest, but removes the option
+2. Catch the error in `parakeet.py:__load_model_inner()` and fall back to CPU with a toast message
+3. Hide the CoreML option in the Client UI when the model variant doesn't support it (requires Core→Client communication of supported providers)
+
+## Issue 2: Windows CUDA doesn't reinitialize properly
+
+**Symptom**: Switching the execution provider to CUDA downloads the model, but STT doesn't work until the app is fully restarted.
+
+**Root cause (suspected)**: `del self.model` in `parakeet.py:101` only drops one Python reference to the old ONNX session. If anything else still references the session (or it is part of a reference cycle), it — and its CUDA GPU memory — stays alive until the garbage collector runs. When the new CUDA session tries to allocate GPU memory, the old one may still be holding it.
+
+**Fix options**:
+1. Force `gc.collect()` after unloading the model before loading the new one
+2. Add explicit ONNX session cleanup (call `self.model` internals to release sessions)
+3. Add a brief delay between unload and reload for CUDA specifically
+
+## Broader architecture issues
+
+- `__load_model_inner()` has a generic `except Exception` that toasts an error but leaves `self.model = None`. No retry, no CPU fallback.
+- No way for the user to recover without restarting the app if reload fails.
+- The `_loading` flag prevents transcription during reload, but there's no timeout or progress feedback.
+- The Preprocessor (`onnx_asr`) explicitly excludes CUDA — it always runs on CPU regardless of the selected provider. This is by design but may confuse users expecting full CUDA acceleration.
+
+## Key files
+
+### Core (wingman-ai)
+- `providers/parakeet.py` — Parakeet provider, model loading, settings update
+- `services/settings_service.py:144-148` — calls `parakeet.update_settings_async()`
+- `api/interface.py:183-192` — `ParakeetSettings` settings model (Pydantic `BaseModel`)
+
+### onnx_asr library (3rd party, in venv)
+- `onnx_asr/loader.py:187-357` — `load_model()`, downloads files, creates ONNX sessions
+- `onnx_asr/models/nemo.py:70-85` — `NemoConformerRnnt.__init__()` creates encoder/decoder sessions
+- `onnx_asr/models/nemo.py:181-183` — `NemoConformerAED._get_excluded_providers()` excludes CoreML
+- `onnx_asr/preprocessors/preprocessor.py` — Preprocessor, excludes CUDA
+- `onnx_asr/onnx.py:81-101` — `update_onnx_providers()` filters provider list
+
+### Client (wingman-client)
+- The execution provider dropdown is in the STT settings UI — may need to filter options per model/platform
+
+## Provider/model compatibility matrix
+
+| Provider | NemoConformerTdt (Parakeet) | NemoConformerAED | Preprocessor |
+|----------|---------------------------|------------------|-------------|
+| CPU | Yes | Yes | Yes |
+| DirectML | Yes | Yes | Yes |
+| CUDA | Yes | Yes | **Excluded** |
+| CoreML | **Crashes** (should exclude) | **Excluded** | ? |
+| TensorRT | Excluded | Excluded | Excluded |
diff --git a/providers/faster_whisper.py b/providers/faster_whisper.py
@@ -11,29 +11,42 @@
 )
 from services.printr import Printr
 
-MODELS_DIR = "faster-whisper-models"
-
 
 class FasterWhisper:
-    def __init__(
-        self,
-        settings: FasterWhisperSettings,
-        app_root_path: str,
-        app_is_bundled: bool,
-    ):
+    def __init__(self, settings: FasterWhisperSettings):
         self.printr = Printr()
         self.settings = settings
         self.model: Optional[WhisperModel] = None
 
-        # move one dir up, out of _internal (if bundled)
-        app_dir = path.dirname(app_root_path) if app_is_bundled else app_root_path
-        self.models_dir = path.join(app_dir, MODELS_DIR)
+    def load(self, model_dir: str):
+        """Load the FasterWhisper model. Called by SttProviderManager.
+
+        Args:
+            model_dir: Directory containing model files (from ModelDownloader).
+        """
+        self.unload()
+
+        model_file = path.join(model_dir, self.settings.model_size)
+        model = model_file if path.exists(model_file) else self.settings.model_size
 
-        if self.settings.enable:
-            self.__update_model()
+        try:
+            self.model = WhisperModel(
+                model,
+                device=self.settings.device,
+                compute_type=self.settings.compute_type,
+            )
+            self.printr.print(
+                f"FasterWhisper initialized with model '{model}' (device: '{self.settings.device}').",
+                server_only=True,
+                color=LogType.POSITIVE,
+            )
+        except Exception as e:
+            self.printr.toast_error(
+                f"Failed to initialize FasterWhisper with model {model_file}. Error: {e}"
+            )
 
-    def __unload_model(self):
-        """Unload the current model to free VRAM."""
+    def unload(self):
+        """Unload the current model to free VRAM. Called by SttProviderManager."""
         if self.model is not None:
             self.printr.print(
                 "FasterWhisper: Unloading current model to free VRAM...",
@@ -42,48 +55,22 @@ def __unload_model(self):
             del self.model
             self.model = None
 
-            # Force garbage collection to release memory
             gc.collect()
 
-            # Clear CUDA cache if using GPU
             try:
                 import torch
                 if torch.cuda.is_available():
                     torch.cuda.empty_cache()
                     torch.cuda.synchronize()
             except ImportError:
-                pass  # torch not available, skip CUDA cleanup
+                pass
             except Exception as e:
-                # any other CUDA-related cleanup error should not crash model reload
                 self.printr.print(
                     f"FasterWhisper: CUDA cleanup failed during model unload: {e}",
                     server_only=True,
                     color=LogType.WARNING,
                 )
 
-    def __update_model(self):
-        # Unload the existing model first to free VRAM
-        self.__unload_model()
-
-        model_file = path.join(self.models_dir, self.settings.model_size)
-        model = model_file if path.exists(model_file) else self.settings.model_size
-
-        try:
-            self.model = WhisperModel(
-                model,
-                device=self.settings.device,
-                compute_type=self.settings.compute_type,
-            )
-            self.printr.print(
-                f"FasterWhisper initialized with model '{model}' (device: '{self.settings.device}').",
-                server_only=True,
-                color=LogType.POSITIVE,
-            )
-        except Exception as e:
-            self.printr.toast_error(
-                f"Failed to initialize FasterWhisper with model {model_file}. Error: {e}"
-            )
-
     def transcribe(
         self,
         config: FasterWhisperSttConfig,
@@ -127,28 +114,5 @@ def transcribe(
 
         return None
 
-    def update_settings(self, settings: FasterWhisperSettings):
-        if self.settings == settings:
-            self.printr.print("FasterWhisper settings unchanged.", server_only=True)
-            return
-
-        was_enabled = self.settings.enable
-        self.settings = settings
-
-        if not settings.enable:
-            if was_enabled:
-                self.printr.print(
-                    "FasterWhisper disabled, unloading model...",
-                    server_only=True,
-                )
-                self.__unload_model()
-            return
-
-        self.printr.print(
-            "FasterWhisper settings updated, reloading model...",
-            server_only=True,
-        )
-        self.__update_model()
-
     def validate(self, errors: list[WingmanInitializationError]):
         pass