Skip to content
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions api/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,6 @@ class WhispercppSettings(BaseModel):

class FasterWhisperSettings(BaseModel):
model_config = ConfigDict(protected_namespaces=())
enable: bool = True
"""tiny, tiny.en, base, base.en, small, small.en, distil-small.en, medium, medium.en, distil-medium.en, large-v1, large-v2, large-v3, large, distil-large-v2, distil-large-v3, large-v3-turbo, or turbo"""
model_size: str
"""default (model original), auto (fastest available on device), int8, int8_float16 etc. - see https://opennmt.net/CTranslate2/quantization.html#quantize-on-model-conversion"""
Expand Down Expand Up @@ -181,7 +180,6 @@ class FasterWhisperTranscript(BaseModel):


class ParakeetSettings(BaseModel):
enable: bool
run_locally: bool = True
model_variant: str
"""v2 (English) or v3 (Multilingual, 25 languages)"""
Expand Down
62 changes: 62 additions & 0 deletions docs/parakeet-issues.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Parakeet STT Issues (2026-04-07)

## Issue 1: macOS CoreML crash

**Symptom**: Switching execution provider to CoreML gives `ONNXRuntimeError: model_path must not be empty` from `onnxruntime/core/optimizer/initializer.cc:45`.

**Root cause**: The `NemoConformerAED` model class explicitly excludes CoreML in `onnx_asr/models/nemo.py:183`:
```python
def _get_excluded_providers() -> list[str]:
    return [*TensorRtOptions.get_provider_names(), "CoreMLExecutionProvider"]
```
But `NemoConformerTdt` (the model Parakeet uses — `nemo-parakeet-tdt-0.6b-v2/v3`) inherits from `NemoConformerRnnt` which has **no CoreML exclusion**. So CoreML is passed to the ONNX session, but the TDT model uses external data files (`onnx?data` pattern in `loader.py:134`) that CoreML can't handle.

**Fix options**:
1. Exclude CoreML from TDT models (like AED does) — simplest, but removes the option
2. Catch the error in `parakeet.py:__load_model_inner()` and fall back to CPU with a toast message
3. Hide the CoreML option in the Client UI when the model variant doesn't support it (requires Core→Client communication of supported providers)

## Issue 2: Windows CUDA doesn't reinitialize properly

**Symptom**: Switching execution provider to CUDA downloads the model but STT doesn't work until full app restart.

**Root cause (suspected)**: `del self.model` in `parakeet.py:101` drops the Python reference to the old ONNX session, but that does not guarantee the CUDA GPU memory is released immediately: if anything else still references the session (or it is part of a reference cycle), it is only freed once Python's garbage collector runs. When the new CUDA session tries to allocate GPU memory, the old one may still be holding it.

**Fix options**:
1. Force `gc.collect()` after unloading the model before loading the new one
2. Add explicit ONNX session cleanup (reach into `self.model`'s internals and release its sessions explicitly)
3. Add a brief delay between unload and reload for CUDA specifically

## Broader architecture issues

- `__load_model_inner()` has a generic `except Exception` that toasts an error but leaves `self.model = None`. No retry, no CPU fallback.
- No way for the user to recover without restarting the app if reload fails.
- The `_loading` flag prevents transcription during reload, but there's no timeout or progress feedback.
- The Preprocessor (`onnx_asr`) explicitly excludes CUDA — it always runs on CPU regardless of the selected provider. This is by design but may confuse users expecting full CUDA acceleration.

## Key files

### Core (wingman-ai)
- `providers/parakeet.py` — Parakeet provider, model loading, settings update
- `services/settings_service.py:144-148` — calls `parakeet.update_settings_async()`
- `api/interface.py:183-192` — `ParakeetSettings` Pydantic model (`BaseModel` subclass)

### onnx_asr library (3rd party, in venv)
- `onnx_asr/loader.py:187-357` — `load_model()`, downloads files, creates ONNX sessions
- `onnx_asr/models/nemo.py:70-85` — `NemoConformerRnnt.__init__()` creates encoder/decoder sessions
- `onnx_asr/models/nemo.py:181-183` — `NemoConformerAED._get_excluded_providers()` excludes CoreML
- `onnx_asr/preprocessors/preprocessor.py` — Preprocessor, excludes CUDA
- `onnx_asr/onnx.py:81-101` — `update_onnx_providers()` filters provider list

### Client (wingman-client)
- The execution provider dropdown is in the STT settings UI — may need to filter options per model/platform

## Provider/model compatibility matrix

| Provider | NemoConformerTdt (Parakeet) | NemoConformerAED | Preprocessor |
|----------|---------------------------|------------------|-------------|
| CPU | Yes | Yes | Yes |
| DirectML | Yes | Yes | Yes |
| CUDA | Yes | Yes | **Excluded** |
| CoreML | **Crashes** (should exclude) | **Excluded** | ? |
| TensorRT | Excluded | Excluded | Excluded |
94 changes: 29 additions & 65 deletions providers/faster_whisper.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,29 +11,42 @@
)
from services.printr import Printr

MODELS_DIR = "faster-whisper-models"


class FasterWhisper:
def __init__(
self,
settings: FasterWhisperSettings,
app_root_path: str,
app_is_bundled: bool,
):
def __init__(self, settings: FasterWhisperSettings):
self.printr = Printr()
self.settings = settings
self.model: Optional[WhisperModel] = None

# move one dir up, out of _internal (if bundled)
app_dir = path.dirname(app_root_path) if app_is_bundled else app_root_path
self.models_dir = path.join(app_dir, MODELS_DIR)
def load(self, model_dir: str):
"""Load the FasterWhisper model. Called by SttProviderManager.

Args:
model_dir: Directory containing model files (from ModelDownloader).
"""
self.unload()

model_file = path.join(model_dir, self.settings.model_size)
model = model_file if path.exists(model_file) else self.settings.model_size

if self.settings.enable:
self.__update_model()
try:
self.model = WhisperModel(
model,
device=self.settings.device,
compute_type=self.settings.compute_type,
)
self.printr.print(
f"FasterWhisper initialized with model '{model}' (device: '{self.settings.device}').",
server_only=True,
color=LogType.POSITIVE,
)
except Exception as e:
self.printr.toast_error(
f"Failed to initialize FasterWhisper with model {model_file}. Error: {e}"
)

def __unload_model(self):
"""Unload the current model to free VRAM."""
def unload(self):
"""Unload the current model to free VRAM. Called by SttProviderManager."""
if self.model is not None:
self.printr.print(
"FasterWhisper: Unloading current model to free VRAM...",
Expand All @@ -42,48 +55,22 @@ def __unload_model(self):
del self.model
self.model = None

# Force garbage collection to release memory
gc.collect()

# Clear CUDA cache if using GPU
try:
import torch
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.synchronize()
except ImportError:
pass # torch not available, skip CUDA cleanup
pass
except Exception as e:
# any other CUDA-related cleanup error should not crash model reload
self.printr.print(
f"FasterWhisper: CUDA cleanup failed during model unload: {e}",
server_only=True,
color=LogType.WARNING,
)

def __update_model(self):
# Unload the existing model first to free VRAM
self.__unload_model()

model_file = path.join(self.models_dir, self.settings.model_size)
model = model_file if path.exists(model_file) else self.settings.model_size

try:
self.model = WhisperModel(
model,
device=self.settings.device,
compute_type=self.settings.compute_type,
)
self.printr.print(
f"FasterWhisper initialized with model '{model}' (device: '{self.settings.device}').",
server_only=True,
color=LogType.POSITIVE,
)
except Exception as e:
self.printr.toast_error(
f"Failed to initialize FasterWhisper with model {model_file}. Error: {e}"
)

def transcribe(
self,
config: FasterWhisperSttConfig,
Expand Down Expand Up @@ -127,28 +114,5 @@ def transcribe(

return None

def update_settings(self, settings: FasterWhisperSettings):
if self.settings == settings:
self.printr.print("FasterWhisper settings unchanged.", server_only=True)
return

was_enabled = self.settings.enable
self.settings = settings

if not settings.enable:
if was_enabled:
self.printr.print(
"FasterWhisper disabled, unloading model...",
server_only=True,
)
self.__unload_model()
return

self.printr.print(
"FasterWhisper settings updated, reloading model...",
server_only=True,
)
self.__update_model()

def validate(self, errors: list[WingmanInitializationError]):
pass
Loading
Loading