diff --git a/Installer/ElevenLabsKeyDlg.cpp b/Installer/ElevenLabsKeyDlg.cpp new file mode 100644 index 0000000..8891c7a --- /dev/null +++ b/Installer/ElevenLabsKeyDlg.cpp @@ -0,0 +1,83 @@ +#include "framework.h" +#include "Installer.h" +#include "RegKey.h" + +INT_PTR CALLBACK ElevenLabsKeyDlg(HWND hDlg, UINT message, WPARAM wParam, LPARAM lParam) +{ + UNREFERENCED_PARAMETER(lParam); + switch (message) + { + case WM_INITDIALOG: + { + RegKey key; + key.Open(HKEY_CURRENT_USER, L"Software\\NaturalVoiceSAPIAdapter\\Enumerator", KEY_QUERY_VALUE); + + SetDlgItemTextW(hDlg, IDC_ELEVENLABS_API_KEY, key.GetString(L"ElevenLabsApiKey").c_str()); + + // Populate model combobox + HWND hModel = GetDlgItem(hDlg, IDC_ELEVENLABS_MODEL); + const LPCWSTR models[] = { + L"eleven_multilingual_v2", + L"eleven_turbo_v2_5", + L"eleven_turbo_v2", + L"eleven_monolingual_v1", + }; + for (auto* m : models) + SendMessageW(hModel, CB_ADDSTRING, 0, (LPARAM)m); + + std::wstring curModel = key.GetString(L"ElevenLabsModel"); + if (curModel.empty()) curModel = L"eleven_multilingual_v2"; + int sel = (int)SendMessageW(hModel, CB_FINDSTRINGEXACT, (WPARAM)-1, (LPARAM)curModel.c_str()); + if (sel < 0) + { + // Model not in list — add it and select it + sel = (int)SendMessageW(hModel, CB_ADDSTRING, 0, (LPARAM)curModel.c_str()); + } + SendMessageW(hModel, CB_SETCURSEL, sel >= 0 ? sel : 0, 0); + + return TRUE; + } + + case WM_COMMAND: + switch (LOWORD(wParam)) + { + case IDOK: + { + RegKey key; + key.Create(HKEY_CURRENT_USER, L"Software\\NaturalVoiceSAPIAdapter\\Enumerator", KEY_SET_VALUE); + + WCHAR buf[512]; + GetDlgItemTextW(hDlg, IDC_ELEVENLABS_API_KEY, buf, 512); + key.SetString(L"ElevenLabsApiKey", buf); + + HWND hModel = GetDlgItem(hDlg, IDC_ELEVENLABS_MODEL); + int sel = (int)SendMessageW(hModel, CB_GETCURSEL, 0, 0); + if (sel >= 0) + { + SendMessageW(hModel, CB_GETLBTEXT, sel, (LPARAM)buf); + key.SetString(L"ElevenLabsModel", buf); + } + + EndDialog(hDlg, IDOK); + return TRUE; + } + case IDCANCEL: + EndDialog(hDlg, IDCANCEL); + return TRUE; + } + break; + + case WM_NOTIFY: + switch (((LPNMHDR)lParam)->code) + { + case NM_CLICK: + case NM_RETURN: + ShellExecuteW(nullptr, nullptr, + L"https://elevenlabs.io/app/settings/api-keys", + nullptr, nullptr, SW_SHOWNORMAL); + break; + } + break; + } + return (INT_PTR)FALSE; +} diff --git a/Installer/Installer.rc b/Installer/Installer.rc index 2f616fa..5818eb3 100644 Binary files a/Installer/Installer.rc and b/Installer/Installer.rc differ diff --git a/Installer/Installer.vcxproj b/Installer/Installer.vcxproj index ecab19f..d6b2c86 100644 --- a/Installer/Installer.vcxproj +++ b/Installer/Installer.vcxproj @@ -164,6 +164,8 @@ + + diff --git a/Installer/Installer.vcxproj.filters b/Installer/Installer.vcxproj.filters index a015e38..4a3fe89 100644 --- a/Installer/Installer.vcxproj.filters +++ b/Installer/Installer.vcxproj.filters @@ -38,6 +38,12 @@ Source Files + + Source Files + + + Source Files + Source Files diff --git a/Installer/MainDlg.cpp b/Installer/MainDlg.cpp index f7ff859..6eccb81 100644 --- a/Installer/MainDlg.cpp +++ b/Installer/MainDlg.cpp @@ -8,6 +8,8 @@ INT_PTR CALLBACK AboutDlg(HWND hDlg, UINT message, WPARAM wParam, LPARAM lParam); INT_PTR CALLBACK LangDlg(HWND hDlg, UINT message, WPARAM wParam, LPARAM lParam); INT_PTR CALLBACK AzureKeyDlg(HWND hDlg, UINT message, WPARAM wParam, LPARAM lParam); +INT_PTR CALLBACK PollyKeyDlg(HWND hDlg, UINT message, WPARAM wParam, LPARAM lParam); +INT_PTR CALLBACK ElevenLabsKeyDlg(HWND hDlg, UINT message, WPARAM wParam, LPARAM lParam); void Register(bool is64Bit); void Unregister(bool is64Bit); @@ -47,8 +49,12 @@ static void UpdateEnableStates(HWND hDlg) EnableWindow(GetDlgItem(hDlg, IDC_BROWSE_LOCAL_VOICE), localEnabled); BOOL edgeEnabled = IsDlgButtonChecked(hDlg, IDC_CHK_EDGE_VOICES) == BST_CHECKED; BOOL azureEnabled = IsDlgButtonChecked(hDlg, IDC_CHK_AZURE_VOICES) == BST_CHECKED; + BOOL pollyEnabled = IsDlgButtonChecked(hDlg, IDC_CHK_POLLY_VOICES) == BST_CHECKED; + BOOL elevenLabsEnabled = IsDlgButtonChecked(hDlg, IDC_CHK_ELEVENLABS_VOICES) == BST_CHECKED; EnableWindow(GetDlgItem(hDlg, IDC_SET_AZURE_KEY), azureEnabled); - BOOL onlineEnabled = edgeEnabled || azureEnabled; + EnableWindow(GetDlgItem(hDlg, IDC_SET_POLLY_KEY), pollyEnabled); + EnableWindow(GetDlgItem(hDlg, IDC_SET_ELEVENLABS_KEY), elevenLabsEnabled); + BOOL onlineEnabled = edgeEnabled || azureEnabled || pollyEnabled || elevenLabsEnabled; EnableWindow(GetDlgItem(hDlg, IDC_STATIC_INCLUDED_LANGUAGES), onlineEnabled); EnableWindow(GetDlgItem(hDlg, IDC_INCLUDED_LANGUAGES), onlineEnabled); EnableWindow(GetDlgItem(hDlg, IDC_CHANGE_LANGUAGES), onlineEnabled); @@ -66,7 +72,18 @@ static void UpdateDisplay(HWND hDlg) key.GetDword(L"NoEdgeVoices") ? BST_UNCHECKED : BST_CHECKED); CheckDlgButton(hDlg, IDC_CHK_AZURE_VOICES, key.GetDword(L"NoAzureVoices") - || (key.GetString(L"AzureVoiceKey").empty() && key.GetString(L"AzureVoiceRegion").empty()) + || key.GetString(L"AzureVoiceKey").empty() + || key.GetString(L"AzureVoiceRegion").empty() + ? BST_UNCHECKED : BST_CHECKED); + CheckDlgButton(hDlg, IDC_CHK_POLLY_VOICES, + key.GetDword(L"NoPollyVoices") + || key.GetString(L"PollyAccessKey").empty() + || key.GetString(L"PollySecretKey").empty() + || key.GetString(L"PollyRegion").empty() + ? BST_UNCHECKED : BST_CHECKED); + CheckDlgButton(hDlg, IDC_CHK_ELEVENLABS_VOICES, + key.GetDword(L"NoElevenLabsVoices") + || key.GetString(L"ElevenLabsApiKey").empty() ? BST_UNCHECKED : BST_CHECKED); SetDlgItemTextW(hDlg, IDC_LOCAL_VOICE_PATH, key.GetString(L"NarratorVoicePath").c_str()); @@ -157,6 +174,8 @@ static void SaveChanges(HWND hDlg) key.SetDword(L"NoNarratorVoices", IsDlgButtonChecked(hDlg, IDC_CHK_NARRATOR_VOICES) == BST_UNCHECKED); key.SetDword(L"NoEdgeVoices", IsDlgButtonChecked(hDlg, IDC_CHK_EDGE_VOICES) == BST_UNCHECKED); key.SetDword(L"NoAzureVoices", IsDlgButtonChecked(hDlg, IDC_CHK_AZURE_VOICES) == BST_UNCHECKED); + key.SetDword(L"NoPollyVoices", IsDlgButtonChecked(hDlg, IDC_CHK_POLLY_VOICES) == BST_UNCHECKED); + key.SetDword(L"NoElevenLabsVoices", IsDlgButtonChecked(hDlg, IDC_CHK_ELEVENLABS_VOICES) == BST_UNCHECKED); WCHAR path[MAX_PATH]; GetDlgItemTextW(hDlg, IDC_LOCAL_VOICE_PATH, path, MAX_PATH); key.SetString(L"NarratorVoicePath", path); @@ -212,6 +231,8 @@ INT_PTR CALLBACK MainDlg(HWND hDlg, UINT message, WPARAM wParam, LPARAM lParam) case IDC_CHK_NARRATOR_VOICES: case IDC_CHK_EDGE_VOICES: case IDC_CHK_AZURE_VOICES: + case IDC_CHK_POLLY_VOICES: + case IDC_CHK_ELEVENLABS_VOICES: UpdateEnableStates(hDlg); SaveChanges(hDlg); break; @@ -224,6 +245,14 @@ INT_PTR CALLBACK MainDlg(HWND hDlg, UINT message, WPARAM wParam, LPARAM lParam) case IDC_SET_AZURE_KEY: DialogBoxParamW(nullptr, MAKEINTRESOURCEW(IDD_AZUREKEY), hDlg, AzureKeyDlg, 0); break; + case IDC_SET_POLLY_KEY: + DialogBoxParamW(nullptr, MAKEINTRESOURCEW(IDD_POLLYKEY), hDlg, PollyKeyDlg, 0); + UpdateDisplay(hDlg); + break; + case IDC_SET_ELEVENLABS_KEY: + DialogBoxParamW(nullptr, MAKEINTRESOURCEW(IDD_ELEVENKEY), hDlg, ElevenLabsKeyDlg, 0); + UpdateDisplay(hDlg); + break; case IDC_CHANGE_LANGUAGES: DialogBoxParamW(nullptr, MAKEINTRESOURCEW(IDD_LANG), hDlg, LangDlg, 0); UpdateDisplay(hDlg); diff --git a/Installer/PollyKeyDlg.cpp b/Installer/PollyKeyDlg.cpp new file mode 100644 index 0000000..ab4288b --- /dev/null +++ b/Installer/PollyKeyDlg.cpp @@ -0,0 +1,79 @@ +#include "framework.h" +#include "Installer.h" +#include "RegKey.h" + +INT_PTR CALLBACK PollyKeyDlg(HWND hDlg, UINT message, WPARAM wParam, LPARAM lParam) +{ + UNREFERENCED_PARAMETER(lParam); + switch (message) + { + case WM_INITDIALOG: + { + RegKey key; + key.Open(HKEY_CURRENT_USER, L"Software\\NaturalVoiceSAPIAdapter\\Enumerator", KEY_QUERY_VALUE); + + SetDlgItemTextW(hDlg, IDC_POLLY_ACCESS_KEY, key.GetString(L"PollyAccessKey").c_str()); + SetDlgItemTextW(hDlg, IDC_POLLY_SECRET_KEY, key.GetString(L"PollySecretKey").c_str()); + SetDlgItemTextW(hDlg, IDC_POLLY_REGION, key.GetString(L"PollyRegion").c_str()); + + // Populate engine combobox + HWND hEngine = GetDlgItem(hDlg, IDC_POLLY_ENGINE); + const LPCWSTR engines[] = { L"generative", L"neural", L"long-form", L"standard" }; + for (auto* e : engines) + SendMessageW(hEngine, CB_ADDSTRING, 0, (LPARAM)e); + + std::wstring curEngine = key.GetString(L"PollyEngine"); + if (curEngine.empty()) curEngine = L"neural"; + int sel = (int)SendMessageW(hEngine, CB_FINDSTRINGEXACT, (WPARAM)-1, (LPARAM)curEngine.c_str()); + SendMessageW(hEngine, CB_SETCURSEL, sel >= 0 ? sel : 1 /*neural*/, 0); + + return TRUE; + } + + case WM_COMMAND: + switch (LOWORD(wParam)) + { + case IDOK: + { + RegKey key; + key.Create(HKEY_CURRENT_USER, L"Software\\NaturalVoiceSAPIAdapter\\Enumerator", KEY_SET_VALUE); + + WCHAR buf[512]; + GetDlgItemTextW(hDlg, IDC_POLLY_ACCESS_KEY, buf, 512); + key.SetString(L"PollyAccessKey", buf); + GetDlgItemTextW(hDlg, IDC_POLLY_SECRET_KEY, buf, 512); + key.SetString(L"PollySecretKey", buf); + GetDlgItemTextW(hDlg, IDC_POLLY_REGION, buf, 512); + key.SetString(L"PollyRegion", buf); + + HWND hEngine = GetDlgItem(hDlg, IDC_POLLY_ENGINE); + int sel = (int)SendMessageW(hEngine, CB_GETCURSEL, 0, 0); + if (sel >= 0) + { + SendMessageW(hEngine, CB_GETLBTEXT, sel, (LPARAM)buf); + key.SetString(L"PollyEngine", buf); + } + + EndDialog(hDlg, IDOK); + return TRUE; + } + case IDCANCEL: + EndDialog(hDlg, IDCANCEL); + return TRUE; + } + break; + + case WM_NOTIFY: + switch (((LPNMHDR)lParam)->code) + { + case NM_CLICK: + case NM_RETURN: + ShellExecuteW(nullptr, nullptr, + L"https://console.aws.amazon.com/iam/home#/users", + nullptr, nullptr, SW_SHOWNORMAL); + break; + } + break; + } + return (INT_PTR)FALSE; +} diff --git a/Installer/resource.h b/Installer/resource.h index 437534f..3078e98 100644 --- a/Installer/resource.h +++ b/Installer/resource.h @@ -1,6 +1,6 @@ //{{NO_DEPENDENCIES}} -// Microsoft Visual C++ ÉúģÉĩİüšŽÎÄžþĄĢ -// đĐ Installer.rc ĘđÓà +// Microsoft Visual C++ ïŋ―ïŋ―ïŋ―Éĩİïŋ―ïŋ―ïŋ―ïŋ―Äžïŋ―ïŋ―ïŋ― +// ïŋ―ïŋ― Installer.rc Ęđïŋ―ïŋ― // #define IDC_MYICON 2 #define IDD_MAIN 102 @@ -59,16 +59,30 @@ #define IDC_AZURE_REGION 1031 #define IDC_STATIC_INCLUDED_LANGUAGES 1032 #define IDC_LANG_MULTILINGUAL 1033 +#define IDC_CHK_POLLY_VOICES 1034 +#define IDC_SET_POLLY_KEY 1035 +#define IDC_POLLY_LINK 1036 +#define IDC_POLLY_ACCESS_KEY 1037 +#define IDC_POLLY_SECRET_KEY 1038 +#define IDC_POLLY_REGION 1039 +#define IDC_POLLY_ENGINE 1040 +#define IDD_POLLYKEY 141 +#define IDD_ELEVENKEY 142 +#define IDC_CHK_ELEVENLABS_VOICES 1041 +#define IDC_SET_ELEVENLABS_KEY 1042 +#define IDC_ELEVENLABS_LINK 1043 +#define IDC_ELEVENLABS_API_KEY 1044 +#define IDC_ELEVENLABS_MODEL 1045 #define IDC_STATIC -1 // Next default values for new objects -// +// #ifdef APSTUDIO_INVOKED #ifndef APSTUDIO_READONLY_SYMBOLS #define _APS_NO_MFC 1 -#define _APS_NEXT_RESOURCE_VALUE 141 +#define _APS_NEXT_RESOURCE_VALUE 143 #define _APS_NEXT_COMMAND_VALUE 32771 -#define _APS_NEXT_CONTROL_VALUE 1034 +#define _APS_NEXT_CONTROL_VALUE 1046 #define _APS_NEXT_SYMED_VALUE 110 #endif #endif diff --git a/NaturalVoiceSAPIAdapter/AmazonPollyAPI.cpp b/NaturalVoiceSAPIAdapter/AmazonPollyAPI.cpp new file mode 100644 index 0000000..a6f7a6d --- /dev/null +++ b/NaturalVoiceSAPIAdapter/AmazonPollyAPI.cpp @@ -0,0 +1,517 @@ +#include "pch.h" +#define ASIO_STANDALONE 1 +#include +#include +#include "AmazonPollyAPI.h" +#include "NetUtils.h" +#include "StrUtils.h" +#include "Logger.h" +#include +#include +#include +#include +#include + +// ───────────────────────────────────────────────────────────────────────────── +// Internal crypto helpers +// ───────────────────────────────────────────────────────────────────────────── + +static std::string HmacSha256Raw(std::string_view key, std::string_view data) +{ + unsigned char hash[32]; + unsigned int len = 32; + HMAC(EVP_sha256(), + key.data(), static_cast(key.size()), + reinterpret_cast(data.data()), data.size(), + hash, &len); + return std::string(reinterpret_cast(hash), len); +} + +static std::string Sha256Hex(std::string_view data) +{ + unsigned char hash[32]; + SHA256(reinterpret_cast(data.data()), data.size(), hash); + std::string out(64, '0'); + for (int i = 0; i < 32; ++i) + std::format_to(out.begin() + i * 2, "{:02x}", hash[i]); + return out; +} + +static std::string HmacSha256Hex(std::string_view key, std::string_view data) +{ + auto raw = HmacSha256Raw(key, data); + std::string out(64, '0'); + for (int i = 0; i < 32; ++i) + std::format_to(out.begin() + i * 2, "{:02x}", + static_cast(raw[i])); + return out; +} + +// ───────────────────────────────────────────────────────────────────────────── +// AWS Signature V4 +// ───────────────────────────────────────────────────────────────────────────── + +void AmazonPollyAPI::SetCredentials(std::string accessKeyId, + std::string secretKey, + std::string region) +{ + m_accessKeyId = std::move(accessKeyId); + m_secretKey = std::move(secretKey); + m_region = std::move(region); +} + +AmazonPollyAPI::SigV4Result AmazonPollyAPI::ComputeSigV4( + const std::string& method, + const std::string& path, + const std::string& query, + const std::string& host, + const std::string& body) const +{ + SYSTEMTIME st; + GetSystemTime(&st); + + const std::string date = std::format("{:04}{:02}{:02}", + st.wYear, st.wMonth, st.wDay); + const std::string datetime = std::format("{:04}{:02}{:02}T{:02}{:02}{:02}Z", + st.wYear, st.wMonth, st.wDay, st.wHour, st.wMinute, st.wSecond); + + const std::string bodyHash = Sha256Hex(body); + const std::string signedHdrs = "content-type;host;x-amz-date"; + const std::string service = "polly"; + + // Step 1 – canonical request + const std::string canonicalReq = + method + "\n" + + path + "\n" + + query + "\n" + + "content-type:application/json\n" + + "host:" + host + "\n" + + "x-amz-date:" + datetime + "\n" + + "\n" + + signedHdrs + "\n" + + bodyHash; + + // Step 2 – string to sign + const std::string credScope = date + "/" + m_region + "/" + service + "/aws4_request"; + const std::string strToSign = + "AWS4-HMAC-SHA256\n" + + datetime + "\n" + + credScope + "\n" + + Sha256Hex(canonicalReq); + + // Step 3 – derived signing key + const std::string kDate = HmacSha256Raw("AWS4" + m_secretKey, date); + const std::string kRegion = HmacSha256Raw(kDate, m_region); + const std::string kService = HmacSha256Raw(kRegion, service); + const std::string kSigning = HmacSha256Raw(kService,"aws4_request"); + + // Step 4 – signature + const std::string signature = HmacSha256Hex(kSigning, strToSign); + + // Step 5 – Authorization header + const std::string auth = + "AWS4-HMAC-SHA256 Credential=" + m_accessKeyId + "/" + credScope + + ", SignedHeaders=" + signedHdrs + + ", Signature=" + signature; + + return { datetime, auth }; +} + +// ───────────────────────────────────────────────────────────────────────────── +// Minimal HTTP/1.1 helpers (ASIO + OpenSSL, same stack as NetUtils) +// ───────────────────────────────────────────────────────────────────────────── + +struct ParsedHttpResponse +{ + int statusCode = 0; + std::map headers; + std::string body; // raw bytes (may be binary) +}; + +static ParsedHttpResponse ParseHttpResponse(const std::string& raw) +{ + ParsedHttpResponse res; + + const size_t headerEnd = raw.find("\r\n\r\n"); + if (headerEnd == std::string::npos) + throw std::runtime_error("Polly: invalid HTTP response (no header delimiter)"); + + // Status line + const size_t statusLineEnd = raw.find("\r\n"); + if (statusLineEnd > 9) + res.statusCode = std::stoi(raw.substr(9, 3)); + + // Headers + size_t pos = statusLineEnd + 2; + while (pos < headerEnd) + { + const size_t next = raw.find("\r\n", pos); + const size_t end = (next == std::string::npos) ? headerEnd : next; + const std::string line = raw.substr(pos, end - pos); + const size_t colon = line.find(':'); + if (colon != std::string::npos) + { + std::string k = line.substr(0, colon); + std::string v = line.substr(colon + 1); + if (!v.empty() && v.front() == ' ') v.erase(v.begin()); + // Lowercase key for easy lookup + for (char& c : k) c = static_cast(std::tolower(static_cast(c))); + res.headers[k] = std::move(v); + } + pos = end + 2; + } + + // Body: after \r\n\r\n + res.body = raw.substr(headerEnd + 4); + + // Dechunk if Transfer-Encoding: chunked + auto it = res.headers.find("transfer-encoding"); + if (it != res.headers.end() && + it->second.find("chunked") != std::string::npos) + { + std::string dechunked; + size_t p = 0; + while (p < res.body.size()) + { + // Chunk size line + const size_t nl = res.body.find("\r\n", p); + if (nl == std::string::npos) break; + const size_t chunkSize = std::stoul(res.body.substr(p, nl - p), nullptr, 16); + if (chunkSize == 0) break; + p = nl + 2; + if (p + chunkSize > res.body.size()) break; + dechunked.append(res.body, p, chunkSize); + p += chunkSize + 2; // skip trailing \r\n + } + res.body = std::move(dechunked); + } + + return res; +} + +// Open an SSL stream to host:443 and run the lambda with it. +// Registers a stop_callback that closes the socket on cancellation. +template +static auto WithSslStream(const std::string& host, + std::stop_token stopToken, + Func&& fn) + -> decltype(fn(std::declval&>())) +{ + asio::io_context ioctx; + asio::ssl::context sslctx(asio::ssl::context::sslv23_client); + asio::ssl::stream stream(ioctx, sslctx); + + // Cancel via socket close when stop is requested + std::stop_callback stopCb(stopToken, [&stream]() { + asio::error_code ec; + stream.lowest_layer().close(ec); + }); + + if (stopToken.stop_requested()) + return {}; + + auto resolved = asio::ip::tcp::resolver(ioctx).resolve(host, "443"); + stream.next_layer().connect(*resolved); + stream.handshake(asio::ssl::stream_base::client); + + return fn(stream); +} + +static std::string HttpsRequest(const std::string& method, + const std::string& host, + const std::string& pathAndQuery, + const std::string& body, + const std::string& datetime, + const std::string& authorization, + std::stop_token stopToken = {}) +{ + return WithSslStream(host, stopToken, + [&](asio::ssl::stream& stream) -> std::string + { + std::string req = + method + " " + pathAndQuery + " HTTP/1.1\r\n" + "Host: " + host + "\r\n" + "Content-Type: application/json\r\n" + "X-Amz-Date: " + datetime + "\r\n" + "Authorization: " + authorization + "\r\n" + "Connection: close\r\n"; + if (!body.empty()) + req += "Content-Length: " + std::to_string(body.size()) + "\r\n"; + req += "\r\n"; + req += body; + + asio::write(stream, asio::buffer(req)); + + std::string response; + asio::error_code ec; + asio::read(stream, asio::dynamic_string_buffer(response), ec); + + if (ec != asio::error::eof && + ec != asio::ssl::error::stream_truncated && + ec) + { + if (stopToken.stop_requested()) return {}; + asio::detail::throw_error(ec); + } + return response; + }); +} + +static std::string UrlEncodeRFC3986(std::string_view value) +{ + static constexpr char hex[] = "0123456789ABCDEF"; + std::string encoded; + encoded.reserve(value.size()); + for (unsigned char c : value) + { + if ((c >= 'A' && c <= 'Z') || + (c >= 'a' && c <= 'z') || + (c >= '0' && c <= '9') || + c == '-' || c == '_' || c == '.' || c == '~') + { + encoded.push_back(static_cast(c)); + } + else + { + encoded.push_back('%'); + encoded.push_back(hex[c >> 4]); + encoded.push_back(hex[c & 0x0F]); + } + } + return encoded; +} + +// ───────────────────────────────────────────────────────────────────────────── +// SSML preprocessing for Polly +// Polly does not support (uses instead), +// does not support , and has very limited +// support (none at all for generative/long-form engines). +// We do minimal string-level cleanup here so that BuildSSML output is accepted. +// ───────────────────────────────────────────────────────────────────────────── + +static std::string PreprocessSsmlForPolly(std::string ssml) +{ + // Remove tags – replace with nothing. + // They are always self-closing and generated by BuildSSML in one consistent form. + for (;;) + { + const size_t start = ssml.find("", start); + if (end == std::string::npos) break; + ssml.erase(start, end - start + 2); + } + + // Remove ... – keep inner text. + for (;;) + { + // Look for the sapi phoneme opening tag + const size_t tagStart = ssml.find("', tagStart); + if (tagEnd == std::string::npos) break; + + const size_t closeTag = ssml.find("", tagEnd); + if (closeTag == std::string::npos) + { + // Malformed – just remove the opening tag + ssml.erase(tagStart, tagEnd - tagStart + 1); + } + else + { + // Keep the inner text, remove both tags + const std::string innerText = ssml.substr(tagEnd + 1, closeTag - tagEnd - 1); + ssml.replace(tagStart, closeTag - tagStart + 10 /**/, innerText); + } + } + + // Remove ... tags – keep inner text. + // Polly support by engine: + // generative / long-form : no support at all → always errors + // neural : rate and volume only; pitch causes 400 + // standard : rate, pitch, volume – but values from SAPI rarely + // translate correctly anyway + // Stripping universally is safer than receiving a 400 "Unsupported feature". + for (;;) + { + const size_t tagStart = ssml.find("', tagStart); + if (tagEnd == std::string::npos) break; + + // Self-closing – just remove the tag + if (ssml[tagEnd - 1] == '/') + { + ssml.erase(tagStart, tagEnd - tagStart + 1); + continue; + } + + const size_t closeTag = ssml.find("", tagEnd); + if (closeTag == std::string::npos) + { + // Malformed – just remove the opening tag + ssml.erase(tagStart, tagEnd - tagStart + 1); + } + else + { + // Keep the inner text, remove both tags + const std::string innerText = ssml.substr(tagEnd + 1, closeTag - tagEnd - 1); + ssml.replace(tagStart, closeTag - tagStart + 10 /**/, innerText); + } + } + + return ssml; +} + +// ───────────────────────────────────────────────────────────────────────────── +// SpeakAsync / DoSpeakAsync +// ───────────────────────────────────────────────────────────────────────────── + +std::future AmazonPollyAPI::SpeakAsync(const std::wstring& ssml, + const std::string& voiceId, + const std::string& engine) +{ + m_ssml = ssml; // wstring_view into TTSEngine::m_ssml + m_voiceId = voiceId; + m_engine = engine; + m_waveBytesWritten = 0; + m_stopSource = {}; + + return std::async(std::launch::async, + std::bind(&AmazonPollyAPI::DoSpeakAsync, this)); +} + +void AmazonPollyAPI::Stop() +{ + m_stopSource.request_stop(); +} + +void AmazonPollyAPI::DoSpeakAsync() +{ + const std::string host = "polly." + m_region + ".amazonaws.com"; + const std::string path = "/v1/speech"; + + // Prepare SSML: convert to UTF-8 and strip Polly-incompatible tags + const std::string ssmlUtf8 = PreprocessSsmlForPolly(WStringToUTF8(m_ssml)); + + const nlohmann::json bodyJson = { + {"Engine", m_engine}, + {"OutputFormat", "mp3"}, + {"SampleRate", "24000"}, + {"Text", ssmlUtf8}, + {"TextType", "ssml"}, + {"VoiceId", m_voiceId} + }; + const std::string body = bodyJson.dump(); + + LogDebug("Polly: Speak request: engine={} voice={}", m_engine, m_voiceId); + LogTrace("Polly: Request body: {}", body); + + const auto sig = ComputeSigV4("POST", path, "", host, body); + + if (m_stopSource.stop_requested()) return; + + const std::string rawResponse = HttpsRequest( + "POST", host, path, body, + sig.datetime, sig.authorization, + m_stopSource.get_token()); + + if (m_stopSource.stop_requested() || rawResponse.empty()) return; + + const auto resp = ParseHttpResponse(rawResponse); + + if (resp.statusCode != 200) + { + LogTrace("Polly: Error response body: {}", resp.body.substr(0, 500)); + std::string errMsg; + try + { + auto errJson = nlohmann::json::parse(resp.body); + errMsg = errJson.value("message", resp.body.substr(0, 300)); + } + catch (...) { errMsg = resp.body.substr(0, 300); } + throw std::runtime_error( + "Polly API error " + std::to_string(resp.statusCode) + ": " + errMsg); + } + + LogDebug("Polly: Response received, {} bytes of MP3", resp.body.size()); + + // Decode MP3 -> PCM -> deliver via AudioReceivedCallback. Keep chunks small + // to avoid large blocking SAPI writes in game hosts. + Mp3Decoder mp3; + constexpr size_t MP3_DECODE_CHUNK_SIZE = 2048; + for (size_t offset = 0; offset < resp.body.size();) + { + if (m_stopSource.stop_requested()) return; + + const size_t chunkSize = std::min(MP3_DECODE_CHUNK_SIZE, resp.body.size() - offset); + mp3.Convert( + reinterpret_cast(resp.body.data() + offset), + chunkSize, + [this](BYTE* data, uint32_t size) + { + if (!AudioReceivedCallback || m_stopSource.stop_requested()) return; + const int written = AudioReceivedCallback(data, size); + m_waveBytesWritten += static_cast(written); + }); + offset += chunkSize; + } + + if (SessionEndCallback) + SessionEndCallback(m_waveBytesWritten); +} + +// ───────────────────────────────────────────────────────────────────────────── +// Voice list +// ───────────────────────────────────────────────────────────────────────────── + +nlohmann::json AmazonPollyAPI::GetVoiceList(const std::string& accessKeyId, + const std::string& secretKey, + const std::string& region, + const std::string& engine) +{ + AmazonPollyAPI tmp; + tmp.SetCredentials(accessKeyId, secretKey, region); + + const std::string host = "polly." + region + ".amazonaws.com"; + const std::string path = "/v1/voices"; + nlohmann::json allVoices = nlohmann::json::array(); + std::string nextToken; + + for (;;) + { + std::string query = "Engine=" + UrlEncodeRFC3986(engine); + if (!nextToken.empty()) + query += "&NextToken=" + UrlEncodeRFC3986(nextToken); + + const auto sig = tmp.ComputeSigV4("GET", path, query, host, ""); + + const std::string rawResponse = HttpsRequest( + "GET", host, path + "?" + query, "", + sig.datetime, sig.authorization); + + const auto resp = ParseHttpResponse(rawResponse); + + if (resp.statusCode != 200) + { + LogTrace("Polly: Voice list error response: {}", resp.body.substr(0, 500)); + throw std::runtime_error( + "Polly voice list error " + std::to_string(resp.statusCode) + + ": " + resp.body.substr(0, 200)); + } + + const auto json = nlohmann::json::parse(resp.body); + LogTrace("Polly: Voice list response page: {}", resp.body.substr(0, 2000)); + + for (const auto& voice : json.at("Voices")) + allVoices.push_back(voice); + + if (json.contains("NextToken") && json["NextToken"].is_string()) + nextToken = json["NextToken"].get(); + else + break; + } + + LogTrace("Polly: Voice list fetched: {} voices for engine={}", allVoices.size(), engine); + return allVoices; +} diff --git a/NaturalVoiceSAPIAdapter/AmazonPollyAPI.h b/NaturalVoiceSAPIAdapter/AmazonPollyAPI.h new file mode 100644 index 0000000..ded9fc5 --- /dev/null +++ b/NaturalVoiceSAPIAdapter/AmazonPollyAPI.h @@ -0,0 +1,67 @@ +#pragma once +#include +#include +#include +#include +#include +#include "Mp3Decoder.h" +#include + +// Amazon Polly TTS REST API client. +// Interface mirrors SpeechRestAPI so TTSEngine can handle both uniformly. +class AmazonPollyAPI +{ +public: + // Audio/event callbacks – same signatures as SpeechRestAPI + std::function AudioReceivedCallback; + std::function WordBoundaryCallback; + std::function SentenceBoundaryCallback; + std::function BookmarkCallback; + std::function SessionEndCallback; + + void SetCredentials(std::string accessKeyId, std::string secretKey, std::string region); + + // ssml – SSML built by BuildSSML(); must remain valid until the future is done + // voiceId – Polly voice ID, e.g. "Joanna" + // engine – "neural" | "standard" | "long-form" | "generative" + std::future SpeakAsync(const std::wstring& ssml, + const std::string& voiceId, + const std::string& engine); + void Stop(); + + uint64_t GetWaveBytesWritten() const noexcept { return m_waveBytesWritten; } + + // Fetch the list of available voices directly from Polly (no local cache). + // Returns a JSON array in the same shape as the Polly /v1/voices response. + static nlohmann::json GetVoiceList(const std::string& accessKeyId, + const std::string& secretKey, + const std::string& region, + const std::string& engine = "neural"); + +private: + std::string m_accessKeyId; + std::string m_secretKey; + std::string m_region; + + std::wstring_view m_ssml; // view into TTSEngine::m_ssml – valid while future runs + std::string m_voiceId; + std::string m_engine; + + std::stop_source m_stopSource; + uint64_t m_waveBytesWritten = 0; + + // AWS SigV4 result + struct SigV4Result + { + std::string datetime; // YYYYMMDDTHHmmssZ + std::string authorization; // full Authorization header value + }; + + SigV4Result ComputeSigV4(const std::string& method, + const std::string& path, + const std::string& query, + const std::string& host, + const std::string& body) const; + + void DoSpeakAsync(); +}; diff --git a/NaturalVoiceSAPIAdapter/ElevenLabsAPI.cpp b/NaturalVoiceSAPIAdapter/ElevenLabsAPI.cpp new file mode 100644 index 0000000..a4c41b6 --- /dev/null +++ b/NaturalVoiceSAPIAdapter/ElevenLabsAPI.cpp @@ -0,0 +1,470 @@ +#include "pch.h" +#define ASIO_STANDALONE 1 +#include +#include +#include "ElevenLabsAPI.h" +#include "StrUtils.h" +#include "Logger.h" +#include + +static constexpr const char* ELEVENLABS_HOST = "api.elevenlabs.io"; + +// ───────────────────────────────────────────────────────────────────────────── +// Minimal HTTP/1.1 helpers (same ASIO + OpenSSL stack as AmazonPollyAPI) +// ───────────────────────────────────────────────────────────────────────────── + +struct EL_ParsedHttpResponse +{ + int statusCode = 0; + std::string body; +}; + +static EL_ParsedHttpResponse EL_ParseHttpResponse(const std::string& raw) +{ + EL_ParsedHttpResponse res; + + const size_t headerEnd = raw.find("\r\n\r\n"); + if (headerEnd == std::string::npos) + throw std::runtime_error("ElevenLabs: invalid HTTP response (no header delimiter)"); + + // Status line: "HTTP/1.1 200 OK" + if (raw.size() > 12) + res.statusCode = std::stoi(raw.substr(9, 3)); + + // Collect headers needed for dechunking + std::string transferEncoding; + size_t pos = raw.find("\r\n") + 2; + while (pos < headerEnd) + { + const size_t next = raw.find("\r\n", pos); + const size_t end = (next == std::string::npos) ? headerEnd : next; + const std::string line = raw.substr(pos, end - pos); + const size_t colon = line.find(':'); + if (colon != std::string::npos) + { + std::string k = line.substr(0, colon); + std::string v = line.substr(colon + 1); + if (!v.empty() && v.front() == ' ') v.erase(v.begin()); + for (char& c : k) c = static_cast(std::tolower(static_cast(c))); + if (k == "transfer-encoding") transferEncoding = v; + } + pos = end + 2; + } + + res.body = raw.substr(headerEnd + 4); + + // Dechunk if Transfer-Encoding: chunked + if (transferEncoding.find("chunked") != std::string::npos) + { + std::string dechunked; + size_t p = 0; + while (p < res.body.size()) + { + const size_t nl = res.body.find("\r\n", p); + if (nl == std::string::npos) break; + const size_t chunkSize = std::stoul(res.body.substr(p, nl - p), nullptr, 16); + if (chunkSize == 0) break; + p = nl + 2; + if (p + chunkSize > res.body.size()) break; + dechunked.append(res.body, p, chunkSize); + p += chunkSize + 2; + } + res.body = std::move(dechunked); + } + + return res; +} + +template +static auto EL_WithSslStream(const std::string& host, + std::stop_token stopToken, + Func&& fn) + -> decltype(fn(std::declval&>())) +{ + asio::io_context ioctx; + asio::ssl::context sslctx(asio::ssl::context::sslv23_client); + asio::ssl::stream stream(ioctx, sslctx); + + std::stop_callback stopCb(stopToken, [&stream]() { + asio::error_code ec; + stream.lowest_layer().close(ec); + }); + + if (stopToken.stop_requested()) return {}; + + auto resolved = asio::ip::tcp::resolver(ioctx).resolve(host, "443"); + stream.next_layer().connect(*resolved); + stream.handshake(asio::ssl::stream_base::client); + + return fn(stream); +} + +static std::string EL_HttpsRequest(const std::string& method, + const std::string& host, + const std::string& pathAndQuery, + const std::string& body, + const std::string& apiKey, + std::stop_token stopToken = {}) +{ + return EL_WithSslStream(host, stopToken, + [&](asio::ssl::stream& stream) -> std::string + { + std::string req = + method + " " + pathAndQuery + " HTTP/1.1\r\n" + "Host: " + host + "\r\n" + "xi-api-key: " + apiKey + "\r\n" + "Content-Type: application/json\r\n" + "Connection: close\r\n"; + if (!body.empty()) + req += "Content-Length: " + std::to_string(body.size()) + "\r\n"; + req += "\r\n"; + req += body; + + asio::write(stream, asio::buffer(req)); + + std::string response; + asio::error_code ec; + asio::read(stream, asio::dynamic_string_buffer(response), ec); + + if (ec != asio::error::eof && + ec != asio::ssl::error::stream_truncated && + ec) + { + if (stopToken.stop_requested()) return {}; + asio::detail::throw_error(ec); + } + return response; + }); +} + +static void EL_AppendUtf8Codepoint(std::string& out, uint32_t cp) +{ + if (cp <= 0x7F) + { + out.push_back(static_cast(cp)); + } + else if (cp <= 0x7FF) + { + out.push_back(static_cast(0xC0 | (cp >> 6))); + out.push_back(static_cast(0x80 | (cp & 0x3F))); + } + else if (cp <= 0xFFFF) + { + out.push_back(static_cast(0xE0 | (cp >> 12))); + out.push_back(static_cast(0x80 | ((cp >> 6) & 0x3F))); + out.push_back(static_cast(0x80 | (cp & 0x3F))); + } + else if (cp <= 0x10FFFF) + { + out.push_back(static_cast(0xF0 | (cp >> 18))); + out.push_back(static_cast(0x80 | ((cp >> 12) & 0x3F))); + out.push_back(static_cast(0x80 | ((cp >> 6) & 0x3F))); + out.push_back(static_cast(0x80 | (cp & 0x3F))); + } +} + +static bool EL_TryParseEntityCodepoint(std::string_view entity, uint32_t& cp) +{ + if (entity.size() < 2 || entity[0] != '#') + return false; + + const bool isHex = entity.size() >= 3 && (entity[1] == 'x' || entity[1] == 'X'); + const size_t firstDigit = isHex ? 2 : 1; + if (firstDigit >= entity.size()) + return false; + + cp = 0; + for (size_t i = firstDigit; i < entity.size(); ++i) + { + unsigned digit; + const char c = entity[i]; + if (c >= '0' && c <= '9') + digit = c - '0'; + else if (isHex && c >= 'a' && c <= 'f') + digit = c - 'a' + 10; + else if (isHex && c >= 'A' && c <= 'F') + digit = c - 'A' + 10; + else + return false; + + if ((!isHex && digit >= 10) || cp > 0x10FFFF / (isHex ? 16 : 10)) + return false; + + cp = cp * (isHex ? 16 : 10) + digit; + } + + return cp <= 0x10FFFF && !(cp >= 0xD800 && cp <= 0xDFFF); +} + +static std::string EL_DecodeXmlEntities(std::string_view text) +{ + std::string decoded; + decoded.reserve(text.size()); + + for (size_t i = 0; i < text.size(); ++i) + { + if (text[i] != '&') + { + decoded.push_back(text[i]); + continue; + } + + const size_t semi = text.find(';', i + 1); + if (semi == std::string_view::npos || semi - i > 16) + { + decoded.push_back(text[i]); + continue; + } + + const std::string_view entity = text.substr(i + 1, semi - i - 1); + if (entity == "amp") + decoded.push_back('&'); + else if (entity == "lt") + decoded.push_back('<'); + else if (entity == "gt") + decoded.push_back('>'); + else if (entity == "quot") + decoded.push_back('"'); + else if (entity == "apos") + decoded.push_back('\''); + else + { + uint32_t cp; + if (!EL_TryParseEntityCodepoint(entity, cp)) + { + decoded.append(text.data() + i, semi - i + 1); + i = semi; + continue; + } + EL_AppendUtf8Codepoint(decoded, cp); + } + + i = semi; + } + + return decoded; +} + +static std::string EL_UrlEncode(std::string_view value) +{ + static constexpr char hex[] = "0123456789ABCDEF"; + std::string encoded; + encoded.reserve(value.size()); + for (unsigned char c : value) + { + if ((c >= 'A' && c <= 'Z') || + (c >= 'a' && c <= 'z') || + (c >= '0' && c <= '9') || + c == '-' || c == '_' || c == '.' || c == '~') + { + encoded.push_back(static_cast(c)); + } + else + { + encoded.push_back('%'); + encoded.push_back(hex[c >> 4]); + encoded.push_back(hex[c & 0x0F]); + } + } + return encoded; +} + +// ───────────────────────────────────────────────────────────────────────────── +// SSML → plain text +// ElevenLabs HTTP endpoint does not support SSML. +// Strip all XML tags and normalize whitespace. +// ───────────────────────────────────────────────────────────────────────────── + +static std::string SsmlToPlainText(std::wstring_view ssml) +{ + const std::string utf8 = WStringToUTF8(ssml); + std::string result; + result.reserve(utf8.size()); + + bool inTag = false; + for (unsigned char c : utf8) + { + if (c == '<') { inTag = true; continue; } + if (c == '>') { inTag = false; continue; } + if (!inTag) result += static_cast(c); + } + + const std::string decoded = EL_DecodeXmlEntities(result); + + // Collapse runs of whitespace to a single space and trim ends + std::string normalized; + normalized.reserve(decoded.size()); + bool lastWasSpace = true; // true trims leading whitespace + for (unsigned char c : decoded) + { + if (c == ' ' || c == '\t' || c == '\n' || c == '\r') + { + if (!lastWasSpace) { normalized += ' '; lastWasSpace = true; } + } + else + { + normalized += static_cast(c); + lastWasSpace = false; + } + } + if (!normalized.empty() && normalized.back() == ' ') + normalized.pop_back(); + + return normalized; +} + +// ───────────────────────────────────────────────────────────────────────────── +// Extract error message from ElevenLabs error JSON: +// {"detail": {"message": "..."}} or {"detail": "..."} +// ───────────────────────────────────────────────────────────────────────────── + +static std::string EL_ExtractErrorMessage(const std::string& body) +{ + try + { + auto j = nlohmann::json::parse(body); + if (j.contains("detail")) + { + auto& detail = j["detail"]; + if (detail.is_string()) + return detail.get(); + if (detail.is_object() && detail.contains("message")) + return detail["message"].get(); + } + return body.substr(0, 300); + } + catch (...) { return body.substr(0, 300); } +} + +// ───────────────────────────────────────────────────────────────────────────── +// Credentials +// ───────────────────────────────────────────────────────────────────────────── + +void ElevenLabsAPI::SetCredentials(std::string apiKey, std::string model) +{ + m_apiKey = std::move(apiKey); + m_model = std::move(model); +} + +// ───────────────────────────────────────────────────────────────────────────── +// SpeakAsync / DoSpeakAsync +// ───────────────────────────────────────────────────────────────────────────── + +std::future ElevenLabsAPI::SpeakAsync(const std::wstring& ssml, + const std::string& voiceId) +{ + m_ssml = ssml; + m_voiceId = voiceId; + m_waveBytesWritten = 0; + m_stopSource = {}; + + return std::async(std::launch::async, + std::bind(&ElevenLabsAPI::DoSpeakAsync, this)); +} + +void ElevenLabsAPI::Stop() +{ + m_stopSource.request_stop(); +} + +void ElevenLabsAPI::DoSpeakAsync() +{ + const std::string path = + "/v1/text-to-speech/" + m_voiceId + "?output_format=pcm_24000"; + + const std::string text = SsmlToPlainText(m_ssml); + if (text.empty()) + return; // nothing to speak + + const nlohmann::json bodyJson = { + {"text", text}, + {"model_id", m_model} + }; + const std::string body = bodyJson.dump(); + + LogDebug("ElevenLabs: Speak request: model={} voice={}", m_model, m_voiceId); + LogTrace("ElevenLabs: Request body: {}", body); + + if (m_stopSource.stop_requested()) return; + + const std::string rawResponse = EL_HttpsRequest( + "POST", ELEVENLABS_HOST, path, body, m_apiKey, + m_stopSource.get_token()); + + if (m_stopSource.stop_requested() || rawResponse.empty()) return; + + const auto resp = EL_ParseHttpResponse(rawResponse); + + if (resp.statusCode != 200) + { + LogTrace("ElevenLabs: Error response body: {}", resp.body.substr(0, 500)); + throw std::runtime_error( + "ElevenLabs API error " + std::to_string(resp.statusCode) + + ": " + EL_ExtractErrorMessage(resp.body)); + } + + LogDebug("ElevenLabs: Response received, {} bytes of PCM", resp.body.size()); + + // PCM 24kHz 16-bit mono – deliver directly without decoding + if (!AudioReceivedCallback || resp.body.empty()) return; + + const auto* data = reinterpret_cast(resp.body.data()); + const auto size = static_cast(resp.body.size()); + + const int written = AudioReceivedCallback(const_cast(data), size); + m_waveBytesWritten += static_cast(written); + + if (SessionEndCallback) + SessionEndCallback(m_waveBytesWritten); +} + +// ───────────────────────────────────────────────────────────────────────────── +// Voice list (GET /v2/voices, paginated) +// ───────────────────────────────────────────────────────────────────────────── + +nlohmann::json ElevenLabsAPI::GetVoiceList(const std::string& apiKey) +{ + nlohmann::json allVoices = nlohmann::json::array(); + std::string nextPageToken; + + for (;;) + { + std::string path = "/v2/voices?page_size=100&include_total_count=false"; + if (!nextPageToken.empty()) + path += "&next_page_token=" + EL_UrlEncode(nextPageToken); + + const std::string rawResponse = EL_HttpsRequest( + "GET", ELEVENLABS_HOST, path, "", apiKey); + + if (rawResponse.empty()) + throw std::runtime_error("ElevenLabs: empty response fetching voice list"); + + const auto resp = EL_ParseHttpResponse(rawResponse); + + if (resp.statusCode != 200) + { + LogTrace("ElevenLabs: Voice list error response: {}", resp.body.substr(0, 500)); + throw std::runtime_error( + "ElevenLabs voice list error " + std::to_string(resp.statusCode) + + ": " + EL_ExtractErrorMessage(resp.body)); + } + + const auto json = nlohmann::json::parse(resp.body); + LogTrace("ElevenLabs: Voice list page response (first 2000 chars): {}", + resp.body.substr(0, 2000)); + + for (const auto& voice : json.at("voices")) + allVoices.push_back(voice); + + const bool hasMore = json.value("has_more", false); + if (!hasMore) break; + + if (json.contains("next_page_token") && json["next_page_token"].is_string()) + nextPageToken = json["next_page_token"].get(); + else + break; + } + + LogDebug("ElevenLabs: Voice list fetched: {} voices total", allVoices.size()); + return allVoices; +} diff --git a/NaturalVoiceSAPIAdapter/ElevenLabsAPI.h b/NaturalVoiceSAPIAdapter/ElevenLabsAPI.h new file mode 100644 index 0000000..46eb62b --- /dev/null +++ b/NaturalVoiceSAPIAdapter/ElevenLabsAPI.h @@ -0,0 +1,48 @@ +#pragma once +#include +#include +#include +#include +#include +#include + +// ElevenLabs TTS REST API client. +// Authentication: xi-api-key HTTP header. +// Output audio: PCM 24 kHz 16-bit mono (output_format=pcm_24000) – no decoding needed. +// SSML: not supported by ElevenLabs; all XML tags are stripped, plain text is sent. +class ElevenLabsAPI +{ +public: + // Audio output callback – returns number of bytes consumed. + std::function AudioReceivedCallback; + + // Called when synthesis finishes, with total PCM bytes written. + std::function SessionEndCallback; + + void SetCredentials(std::string apiKey, std::string model); + + // ssml – SSML built by BuildSSML(); XML tags are stripped, plain text is sent. + // The referenced wstring must remain valid until the returned future completes. + // voiceId – ElevenLabs voice ID, e.g. "JBFqnCBsd6RMkjVDRZzb" + std::future SpeakAsync(const std::wstring& ssml, + const std::string& voiceId); + void Stop(); + + uint64_t GetWaveBytesWritten() const noexcept { return m_waveBytesWritten; } + + // Fetch all voices for the given API key (handles pagination automatically). + // Returns a JSON array of voice objects matching the /v2/voices response schema. + static nlohmann::json GetVoiceList(const std::string& apiKey); + +private: + std::string m_apiKey; + std::string m_model; + + std::wstring_view m_ssml; // view into TTSEngine::m_ssml – valid while future runs + std::string m_voiceId; + + std::stop_source m_stopSource; + uint64_t m_waveBytesWritten = 0; + + void DoSpeakAsync(); +}; diff --git a/NaturalVoiceSAPIAdapter/Mp3Decoder.cpp b/NaturalVoiceSAPIAdapter/Mp3Decoder.cpp index 97709f5..e01ebc6 100644 --- a/NaturalVoiceSAPIAdapter/Mp3Decoder.cpp +++ b/NaturalVoiceSAPIAdapter/Mp3Decoder.cpp @@ -128,7 +128,7 @@ void Mp3Decoder::Init(const BYTE* pMp3Chunk, DWORD cbChunkSize) MMRESULT mmr = acmStreamOpen(&m_hAcm, nullptr, &mp3fmt.wfx, &m_wavefmt, nullptr, 0, 0, 0); if (mmr) throw std::system_error(mmr, mci_category()); - m_cbMp3Buf = cbChunkSize; + m_cbMp3Buf = std::max(cbChunkSize, 16384); m_pMp3Buf = std::make_unique_for_overwrite(m_cbMp3Buf); mmr = acmStreamSize(m_hAcm, m_cbMp3Buf, &m_cbWavBuf, ACM_STREAMSIZEF_SOURCE); diff --git a/NaturalVoiceSAPIAdapter/NaturalVoiceSAPIAdapter.vcxproj b/NaturalVoiceSAPIAdapter/NaturalVoiceSAPIAdapter.vcxproj index 1c44094..070f5c4 100644 --- a/NaturalVoiceSAPIAdapter/NaturalVoiceSAPIAdapter.vcxproj +++ b/NaturalVoiceSAPIAdapter/NaturalVoiceSAPIAdapter.vcxproj @@ -392,6 +392,8 @@ + + @@ -421,6 +423,8 @@ + + false diff --git a/NaturalVoiceSAPIAdapter/NaturalVoiceSAPIAdapter.vcxproj.filters b/NaturalVoiceSAPIAdapter/NaturalVoiceSAPIAdapter.vcxproj.filters index 20d9b90..573b9c7 100644 --- a/NaturalVoiceSAPIAdapter/NaturalVoiceSAPIAdapter.vcxproj.filters +++ b/NaturalVoiceSAPIAdapter/NaturalVoiceSAPIAdapter.vcxproj.filters @@ -49,6 +49,12 @@ Generated Files + + Header Files + + + Header Files + Header Files\Exceptions @@ -174,6 +180,12 @@ Source Files\SpeechRestAPI + + Source Files + + + Source Files + diff --git a/NaturalVoiceSAPIAdapter/SpeechRestAPI.cpp b/NaturalVoiceSAPIAdapter/SpeechRestAPI.cpp index 2f7265a..639a275 100644 --- a/NaturalVoiceSAPIAdapter/SpeechRestAPI.cpp +++ b/NaturalVoiceSAPIAdapter/SpeechRestAPI.cpp @@ -9,6 +9,13 @@ std::unique_ptr g_pConnectionPool; static std::once_flag s_initOnce; +static inline DWORD _GetTickCount() +{ +#pragma warning (disable: 28159) + return GetTickCount(); +#pragma warning (default: 28159) +} + static std::string MakeRandomUuid() { GUID guid; @@ -50,6 +57,8 @@ std::future SpeechRestAPI::SpeakAsync(const std::wstring& ssml) m_stopSource = {}; m_firstDataReceived = false; m_allDataReceived = false; + m_lastBinaryMessageTicks = 0; + m_lastMp3ProcessTicks = 0; auto fut = std::async(std::launch::async, std::bind(&SpeechRestAPI::DoSpeakAsync, this)); @@ -191,6 +200,15 @@ void SpeechRestAPI::Mp3ProcessLoop(BlockingQueue& queue, std::stop_ if (!msg.has_value()) return; + DWORD now = _GetTickCount(); + if (m_lastMp3ProcessTicks != 0) + { + DWORD gapMs = now - m_lastMp3ProcessTicks; + if (gapMs > 500) + LogDebug("Rest API: MP3 processing gap: {}ms", gapMs); + } + m_lastMp3ProcessTicks = now; + // msg is the whole message (including header) from server // after "Path:audio\r\n" are audio binary data // Note that the first 2 bytes are not part of the header string @@ -207,8 +225,12 @@ void SpeechRestAPI::Mp3ProcessLoop(BlockingQueue& queue, std::stop_ } // Sending audio data to SAPI can block, so do this without lock + DWORD convertStartTicks = _GetTickCount(); mp3Decoder.Convert(reinterpret_cast(mp3data.data()), mp3data.size(), std::bind_front(&SpeechRestAPI::ProcessWaveData, this, std::ref(mp3Decoder.GetWaveFormat()))); + DWORD convertMs = _GetTickCount() - convertStartTicks; + if (convertMs > 200) + LogDebug("Rest API: MP3 chunk processing took {}ms for {} bytes", convertMs, mp3data.size()); } } @@ -279,7 +301,16 @@ void SpeechRestAPI::OnMessage(BlockingQueue& queue, WSConnectionPtr if (msg->get_opcode() == websocketpp::frame::opcode::binary) { // If the message is binary, place this message in the queue to let the MP3 thread process it - queue.push(std::move(msg->get_raw_payload())); + auto payload = msg->get_raw_payload(); + DWORD now = _GetTickCount(); + if (m_lastBinaryMessageTicks != 0) + { + DWORD gapMs = now - m_lastBinaryMessageTicks; + if (gapMs > 500) + LogDebug("Rest API: Binary audio message gap: {}ms, payload {} bytes", gapMs, payload.size()); + } + m_lastBinaryMessageTicks = now; + queue.push(std::move(payload)); } else { @@ -424,4 +455,4 @@ size_t SpeechRestAPI::FindWord(const std::string& utf8Word, size_t& lastPos) startpos = ssml.find('>', wordPos + word.size()); } return ssml.npos; -} \ No newline at end of file +} diff --git a/NaturalVoiceSAPIAdapter/SpeechRestAPI.h b/NaturalVoiceSAPIAdapter/SpeechRestAPI.h index d5ba6ca..de5829f 100644 --- a/NaturalVoiceSAPIAdapter/SpeechRestAPI.h +++ b/NaturalVoiceSAPIAdapter/SpeechRestAPI.h @@ -70,6 +70,8 @@ class SpeechRestAPI // then use another thread for sending the data to SAPI. bool m_firstDataReceived = false; bool m_allDataReceived = false; + DWORD m_lastBinaryMessageTicks = 0; + DWORD m_lastMp3ProcessTicks = 0; void DoSpeakAsync(); void Mp3ProcessLoop(BlockingQueue& queue, std::stop_token token); @@ -81,4 +83,4 @@ class SpeechRestAPI void OnClose(BlockingQueue& queue, const WSConnectionPtr& conn); void OnSynthEvent(const nlohmann::json& metadata); size_t FindWord(const std::string& utf8Word, size_t& lastPos); -}; \ No newline at end of file +}; diff --git a/NaturalVoiceSAPIAdapter/TTSEngine.cpp b/NaturalVoiceSAPIAdapter/TTSEngine.cpp index b002b50..b1fd0c8 100644 --- a/NaturalVoiceSAPIAdapter/TTSEngine.cpp +++ b/NaturalVoiceSAPIAdapter/TTSEngine.cpp @@ -80,7 +80,7 @@ STDMETHODIMP CTTSEngine::Speak(DWORD /*dwSpeakFlags*/, { return E_INVALIDARG; } - if (!m_synthesizer && !m_restApi) + if (!m_synthesizer && !m_restApi && !m_pollyApi && !m_elevenLabsApi) { return SPERR_UNINITIALIZED; } @@ -106,8 +106,12 @@ STDMETHODIMP CTTSEngine::Speak(DWORD /*dwSpeakFlags*/, pOutputSite->GetEventInterest(&eventInterests); if (m_synthesizer) SetupSynthesizerEvents(eventInterests); - else + else if (m_restApi) SetupRestAPIEvents(eventInterests); + else if (m_pollyApi) + SetupPollyEvents(eventInterests); + else + SetupElevenLabsEvents(eventInterests); // Clear m_pOutputSite automatically when Speak is completed ScopeGuard siteDeleter([this]() @@ -140,10 +144,18 @@ STDMETHODIMP CTTSEngine::Speak(DWORD /*dwSpeakFlags*/, { future = std::async(std::launch::async, [this]() { CheckSynthesisResult(m_synthesizer->SpeakSsml(m_ssml)); }); } - else + else if (m_restApi) { future = m_restApi->SpeakAsync(m_ssml); } + else if (m_pollyApi) + { + future = m_pollyApi->SpeakAsync(m_ssml, m_pollyVoiceId, m_pollyEngine); + } + else + { + future = m_elevenLabsApi->SpeakAsync(m_ssml, m_elevenLabsVoiceId); + } while (!(pOutputSite->GetActions() & SPVES_ABORT) && future.wait_for(std::chrono::milliseconds(0)) == std::future_status::timeout) @@ -177,8 +189,12 @@ STDMETHODIMP CTTSEngine::Speak(DWORD /*dwSpeakFlags*/, future.wait(); }); } - else + else if (m_restApi) m_restApi->Stop(); + else if (m_pollyApi) + m_pollyApi->Stop(); + else if (m_elevenLabsApi) + m_elevenLabsApi->Stop(); m_lastSpeakCompletedTicks = 0; } @@ -274,9 +290,13 @@ void CTTSEngine::InitVoice() && InitCloudVoiceSynthesizer(pConfigKey)) return; } + if (InitPollyVoice(pConfigKey)) + return; + if (InitElevenLabsVoice(pConfigKey)) + return; if (InitCloudVoiceRestAPI(pConfigKey)) return; - + throw std::invalid_argument("Invalid NaturalVoiceConfig configuration."); } @@ -433,6 +453,74 @@ bool CTTSEngine::InitCloudVoiceRestAPI(ISpDataKey* pConfigKey) return true; } +bool CTTSEngine::InitPollyVoice(ISpDataKey* pConfigKey) +{ + CSpDynamicString pszVoiceId, pszAccessKey, pszSecretKey, pszRegion, pszEngine; + if (CheckHrNotFound(pConfigKey->GetStringValue(L"PollyVoiceId", &pszVoiceId)) + || CheckHrNotFound(pConfigKey->GetStringValue(L"AccessKey", &pszAccessKey)) + || CheckHrNotFound(pConfigKey->GetStringValue(L"SecretKey", &pszSecretKey)) + || CheckHrNotFound(pConfigKey->GetStringValue(L"Region", &pszRegion))) + return false; + + m_pollyVoiceId = WStringToUTF8(pszVoiceId.m_psz); + m_pollyEngine = "neural"; // default if not specified + if (!CheckHrNotFound(pConfigKey->GetStringValue(L"Engine", &pszEngine))) + m_pollyEngine = WStringToUTF8(pszEngine.m_psz); + + m_pollyApi = std::make_unique(); + m_pollyApi->SetCredentials( + WStringToUTF8(pszAccessKey.m_psz), + WStringToUTF8(pszSecretKey.m_psz), + WStringToUTF8(pszRegion.m_psz)); + + // ErrorMode::ProbeForError: Polly has no lightweight probe; skip it. + // A real error will surface on the first SpeakAsync call. + + LogInfo("Polly voice created: {} ({})", m_pollyVoiceId, m_pollyEngine); + return true; +} + +void CTTSEngine::SetupPollyEvents(ULONGLONG /*interests*/) +{ + // Polly returns raw MP3 audio only – no word/sentence/bookmark events. + m_pollyApi->AudioReceivedCallback = std::bind_front(&CTTSEngine::OnAudioData, this); + m_pollyApi->WordBoundaryCallback = nullptr; + m_pollyApi->SentenceBoundaryCallback = nullptr; + m_pollyApi->BookmarkCallback = nullptr; + m_pollyApi->SessionEndCallback = nullptr; +} + +bool CTTSEngine::InitElevenLabsVoice(ISpDataKey* pConfigKey) +{ + CSpDynamicString pszVoiceId, pszApiKey, pszModel; + if (CheckHrNotFound(pConfigKey->GetStringValue(L"ElevenLabsVoiceId", &pszVoiceId)) + || CheckHrNotFound(pConfigKey->GetStringValue(L"ApiKey", &pszApiKey))) + return false; + + std::string model = "eleven_multilingual_v2"; // default + if (!CheckHrNotFound(pConfigKey->GetStringValue(L"Model", &pszModel))) + model = WStringToUTF8(pszModel.m_psz); + + m_elevenLabsVoiceId = WStringToUTF8(pszVoiceId.m_psz); + + m_elevenLabsApi = std::make_unique(); + m_elevenLabsApi->SetCredentials( + WStringToUTF8(pszApiKey.m_psz), + std::move(model)); + + // No lightweight probe for ElevenLabs – errors will surface on first SpeakAsync. + + LogInfo("ElevenLabs voice created: {}", m_elevenLabsVoiceId); + return true; +} + +void CTTSEngine::SetupElevenLabsEvents(ULONGLONG /*interests*/) +{ + // ElevenLabs HTTP endpoint returns raw PCM – no word/sentence/bookmark events. + m_elevenLabsApi->AudioReceivedCallback = std::bind_front(&CTTSEngine::OnAudioData, this); + m_elevenLabsApi->SessionEndCallback = nullptr; +} + // Returns the trailing silence (zero) wave data length, in bytes template static size_t GetTrailingSilenceLengthMono(BYTE* waveData, size_t length) @@ -459,7 +547,12 @@ static size_t GetTrailingSilenceLengthMono(BYTE* waveData, size_t length) int CTTSEngine::OnAudioData(uint8_t* data, uint32_t len) { - std::lock_guard lock(m_outputSiteMutex); + DWORD lockWaitStartTicks = _GetTickCount(); + std::unique_lock lock(m_outputSiteMutex); + DWORD lockWaitMs = _GetTickCount() - lockWaitStartTicks; + if (lockWaitMs > 50) + LogDebug("Speak: Audio output lock wait took {}ms", lockWaitMs); + if (!m_pOutputSite) { LogWarn("Speak: Audio write with invalid OutputSite, ignored"); @@ -467,6 +560,17 @@ int CTTSEngine::OnAudioData(uint8_t* data, uint32_t len) } ULONG written = 0; + auto writeWithTiming = [this](const void* buffer, ULONG bytes, ULONG* written, const char* label) + { + DWORD writeStartTicks = _GetTickCount(); + HRESULT hr = m_pOutputSite->Write(buffer, bytes, written); + DWORD writeMs = _GetTickCount() - writeStartTicks; + if (writeMs > 200) + LogDebug("Speak: OutputSite write '{}' took {}ms for {} bytes", label, writeMs, bytes); + else if (writeMs > 50) + LogTrace("Speak: OutputSite write '{}': {} bytes in {}ms", label, bytes, writeMs); + return hr; + }; if (m_onlineDelayOptimization) { @@ -489,7 +593,7 @@ int CTTSEngine::OnAudioData(uint8_t* data, uint32_t len) LogDebug("Speak: Compensate for the previous trailing {}ms silence", silenceMs - passedMs); // Write the compensated silence auto mem = std::make_unique(m_compensatedSilentBytes); // zeroed mem - m_pOutputSite->Write(mem.get(), m_compensatedSilentBytes, &written); + writeWithTiming(mem.get(), m_compensatedSilentBytes, &written, "compensated silence"); } } m_lastSilentBytes = 0; @@ -515,13 +619,13 @@ int CTTSEngine::OnAudioData(uint8_t* data, uint32_t len) if (m_lastSilentBytes != 0) { auto mem = std::make_unique(m_lastSilentBytes); // zeroed mem - m_pOutputSite->Write(mem.get(), m_lastSilentBytes, &written); + writeWithTiming(mem.get(), m_lastSilentBytes, &written, "held silence"); } m_lastSilentBytes = silentBytes; } } - HRESULT hr = m_pOutputSite->Write(data, len - m_lastSilentBytes, &written); + HRESULT hr = writeWithTiming(data, len - m_lastSilentBytes, &written, "audio"); // Assumes that the data can be either entirely written or not written at all // because some implementations do not set the written bytes correctly if (SUCCEEDED(hr)) @@ -1042,6 +1146,16 @@ bool CTTSEngine::BuildSSML(const SPVTEXTFRAG* pTextFragList) } else if (pTextFrag->ulTextLen >= 3) // opening tag { + // Skip tags to prevent nesting inside the root . + // Some callers (e.g. .NET System.Speech) wrap their SSML in a root; + // SAPI may forward it as SPVA_ParseUnknownTag when it doesn't recognise the + // namespace/version attributes, which would produce invalid SSML sent to Azure. + auto nameBegin = std::find_if_not(tag.begin() + 1, tag.end() - 1, iswspace); + auto nameEnd = std::find_if(nameBegin, tag.end() - 1, + [](wchar_t c) { return iswspace(c) || c == L'>' || c == L'/'; }); + if (EqualsIgnoreCase(std::wstring_view(nameBegin, nameEnd), L"speak")) + break; + m_ssml.append(pTextFrag->pTextStart, pTextFrag->ulTextLen); customTags.emplace_back(pTextFrag->pTextStart, pTextFrag->ulTextLen); // add to tag list } diff --git a/NaturalVoiceSAPIAdapter/TTSEngine.h b/NaturalVoiceSAPIAdapter/TTSEngine.h index fe378b6..7f1ccf2 100644 --- a/NaturalVoiceSAPIAdapter/TTSEngine.h +++ b/NaturalVoiceSAPIAdapter/TTSEngine.h @@ -6,6 +6,8 @@ #include "pch.h" #include #include "SpeechRestAPI.h" +#include "AmazonPollyAPI.h" +#include "ElevenLabsAPI.h" #include "Logger.h" #include "SapiException.h" #include "Mp3Decoder.h" @@ -121,6 +123,11 @@ END_COM_MAP() CComPtr m_phoneConverter; std::shared_ptr m_synthesizer; std::unique_ptr m_restApi; + std::unique_ptr m_pollyApi; + std::string m_pollyVoiceId; + std::string m_pollyEngine; + std::unique_ptr m_elevenLabsApi; + std::string m_elevenLabsVoiceId; std::future m_lastCancellingFuture; ErrorMode m_errorMode = ErrorMode::ProbeForError; @@ -164,9 +171,13 @@ END_COM_MAP() bool InitLocalVoice(ISpDataKey* pConfigKey); bool InitCloudVoiceSynthesizer(ISpDataKey* pConfigKey); bool InitCloudVoiceRestAPI(ISpDataKey* pConfigKey); + bool InitPollyVoice(ISpDataKey* pConfigKey); + bool InitElevenLabsVoice(ISpDataKey* pConfigKey); void SetupSynthesizerEvents(ULONGLONG interests); void ClearSynthesizerEvents(); void SetupRestAPIEvents(ULONGLONG interests); + void SetupPollyEvents(ULONGLONG interests); + void SetupElevenLabsEvents(ULONGLONG interests); void AppendTextFragToSsml(const SPVTEXTFRAG* pTextFrag); void AppendPhonemesToSsml(const SPPHONEID* pPhoneIds); diff --git a/NaturalVoiceSAPIAdapter/TaskScheduler.h b/NaturalVoiceSAPIAdapter/TaskScheduler.h index 82e397b..69e0bfb 100644 --- a/NaturalVoiceSAPIAdapter/TaskScheduler.h +++ b/NaturalVoiceSAPIAdapter/TaskScheduler.h @@ -13,20 +13,23 @@ // before this DLL is unloaded. class TaskScheduler { +public: + using TaskHandle = HANDLE; + private: // Timer queues CANNOT be created in DllMain, otherwise deadlocks would happen on Windows XP // So we create the timer queue on first use HANDLE hTimerQueue = nullptr; + std::once_flag initFlag; public: void Initialize() { - if (!hTimerQueue) - { + std::call_once(initFlag, [this]() { hTimerQueue = CreateTimerQueue(); if (!hTimerQueue) throw std::system_error(GetLastError(), std::system_category()); - } + }); } ~TaskScheduler() @@ -44,10 +47,11 @@ class TaskScheduler // delete tuples that haven't been deleted by callback functions std::lock_guard lock(deleterMutex); - for (auto& [pTuple, pFunc] : deleters) + for (auto& [hTask, data] : tasks) { - pFunc(pTuple); + data.second(data.first); } + tasks.clear(); } private: @@ -66,7 +70,7 @@ class TaskScheduler }; typedef void (*DataDeleterFunc)(PVOID); - std::unordered_map deleters; + std::unordered_map> tasks; std::mutex deleterMutex; template @@ -83,7 +87,14 @@ class TaskScheduler // so remove it from the deleter list TaskScheduler& scheduler = *pData->pScheduler; std::lock_guard lock(scheduler.deleterMutex); - scheduler.deleters.erase(param); + for (auto it = scheduler.tasks.begin(); it != scheduler.tasks.end(); ++it) + { + if (it->second.first == param) + { + scheduler.tasks.erase(it); + break; + } + } } auto& tup = pData->tuple; @@ -109,8 +120,6 @@ class TaskScheduler } public: - using TaskHandle = HANDLE; - template requires std::invocable TaskHandle StartNewTask(DWORD delayMs, DWORD periodMs, Func&& func, Args&&... args) { @@ -123,6 +132,7 @@ class TaskScheduler std::forward(func), std::forward(args)... ); + std::lock_guard lock(deleterMutex); HANDLE hTimer; if (!CreateTimerQueueTimer(&hTimer, hTimerQueue, GetTimerQueueProc(std::make_index_sequence<1 + sizeof...(Args)>()), @@ -134,8 +144,7 @@ class TaskScheduler throw std::system_error(GetLastError(), std::system_category()); } - std::lock_guard lock(deleterMutex); - deleters.emplace(pData.get(), &DataDeleter); + tasks.emplace(hTimer, std::make_pair(pData.get(), &DataDeleter)); pData.release(); return hTimer; @@ -156,5 +165,14 @@ class TaskScheduler void CancelTask(TaskHandle hTask, bool waitForTask) { (void)DeleteTimerQueueTimer(hTimerQueue, hTask, waitForTask ? INVALID_HANDLE_VALUE : nullptr); + if (waitForTask) + { + std::lock_guard lock(deleterMutex); + if (auto it = tasks.find(hTask); it != tasks.end()) + { + it->second.second(it->second.first); + tasks.erase(it); + } + } } }; \ No newline at end of file diff --git a/NaturalVoiceSAPIAdapter/VoiceTokenEnumerator.cpp b/NaturalVoiceSAPIAdapter/VoiceTokenEnumerator.cpp index a7b7eb1..36d43d9 100644 --- a/NaturalVoiceSAPIAdapter/VoiceTokenEnumerator.cpp +++ b/NaturalVoiceSAPIAdapter/VoiceTokenEnumerator.cpp @@ -1,6 +1,8 @@ ïŧŋ// VoiceTokenEnumerator.cpp: CVoiceTokenEnumerator įš„åŪžįŽ° #include "pch.h" #include "VoiceTokenEnumerator.h" +#include "AmazonPollyAPI.h" +#include "ElevenLabsAPI.h" #include #include "SpeechServiceConstants.h" #include "NetUtils.h" @@ -197,6 +199,38 @@ HRESULT CVoiceTokenEnumerator::FinalConstruct() noexcept for (auto& token : onlineTokens) s_cachedTokens.push_back(std::move(token.second)); + + if (!key.GetDword(L"NoPollyVoices")) + { + std::wstring pollyAccessKey = key.GetString(L"PollyAccessKey"); + std::wstring pollySecretKey = key.GetString(L"PollySecretKey"); + std::wstring pollyRegion = key.GetString(L"PollyRegion"); + std::wstring pollyEngine = key.GetString(L"PollyEngine"); + if (pollyEngine.empty()) pollyEngine = L"neural"; + if (!pollyAccessKey.empty() && !pollySecretKey.empty() && !pollyRegion.empty()) + { + TokenMap pollyTokens; + EnumPollyVoices(pollyTokens, langFlags, languages, + pollyAccessKey, pollySecretKey, pollyRegion, pollyEngine, errorMode); + for (auto& token : pollyTokens) + s_cachedTokens.push_back(std::move(token.second)); + } + } + + if (!key.GetDword(L"NoElevenLabsVoices")) + { + std::wstring elevenLabsApiKey = key.GetString(L"ElevenLabsApiKey"); + std::wstring elevenLabsModel = key.GetString(L"ElevenLabsModel"); + if (elevenLabsModel.empty()) elevenLabsModel = L"eleven_multilingual_v2"; + if (!elevenLabsApiKey.empty()) + { + TokenMap elTokens; + EnumElevenLabsVoices(elTokens, langFlags, languages, + elevenLabsApiKey, elevenLabsModel, errorMode); + for (auto& token : elTokens) + s_cachedTokens.push_back(std::move(token.second)); + } + } } if (!s_isCacheTaskScheduled) @@ -856,4 +890,365 @@ void CVoiceTokenEnumerator::EnumAzureVoices(TokenMap& tokens, DWORD langFlags, c { return MakeAzureVoiceToken(json, key, region, errorMode); }); +} + +// ───────────────────────────────────────────────────────────────────────────── +// Amazon Polly voice enumeration +// Registry values under the enumerator config key: +// NoPollyVoices (DWORD) – set to 1 to disable +// PollyAccessKey (string) – AWS access key ID +// PollySecretKey (string) – AWS secret access key +// PollyRegion (string) – AWS region, e.g. "us-east-1" +// PollyEngine (string) – "neural" (default) | "standard" | "long-form" | "generative" +// ───────────────────────────────────────────────────────────────────────────── + +static std::shared_ptr MakePollyVoiceToken( + const nlohmann::json& json, + const std::wstring& accessKey, + const std::wstring& secretKey, + const std::wstring& region, + const std::wstring& engine, + ErrorMode errorMode = ErrorMode::ProbeForError +) +{ + // Polly voice list entry fields: Id, Name, LanguageCode, LanguageName, Gender, SupportedEngines + std::wstring localeName = UTF8ToWString(json.at("LanguageCode").get()); + std::wstring languageIds = LanguageIDsFromLocaleName(localeName); + if (languageIds.empty()) + return {}; + + std::wstring voiceId = UTF8ToWString(json.at("Id").get()); + std::wstring voiceName = UTF8ToWString(json.at("Name").get()); + std::wstring langName = UTF8ToWString(json.at("LanguageName").get()); + std::wstring gender = UTF8ToWString(json.at("Gender").get()); + + std::wstring shortFriendlyName = L"Amazon " + voiceName; + std::wstring friendlyName = shortFriendlyName + L" - " + langName; + + // Registry key name: "Polly-Joanna-neural" + std::wstring regName = L"Polly-" + voiceId + L"-" + engine; + + return std::shared_ptr(new DataKeyData { + .path = regName, + .values = { + { L"", std::move(friendlyName) }, + { L"CLSID", L"{013AB33B-AD1A-401C-8BEE-F6E2B046A94E}" } + }, + .subkeys = { + { L"Attributes", { + .path = regName + L"\\Attributes", + .values = { + { L"Name", std::move(shortFriendlyName) }, + { L"Gender", std::move(gender) }, + { L"Age", L"Adult" }, + { L"Language", std::move(languageIds) }, + { L"Locale", std::move(localeName) }, + { L"Vendor", L"Amazon" }, + { L"NaturalVoiceType", L"Polly;Cloud" } + } + } }, + { L"NaturalVoiceConfig", { + .path = regName + L"\\NaturalVoiceConfig", + .values = { + { L"ErrorMode", std::to_wstring(static_cast(errorMode)) }, + { L"PollyVoiceId", voiceId }, + { L"AccessKey", accessKey }, + { L"SecretKey", secretKey }, + { L"Region", region }, + { L"Engine", engine } + } + } } + } + }); +} + +void CVoiceTokenEnumerator::EnumPollyVoices( + TokenMap& tokens, + DWORD langFlags, + const std::vector& languages, + const std::wstring& accessKey, + const std::wstring& secretKey, + const std::wstring& region, + const std::wstring& engine, + ErrorMode errorMode) +{ + try + { + const auto voices = AmazonPollyAPI::GetVoiceList( + WStringToUTF8(accessKey), WStringToUTF8(secretKey), + WStringToUTF8(region), WStringToUTF8(engine)); + + bool universalSupported = IsUniversalPhoneConverterSupported(); + std::set supportedLangs; + if (!universalSupported) + supportedLangs = GetSupportedLanguageIDs(); + + std::set userLangs; + if (!(langFlags & Lang_AllLanguages) && languages.empty()) + userLangs = GetUserPreferredLanguageIDs(false); + + for (const auto& voice : voices) + { + std::wstring locale = UTF8ToWString(voice.at("LanguageCode").get()); + LANGID langid = LangIDFromLocaleName(locale.c_str()); + if (!universalSupported && !supportedLangs.contains(langid)) + continue; + + if (!(langFlags & Lang_AllLanguages)) + { + if (languages.empty()) + { + if (!userLangs.contains(langid)) + continue; + } + else + { + if (!IsLanguageInList(locale, languages)) + continue; + } + } + + std::string voiceId = voice.at("Id").get(); + auto token = MakePollyVoiceToken(voice, accessKey, secretKey, region, engine, errorMode); + if (token) + tokens.try_emplace("Polly-" + voiceId + "-" + WStringToUTF8(engine), std::move(token)); + } + } + catch (const std::bad_alloc&) + { + throw; + } + catch (const std::system_error& ex) + { + LogWarn("Voice enum: Cannot get Polly voice list: {}", ex); + } + catch (const std::exception& ex) + { + LogWarn("Voice enum: Cannot get Polly voice list: {}", ex); + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// ElevenLabs voice enumeration +// Registry values under the enumerator config key: +// NoElevenLabsVoices (DWORD) – set to 1 to disable +// ElevenLabsApiKey (string) – xi-api-key +// ElevenLabsModel (string) – model_id, default "eleven_multilingual_v2" +// ───────────────────────────────────────────────────────────────────────────── + +// Map ISO 639-1 code or language name (lowercase) to BCP-47 locale. +// Falls back to "en-US" for unknown values. +static std::wstring EL_LangToLocale(std::string lang) +{ + for (char& c : lang) c = static_cast(std::tolower(static_cast(c))); + + // ISO 639-1 codes first, then common English names + static const std::pair s_map[] = { + {"en", L"en-US"}, {"english", L"en-US"}, + {"de", L"de-DE"}, {"german", L"de-DE"}, + {"es", L"es-ES"}, {"spanish", L"es-ES"}, + {"fr", L"fr-FR"}, {"french", L"fr-FR"}, + {"it", L"it-IT"}, {"italian", L"it-IT"}, + {"pt", L"pt-BR"}, {"portuguese", L"pt-BR"}, + {"pl", L"pl-PL"}, {"polish", L"pl-PL"}, + {"nl", L"nl-NL"}, {"dutch", L"nl-NL"}, + {"ar", L"ar-SA"}, {"arabic", L"ar-SA"}, + {"zh", L"zh-CN"}, {"chinese", L"zh-CN"}, + {"ja", L"ja-JP"}, {"japanese", L"ja-JP"}, + {"ko", L"ko-KR"}, {"korean", L"ko-KR"}, + {"ru", L"ru-RU"}, {"russian", L"ru-RU"}, + {"hi", L"hi-IN"}, {"hindi", L"hi-IN"}, + {"tr", L"tr-TR"}, {"turkish", L"tr-TR"}, + {"sv", L"sv-SE"}, {"swedish", L"sv-SE"}, + {"nb", L"nb-NO"}, {"no", L"nb-NO"}, {"norwegian", L"nb-NO"}, + {"da", L"da-DK"}, {"danish", L"da-DK"}, + {"fi", L"fi-FI"}, {"finnish", L"fi-FI"}, + {"cs", L"cs-CZ"}, {"czech", L"cs-CZ"}, + {"sk", L"sk-SK"}, {"slovak", L"sk-SK"}, + {"ro", L"ro-RO"}, {"romanian", L"ro-RO"}, + {"hu", L"hu-HU"}, {"hungarian", L"hu-HU"}, + {"el", L"el-GR"}, {"greek", L"el-GR"}, + {"he", L"he-IL"}, {"hebrew", L"he-IL"}, + {"id", L"id-ID"}, {"indonesian", L"id-ID"}, + {"ms", L"ms-MY"}, {"malay", L"ms-MY"}, + {"th", L"th-TH"}, {"thai", L"th-TH"}, + {"vi", L"vi-VN"}, {"vietnamese", L"vi-VN"}, + {"uk", L"uk-UA"}, {"ukrainian", L"uk-UA"}, + {"bg", L"bg-BG"}, {"bulgarian", L"bg-BG"}, + {"hr", L"hr-HR"}, {"croatian", L"hr-HR"}, + {"ca", L"ca-ES"}, {"catalan", L"ca-ES"}, + }; + for (auto& [code, locale] : s_map) + if (lang == code) return locale; + + return L"en-US"; // default +} + +// Determine a SAPI-compatible BCP-47 locale for an ElevenLabs voice JSON object. +// 1. verified_languages[0].locale (most reliable – BCP-47 directly) +// 2. labels["language"] (ISO 639-1 code or English name) +// 3. "en-US" (fallback) +static std::wstring EL_GetVoiceLocale(const nlohmann::json& voice) +{ + // 1. verified_languages + if (voice.contains("verified_languages") && voice["verified_languages"].is_array()) + { + const auto& vl = voice["verified_languages"]; + if (!vl.empty()) + { + const auto& first = vl[0]; + if (first.contains("locale") && first["locale"].is_string()) + { + std::string locale = first["locale"].get(); + if (!locale.empty()) + return UTF8ToWString(locale); + } + } + } + + // 2. labels["language"] + if (voice.contains("labels") && voice["labels"].is_object()) + { + const auto& labels = voice["labels"]; + auto it = labels.find("language"); + if (it != labels.end() && it->is_string()) + { + std::string lang = it->get(); + if (!lang.empty()) + return EL_LangToLocale(lang); + } + } + + // 3. Default + return L"en-US"; +} + +static std::shared_ptr MakeElevenLabsVoiceToken( + const nlohmann::json& json, + const std::wstring& apiKey, + const std::wstring& model, + ErrorMode errorMode = ErrorMode::ProbeForError) +{ + std::wstring voiceId = UTF8ToWString(json.at("voice_id").get()); + std::wstring name = UTF8ToWString(json.value("name", "Unknown")); + + std::wstring localeName = EL_GetVoiceLocale(json); + std::wstring languageIds = LanguageIDsFromLocaleName(localeName); + if (languageIds.empty()) + return {}; + + // Gender from labels["gender"], if present + std::wstring gender; + if (json.contains("labels") && json["labels"].is_object()) + { + auto& labels = json["labels"]; + auto it = labels.find("gender"); + if (it != labels.end() && it->is_string()) + gender = UTF8ToWString(it->get()); + } + // Capitalise first letter to match SAPI convention (Female / Male) + if (!gender.empty()) + gender[0] = static_cast(std::towupper(gender[0])); + + std::wstring shortFriendlyName = L"ElevenLabs " + name; + std::wstring friendlyName = shortFriendlyName + L" - " + localeName; + + // Registry key: "ElevenLabs-{voice_id}" + std::wstring regName = L"ElevenLabs-" + voiceId; + + return std::shared_ptr(new DataKeyData { + .path = regName, + .values = { + { L"", std::move(friendlyName) }, + { L"CLSID", L"{013AB33B-AD1A-401C-8BEE-F6E2B046A94E}" } + }, + .subkeys = { + { L"Attributes", { + .path = regName + L"\\Attributes", + .values = { + { L"Name", std::move(shortFriendlyName) }, + { L"Gender", std::move(gender) }, + { L"Age", L"Adult" }, + { L"Language", std::move(languageIds) }, + { L"Locale", std::move(localeName) }, + { L"Vendor", L"ElevenLabs" }, + { L"NaturalVoiceType", L"ElevenLabs;Cloud" } + } + } }, + { L"NaturalVoiceConfig", { + .path = regName + L"\\NaturalVoiceConfig", + .values = { + { L"ErrorMode", std::to_wstring(static_cast(errorMode)) }, + { L"ElevenLabsVoiceId", voiceId }, + { L"ApiKey", apiKey }, + { L"Model", model } + } + } } + } + }); +} + +void CVoiceTokenEnumerator::EnumElevenLabsVoices( + TokenMap& tokens, + DWORD langFlags, + const std::vector& languages, + const std::wstring& apiKey, + const std::wstring& model, + ErrorMode errorMode) +{ + try + { + const auto voices = ElevenLabsAPI::GetVoiceList(WStringToUTF8(apiKey)); + + bool universalSupported = IsUniversalPhoneConverterSupported(); + std::set supportedLangs; + if (!universalSupported) + supportedLangs = GetSupportedLanguageIDs(); + + std::set userLangs; + if (!(langFlags & Lang_AllLanguages) && languages.empty()) + userLangs = GetUserPreferredLanguageIDs(false); + + for (const auto& voice : voices) + { + if (!voice.contains("voice_id") || !voice["voice_id"].is_string()) + continue; + + std::wstring locale = EL_GetVoiceLocale(voice); + LANGID langid = LangIDFromLocaleName(locale.c_str()); + if (!universalSupported && !supportedLangs.contains(langid)) + continue; + + if (!(langFlags & Lang_AllLanguages)) + { + if (languages.empty()) + { + if (!userLangs.contains(langid)) + continue; + } + else + { + if (!IsLanguageInList(locale, languages)) + continue; + } + } + + const std::string voiceId = voice["voice_id"].get(); + auto token = MakeElevenLabsVoiceToken(voice, apiKey, model, errorMode); + if (token) + tokens.try_emplace("ElevenLabs-" + voiceId, std::move(token)); + } + } + catch (const std::bad_alloc&) + { + throw; + } + catch (const std::system_error& ex) + { + LogWarn("Voice enum: Cannot get ElevenLabs voice list: {}", ex); + } + catch (const std::exception& ex) + { + LogWarn("Voice enum: Cannot get ElevenLabs voice list: {}", ex); + } } \ No newline at end of file diff --git a/NaturalVoiceSAPIAdapter/VoiceTokenEnumerator.h b/NaturalVoiceSAPIAdapter/VoiceTokenEnumerator.h index c1435ad..e63ae88 100644 --- a/NaturalVoiceSAPIAdapter/VoiceTokenEnumerator.h +++ b/NaturalVoiceSAPIAdapter/VoiceTokenEnumerator.h @@ -53,6 +53,12 @@ END_COM_MAP() ErrorMode errorMode); static void EnumAzureVoices(TokenMap& tokens, DWORD langFlags, const std::vector& languages, const std::wstring& key, const std::wstring& region, ErrorMode errorMode); + static void EnumPollyVoices(TokenMap& tokens, DWORD langFlags, const std::vector& languages, + const std::wstring& accessKey, const std::wstring& secretKey, + const std::wstring& region, const std::wstring& engine, ErrorMode errorMode); + static void EnumElevenLabsVoices(TokenMap& tokens, DWORD langFlags, + const std::vector& languages, + const std::wstring& apiKey, const std::wstring& model, ErrorMode errorMode); }; OBJECT_ENTRY_AUTO(__uuidof(VoiceTokenEnumerator), CVoiceTokenEnumerator) diff --git a/NaturalVoiceSAPIAdapter/WSConnectionPool.cpp b/NaturalVoiceSAPIAdapter/WSConnectionPool.cpp index d1bb7fa..e779941 100644 --- a/NaturalVoiceSAPIAdapter/WSConnectionPool.cpp +++ b/NaturalVoiceSAPIAdapter/WSConnectionPool.cpp @@ -337,8 +337,8 @@ void WSConnectionPool::SetConnectionHandlers(HostInfo& info, WSConnection* wrapp info.lastException = nullptr; LogDebug("Connection pool: Connection {} closed, removed from pool ({}/{})", hdl, info.connections.size(), info.count); - RemoveConnection(info, wrapper); info.connectionChanged.notify_all(); + RemoveConnection(info, wrapper); }); conn->set_fail_handler([&info, wrapper](websocketpp::connection_hdl hdl) @@ -360,8 +360,8 @@ void WSConnectionPool::SetConnectionHandlers(HostInfo& info, WSConnection* wrapp if (llResponse != 0) s_responseTimeDelta.store(llResponse - llNow, std::memory_order_relaxed); } - RemoveConnection(info, wrapper); info.connectionChanged.notify_all(); + RemoveConnection(info, wrapper); }); } diff --git a/NaturalVoiceSAPIAdapter/pch.h b/NaturalVoiceSAPIAdapter/pch.h index 9660927..98bfde2 100644 --- a/NaturalVoiceSAPIAdapter/pch.h +++ b/NaturalVoiceSAPIAdapter/pch.h @@ -1,4 +1,4 @@ -ïŧŋ// pch.h: čŋ™æ˜ŊéĒ„įž–čŊ‘æ ‡åĪīæ–‡äŧķ。 +// pch.h: čŋ™æ˜ŊéĒ„įž–čŊ‘æ ‡åĪīæ–‡äŧķ。 // ä–đåˆ—å‡šįš„æ–‡äŧķäŧ…įž–čŊ‘äļ€æŽĄïžŒæéŦ˜äš†å°†æĨį”Ÿæˆįš„į”Ÿæˆæ€§čƒ―ã€‚ // čŋ™čŋ˜å°†å―ąå“ IntelliSense æ€§čƒ―ïžŒåŒ…æ‹ŽäŧĢ᠁åŪŒæˆå’ŒčŪļåΚäŧĢ᠁æĩč§ˆåŠŸčƒ―。 // ä―†æ˜ŊåĶ‚æžœæ­ĪåĪ„åˆ—å‡šįš„æ–‡äŧķäļ­įš„äŧŧä―•äļ€äļŠåœĻį”Ÿæˆäđ‹é—ī有æ›ī新åۃäŧŽå…ĻéƒĻéƒ―å°†čĒŦé‡æ–°įž–čŊ‘。 @@ -9,5 +9,6 @@ // æ·ŧ加č́åœĻæ­ĪåĪ„éĒ„įž–čŊ‘įš„æ ‡åĪī #include "framework.h" +#include #endif //PCH_H