diff --git a/README.md b/README.md index c3c570a..3ee26d3 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ unSpeech lets you use various online TTS with OpenAI-compatible API. - [Alibaba Cloud Model Studio / 阿里云百炼 / CosyVoice](https://www.alibabacloud.com/en/product/modelstudio) - [Volcano Engine / 火山引擎语音技术](https://www.volcengine.com/product/voice-tech) - [ElevenLabs](https://elevenlabs.io/docs/api-reference/text-to-speech/convert) +- [MiniMax](https://platform.minimaxi.com/docs/guides/speech-t2a-http) - [Koemotion (by Rinna)](https://koemotion.rinna.co.jp/) ## Getting Started diff --git a/pkg/backend/backend.go b/pkg/backend/backend.go index bd93b1d..355e291 100644 --- a/pkg/backend/backend.go +++ b/pkg/backend/backend.go @@ -10,6 +10,7 @@ import ( "github.com/moeru-ai/unspeech/pkg/backend/elevenlabs" "github.com/moeru-ai/unspeech/pkg/backend/koemotion" "github.com/moeru-ai/unspeech/pkg/backend/microsoft" + "github.com/moeru-ai/unspeech/pkg/backend/minimax" "github.com/moeru-ai/unspeech/pkg/backend/openai" "github.com/moeru-ai/unspeech/pkg/backend/types" "github.com/moeru-ai/unspeech/pkg/backend/volcengine" @@ -37,6 +38,8 @@ func Speech(c echo.Context) mo.Result[any] { return volcengine.HandleSpeech(c, utils.ResultToOption(options)) case "ali", "aliyun", "alibaba", "bailian", "alibaba-model-studio": return alibaba.HandleSpeech(c, utils.ResultToOption(options)) + case "minimax", "minimax-tts": + return minimax.HandleSpeech(c, utils.ResultToOption(options)) default: return mo.Err[any](apierrors.NewErrBadRequest().WithDetail("unsupported backend")) } @@ -63,6 +66,8 @@ func Voices(c echo.Context) mo.Result[any] { return volcengine.HandleVoices(c, utils.ResultToOption(options)) case "ali", "aliyun", "alibaba", "bailian", "alibaba-model-studio": return alibaba.HandleVoices(c, utils.ResultToOption(options)) + case "minimax", "minimax-tts": + return minimax.HandleVoices(c, utils.ResultToOption(options)) default: return mo.Err[any](apierrors.NewErrBadRequest().WithDetail("unsupported backend")) } diff --git a/pkg/backend/minimax/speech.go b/pkg/backend/minimax/speech.go new file mode 100644 index 0000000..d3ce5ad --- /dev/null +++ b/pkg/backend/minimax/speech.go @@ -0,0 +1,377 @@ +package minimax + +import ( + "bytes" + "encoding/hex" + "encoding/json" + "io" + "log/slog" + "net/http" + "strings" + + "github.com/labstack/echo/v4" + "github.com/moeru-ai/unspeech/pkg/apierrors" + "github.com/moeru-ai/unspeech/pkg/backend/types" + "github.com/moeru-ai/unspeech/pkg/utils" + "github.com/samber/mo" +) + +// VoiceSetting MiniMax voice settings +type VoiceSetting struct { + VoiceID string `json:"voice_id"` + Speed float64 `json:"speed,omitempty"` + Vol float64 `json:"vol,omitempty"` + Pitch float64 `json:"pitch,omitempty"` + Emotion string `json:"emotion,omitempty"` + TextNormalization string `json:"text_normalization,omitempty"` + LatexRead string `json:"latex_read,omitempty"` +} + +// AudioSetting MiniMax audio settings +type AudioSetting struct { + SampleRate int `json:"sample_rate,omitempty"` + Bitrate int `json:"bitrate,omitempty"` + Format string `json:"format,omitempty"` + Channel int `json:"channel,omitempty"` +} + +// TTSRequest MiniMax TTS request +type TTSRequest struct { + Model string `json:"model"` + Text string `json:"text"` + Stream bool `json:"stream,omitempty"` + VoiceSetting *VoiceSetting `json:"voice_setting,omitempty"` + AudioSetting *AudioSetting `json:"audio_setting,omitempty"` + OutputFormat string `json:"output_format,omitempty"` +} + +// TTSResponseData MiniMax TTS response data +type TTSResponseData struct { + Audio string `json:"audio"` + SubtitleFile string `json:"subtitle_file,omitempty"` + Status int `json:"status"` +} + +// TTSResponseExtraInfo MiniMax TTS response extra info +type TTSResponseExtraInfo struct { + AudioLength int `json:"audio_length"` + AudioSampleRate int `json:"audio_sample_rate"` + AudioSize int `json:"audio_size"` + Bitrate int `json:"bitrate"` + AudioFormat string `json:"audio_format"` + AudioChannel int `json:"audio_channel"` + WordCount int `json:"word_count"` + UsageCharacters int `json:"usage_characters"` +} + +// TTSResponseBaseResp MiniMax TTS response base info +type TTSResponseBaseResp struct { + StatusCode int `json:"status_code"` + StatusMsg string `json:"status_msg"` +} + +// TTSResponse MiniMax TTS response +type TTSResponse struct { + Data TTSResponseData `json:"data"` + TraceID string `json:"trace_id"` + ExtraInfo TTSResponseExtraInfo `json:"extra_info"` + BaseResp TTSResponseBaseResp `json:"base_resp"` +} + +// HandleSpeech handles MiniMax TTS requests +func HandleSpeech(c echo.Context, options mo.Option[types.SpeechRequestOptions]) mo.Result[any] { + opts := options.MustGet() + + // Get token + token := strings.TrimPrefix(c.Request().Header.Get("Authorization"), "Bearer ") + + // Get stream parameter from ExtraBody + stream := utils.GetByJSONPath[bool](opts.ExtraBody, "{ .stream }") + + // If streaming, use streaming handler + if stream { + return handleStreamingSpeech(c, token, opts) + } + + // Build MiniMax request + reqBody := TTSRequest{ + Model: opts.Model, + Text: opts.Input, + Stream: false, + OutputFormat: "hex", + } + + // Set voice_id from user input + if opts.Voice != "" { + reqBody.VoiceSetting = &VoiceSetting{ + VoiceID: opts.Voice, + } + } + + // Build voice settings from ExtraBody + buildVoiceSettings(opts.ExtraBody, &reqBody) + + // Build audio settings from ExtraBody + buildAudioSettings(opts.ExtraBody, &reqBody) + + // Serialize request body + jsonBytes, err := json.Marshal(reqBody) + if err != nil { + return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithCaller()) + } + + // Create request + req, err := http.NewRequestWithContext( + c.Request().Context(), + http.MethodPost, + "https://api.minimaxi.com/v1/t2a_v2", + bytes.NewBuffer(jsonBytes), + ) + if err != nil { + return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithCaller()) + } + + // Set request headers + req.Header.Set("Authorization", "Bearer "+token) + req.Header.Set("Content-Type", "application/json") + + // Send request + resp, err := http.DefaultClient.Do(req) + if err != nil { + return mo.Err[any](apierrors.NewErrBadGateway().WithDetail(err.Error()).WithError(err).WithCaller()) + } + + defer func() { _ = resp.Body.Close() }() + + // Check HTTP status code + if resp.StatusCode >= 400 && resp.StatusCode < 600 { + return handleHTTPError(resp) + } + + // Parse response + var ttsResp TTSResponse + err = json.NewDecoder(resp.Body).Decode(&ttsResp) + if err != nil { + return mo.Err[any](apierrors.NewErrBadGateway().WithDetail(err.Error()).WithError(err).WithCaller()) + } + + // Check business status code + if ttsResp.BaseResp.StatusCode != 0 { + return mo.Err[any](handleMinimaxError(ttsResp.BaseResp.StatusCode, ttsResp.BaseResp.StatusMsg)) + } + + // Decode hex audio + audioBytes, err := hex.DecodeString(ttsResp.Data.Audio) + if err != nil { + return mo.Err[any](apierrors.NewErrInternal().WithDetail("failed to decode hex audio: "+err.Error()).WithError(err).WithCaller()) + } + + // Determine content type + contentType := getContentType(reqBody.AudioSetting) + + return mo.Ok[any](c.Blob(http.StatusOK, contentType, audioBytes)) +} + +// buildVoiceSettings builds VoiceSetting from ExtraBody +func buildVoiceSettings(extraBody map[string]any, reqBody *TTSRequest) { + if speed := utils.GetByJSONPath[*float64](extraBody, "{ .speed }"); speed != nil { + if reqBody.VoiceSetting == nil { + reqBody.VoiceSetting = &VoiceSetting{} + } + reqBody.VoiceSetting.Speed = *speed + } + + if vol := utils.GetByJSONPath[*float64](extraBody, "{ .vol }"); vol != nil { + if reqBody.VoiceSetting == nil { + reqBody.VoiceSetting = &VoiceSetting{} + } + reqBody.VoiceSetting.Vol = *vol + } + + if pitch := utils.GetByJSONPath[*float64](extraBody, "{ .pitch }"); pitch != nil { + if reqBody.VoiceSetting == nil { + reqBody.VoiceSetting = &VoiceSetting{} + } + reqBody.VoiceSetting.Pitch = *pitch + } + + if emotion := utils.GetByJSONPath[*string](extraBody, "{ .emotion }"); emotion != nil { + if reqBody.VoiceSetting == nil { + reqBody.VoiceSetting = &VoiceSetting{} + } + reqBody.VoiceSetting.Emotion = *emotion + } +} + +// buildAudioSettings builds AudioSetting from ExtraBody +func buildAudioSettings(extraBody map[string]any, reqBody *TTSRequest) { + if sampleRate := utils.GetByJSONPath[*int](extraBody, "{ .sample_rate }"); sampleRate != nil { + reqBody.AudioSetting = &AudioSetting{} + reqBody.AudioSetting.SampleRate = *sampleRate + } + + if bitrate := utils.GetByJSONPath[*int](extraBody, "{ .bitrate }"); bitrate != nil { + if reqBody.AudioSetting == nil { + reqBody.AudioSetting = &AudioSetting{} + } + reqBody.AudioSetting.Bitrate = *bitrate + } + + if format := utils.GetByJSONPath[*string](extraBody, "{ .format }"); format != nil { + if reqBody.AudioSetting == nil { + reqBody.AudioSetting = &AudioSetting{} + } + reqBody.AudioSetting.Format = *format + } +} + +// getContentType returns MIME type based on audio format +func getContentType(audioSetting *AudioSetting) string { + if audioSetting == nil || audioSetting.Format == "" { + return "audio/mp3" + } + + contentTypes := map[string]string{ + "pcm": "audio/pcm", + "wav": "audio/wav", + "flac": "audio/flac", + "mp3": "audio/mp3", + } + + if ct, ok := contentTypes[audioSetting.Format]; ok { + return ct + } + return "audio/mp3" +} + +// handleHTTPError handles HTTP errors from upstream +func handleHTTPError(resp *http.Response) mo.Result[any] { + switch { + case strings.HasPrefix(resp.Header.Get("Content-Type"), "application/json"): + return mo.Err[any](apierrors. + NewUpstreamError(resp.StatusCode). + WithDetail(utils.NewJSONResponseError(resp.StatusCode, resp.Body).OrEmpty().Error())) + case strings.HasPrefix(resp.Header.Get("Content-Type"), "text/"): + return mo.Err[any](apierrors. + NewUpstreamError(resp.StatusCode). + WithDetail(utils.NewTextResponseError(resp.StatusCode, resp.Body).OrEmpty().Error())) + default: + slog.Warn("unknown upstream error with unknown Content-Type", + slog.Int("status", resp.StatusCode), + slog.String("content_type", resp.Header.Get("Content-Type")), + ) + } + return mo.Err[any](apierrors.NewUpstreamError(resp.StatusCode)) +} + +// handleMinimaxError handles MiniMax error codes +func handleMinimaxError(code int, msg string) *apierrors.Error { + var httpStatus int + switch code { + case 1004: // Auth failed + httpStatus = http.StatusUnauthorized + case 1002, 1039: // Rate limit + httpStatus = http.StatusTooManyRequests + case 1042, 2013: // Invalid parameter + httpStatus = http.StatusBadRequest + case 1001: // Timeout + httpStatus = http.StatusGatewayTimeout + default: + httpStatus = http.StatusBadGateway + } + return apierrors.NewUpstreamError(httpStatus).WithDetailf("minimax error: %d - %s", code, msg) +} + +// handleStreamingSpeech handles streaming TTS requests +func handleStreamingSpeech(c echo.Context, token string, opts types.SpeechRequestOptions) mo.Result[any] { + // Build MiniMax request + reqBody := TTSRequest{ + Model: opts.Model, + Text: opts.Input, + Stream: true, + OutputFormat: "hex", + } + + // Set voice_id + if opts.Voice != "" { + reqBody.VoiceSetting = &VoiceSetting{ + VoiceID: opts.Voice, + } + } + + // Build settings from ExtraBody + buildVoiceSettings(opts.ExtraBody, &reqBody) + buildAudioSettings(opts.ExtraBody, &reqBody) + + // Serialize request body + jsonBytes, err := json.Marshal(reqBody) + if err != nil { + return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithCaller()) + } + + // Create request + req, err := http.NewRequestWithContext( + c.Request().Context(), + http.MethodPost, + "https://api.minimaxi.com/v1/t2a_v2", + bytes.NewBuffer(jsonBytes), + ) + if err != nil { + return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithCaller()) + } + + // Set request headers + req.Header.Set("Authorization", "Bearer "+token) + req.Header.Set("Content-Type", "application/json") + + // Send request + resp, err := http.DefaultClient.Do(req) + if err != nil { + return mo.Err[any](apierrors.NewErrBadGateway().WithDetail(err.Error()).WithError(err).WithCaller()) + } + + defer func() { _ = resp.Body.Close() }() + + // Check HTTP status code + if resp.StatusCode >= 400 && resp.StatusCode < 600 { + return handleHTTPError(resp) + } + + // Streaming: read response until status == 2 + decoder := json.NewDecoder(resp.Body) + audioHex := new(strings.Builder) + + for { + var ttsResp TTSResponse + if err := decoder.Decode(&ttsResp); err != nil { + if err == io.EOF { + break + } + return mo.Err[any](apierrors.NewErrBadGateway().WithDetail(err.Error()).WithError(err).WithCaller()) + } + + // Check business status code + if ttsResp.BaseResp.StatusCode != 0 { + return mo.Err[any](handleMinimaxError(ttsResp.BaseResp.StatusCode, ttsResp.BaseResp.StatusMsg)) + } + + // Append audio data + audioHex.WriteString(ttsResp.Data.Audio) + + // status == 2 means synthesis complete + if ttsResp.Data.Status == 2 { + break + } + } + + // Decode hex audio + audioBytes, err := hex.DecodeString(audioHex.String()) + if err != nil { + return mo.Err[any](apierrors.NewErrInternal().WithDetail("failed to decode hex audio: "+err.Error()).WithError(err).WithCaller()) + } + + // Determine content type + contentType := getContentType(reqBody.AudioSetting) + + return mo.Ok[any](c.Blob(http.StatusOK, contentType, audioBytes)) +} diff --git a/pkg/backend/minimax/voices.go b/pkg/backend/minimax/voices.go new file mode 100644 index 0000000..6e37afd --- /dev/null +++ b/pkg/backend/minimax/voices.go @@ -0,0 +1,191 @@ +package minimax + +import ( + "bytes" + "encoding/json" + "log/slog" + "net/http" + "strings" + + "github.com/labstack/echo/v4" + "github.com/moeru-ai/unspeech/pkg/apierrors" + "github.com/moeru-ai/unspeech/pkg/backend/types" + "github.com/moeru-ai/unspeech/pkg/utils" + "github.com/samber/mo" +) + +// GetVoiceReq Request for getting voice list +type GetVoiceReq struct { + VoiceType string `json:"voice_type"` +} + +// SystemVoice System voice +type SystemVoice struct { + VoiceID string `json:"voice_id"` + VoiceName string `json:"voice_name"` + Description []string `json:"description"` +} + +// VoiceCloning Voice cloning +type VoiceCloning struct { + VoiceID string `json:"voice_id"` + Description []string `json:"description"` + CreatedTime string `json:"created_time"` +} + +// VoiceGeneration Voice generation +type VoiceGeneration struct { + VoiceID string `json:"voice_id"` + Description []string `json:"description"` + CreatedTime string `json:"created_time"` +} + +// BaseResp Base response +type BaseResp struct { + StatusCode int `json:"status_code"` + StatusMsg string `json:"status_msg"` +} + +// GetVoiceResp Response for getting voice list +type GetVoiceResp struct { + SystemVoice []SystemVoice `json:"system_voice"` + VoiceCloning []VoiceCloning `json:"voice_cloning"` + VoiceGeneration []VoiceGeneration `json:"voice_generation"` + BaseResp BaseResp `json:"base_resp"` +} + +var ( + // Supported audio formats + formats = []types.VoiceFormat{ + {Name: "MP3", Extension: ".mp3", MimeType: "audio/mpeg"}, + {Name: "PCM", Extension: ".pcm", MimeType: "audio/pcm"}, + {Name: "FLAC", Extension: ".flac", MimeType: "audio/flac"}, + {Name: "WAV", Extension: ".wav", MimeType: "audio/wav"}, + } +) + +// HandleVoices handles getting voice list requests +func HandleVoices(c echo.Context, options mo.Option[types.VoicesRequestOptions]) mo.Result[any] { + // Get token + token := strings.TrimPrefix(c.Request().Header.Get("Authorization"), "Bearer ") + + // Build request + reqBody := GetVoiceReq{ + VoiceType: "all", + } + + jsonBytes, err := json.Marshal(reqBody) + if err != nil { + return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithCaller()) + } + + // Create request + req, err := http.NewRequestWithContext( + c.Request().Context(), + http.MethodPost, + "https://api.minimaxi.com/v1/get_voice", + bytes.NewReader(jsonBytes), + ) + if err != nil { + return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithCaller()) + } + + // Set request headers + req.Header.Set("Authorization", "Bearer "+token) + req.Header.Set("Content-Type", "application/json") + + // Send request + resp, err := http.DefaultClient.Do(req) + if err != nil { + return mo.Err[any](apierrors.NewErrBadGateway().WithDetail(err.Error()).WithError(err).WithCaller()) + } + + defer func() { _ = resp.Body.Close() }() + + // Check HTTP status code + if resp.StatusCode >= 400 && resp.StatusCode < 600 { + switch { + case strings.HasPrefix(resp.Header.Get("Content-Type"), "application/json"): + return mo.Err[any](apierrors. + NewUpstreamError(resp.StatusCode). + WithDetail(utils.NewJSONResponseError(resp.StatusCode, resp.Body).OrEmpty().Error())) + default: + slog.Warn("unknown upstream error with unknown Content-Type", + slog.Int("status", resp.StatusCode), + slog.String("content_type", resp.Header.Get("Content-Type")), + ) + } + } + + // Parse response + var voiceResp GetVoiceResp + err = json.NewDecoder(resp.Body).Decode(&voiceResp) + if err != nil { + return mo.Err[any](apierrors.NewErrBadGateway().WithDetail(err.Error()).WithError(err).WithCaller()) + } + + // Check business status code + if voiceResp.BaseResp.StatusCode != 0 { + return mo.Err[any](handleMinimaxError(voiceResp.BaseResp.StatusCode, voiceResp.BaseResp.StatusMsg)) + } + + // Convert voice list + voices := make([]types.Voice, 0, len(voiceResp.SystemVoice)+len(voiceResp.VoiceCloning)+len(voiceResp.VoiceGeneration)) + + // Add system voices + for _, v := range voiceResp.SystemVoice { + voices = append(voices, types.Voice{ + ID: v.VoiceID, + Name: v.VoiceName, + Description: strings.Join(v.Description, ", "), + Labels: map[string]any{ + "type": "system", + }, + Tags: []string{"system"}, + Languages: []types.VoiceLanguage{{Title: "Auto", Code: "auto"}}, + Formats: formats, + CompatibleModels: []string{"speech-2.8-turbo", "speech-2.8-hd", "speech-2.6-turbo", "speech-2.6-hd"}, + PredefinedOptions: map[string]any{}, + }) + } + + // Add voice cloning voices + for _, v := range voiceResp.VoiceCloning { + voices = append(voices, types.Voice{ + ID: v.VoiceID, + Name: v.VoiceID, + Description: strings.Join(v.Description, ", "), + Labels: map[string]any{ + "type": "voice_cloning", + "createdTime": v.CreatedTime, + }, + Tags: []string{"voice_cloning"}, + Languages: []types.VoiceLanguage{{Title: "Auto", Code: "auto"}}, + Formats: formats, + CompatibleModels: []string{"speech-2.8-turbo", "speech-2.8-hd"}, + PredefinedOptions: map[string]any{}, + }) + } + + // Add voice generation voices + for _, v := range voiceResp.VoiceGeneration { + voices = append(voices, types.Voice{ + ID: v.VoiceID, + Name: v.VoiceID, + Description: strings.Join(v.Description, ", "), + Labels: map[string]any{ + "type": "voice_generation", + "createdTime": v.CreatedTime, + }, + Tags: []string{"voice_generation"}, + Languages: []types.VoiceLanguage{{Title: "Auto", Code: "auto"}}, + Formats: formats, + CompatibleModels: []string{"speech-2.8-turbo", "speech-2.8-hd"}, + PredefinedOptions: map[string]any{}, + }) + } + + return mo.Ok[any](types.ListVoicesResponse{ + Voices: voices, + }) +} diff --git a/pkg/backend/types/types.go b/pkg/backend/types/types.go index db2bcb3..f5e0f20 100644 --- a/pkg/backend/types/types.go +++ b/pkg/backend/types/types.go @@ -27,7 +27,7 @@ type OpenAISpeechRequestOptions struct { // The speed of the generated audio. // Select a value from 0.25 to 4.0. // 1.0 is the default. - Speed int `json:"speed,omitempty"` + Speed float64 `json:"speed,omitempty"` // Extension: allows you to add custom content to body. ExtraBody map[string]any `json:"extra_body,omitempty"` diff --git a/sdk/typescript/README.md b/sdk/typescript/README.md index 0892294..e76c998 100644 --- a/sdk/typescript/README.md +++ b/sdk/typescript/README.md @@ -51,6 +51,7 @@ import { createUnAlibabaCloud, createUnElevenLabs, createUnMicrosoft, + createUnMinimax, createUnSpeech, createUnVolcengine, } from 'unspeech' @@ -62,6 +63,7 @@ When using - [Alibaba Cloud Model Studio / 阿里云百炼 / CosyVoice](https://www.alibabacloud.com/en/product/modelstudio) - [Volcano Engine / 火山引擎语音技术](https://www.volcengine.com/product/voice-tech) - [ElevenLabs](https://elevenlabs.io/docs/api-reference/text-to-speech/convert) +- [MiniMax](https://platform.minimaxi.com/docs/guides/speech-t2a-http) providers, [SSML](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup) is supported to control in fine grain level for pitch, volume, rate, etc. diff --git a/sdk/typescript/src/backend/index.ts b/sdk/typescript/src/backend/index.ts index 1ad2965..58b3abe 100644 --- a/sdk/typescript/src/backend/index.ts +++ b/sdk/typescript/src/backend/index.ts @@ -7,6 +7,7 @@ export * from './alibabacloud' export * from './deepgram' export * from './elevenlabs' export * from './microsoft' +export * from './minimax' export * from './volcengine' /** @see {@link https://github.com/moeru-ai/unspeech} */ @@ -26,7 +27,7 @@ export function createUnSpeech(apiKey: string, baseURL = 'http://localhost:5933/ | 'ali' | 'alibaba' | 'alibaba-model-studio' - | 'aliyun' | 'bailian' | 'deepgram' | 'elevenlabs' | 'koemotion' | 'openai' + | 'aliyun' | 'bailian' | 'deepgram' | 'elevenlabs' | 'koemotion' | 'minimax' | 'openai' } > = { voice: (options) => { @@ -60,6 +61,7 @@ export function createUnSpeech(apiKey: string, baseURL = 'http://localhost:5933/ | `deepgram/${string}` | `elevenlabs/${string}` | `koemotion/${string}` + | `minimax/${string}` | `openai/${string}` | `volcano/${string}` | `volcengine/${string}`, diff --git a/sdk/typescript/src/backend/minimax.ts b/sdk/typescript/src/backend/minimax.ts new file mode 100644 index 0000000..482553e --- /dev/null +++ b/sdk/typescript/src/backend/minimax.ts @@ -0,0 +1,146 @@ +import type { SpeechProviderWithExtraOptions } from '@xsai-ext/providers/utils' + +import type { UnSpeechOptions, VoiceProviderWithExtraOptions } from '../types' + +import { merge } from '@xsai-ext/providers/utils' +import { objCamelToSnake } from '@xsai/shared' + +/** + * MiniMax TTS API options + * @see https://platform.minimaxi.com/docs/guides/speech-t2a-http + */ +export interface UnMinimaxOptions { + /** + * Speech speed. Range: 0.5-2.0 + * @default 1.0 + */ + speed?: number + /** + * Volume. Range: 0-10 + * @default 1.0 + */ + vol?: number + /** + * Pitch adjustment. Range: -12 to 12 + * @default 0 + */ + pitch?: number + /** + * Emotion setting + * @see https://platform.minimaxi.com/docs/guides/speech-t2a-http#emotion + * @example "happy" | "sad" | "angry" | "fearful" | "disgusted" | "surprised" | "calm" | "fluent" | "whisper" + */ + emotion?: string + /** + * Enable streaming output + * @default false + */ + stream?: boolean + /** + * Sample rate for audio output + * @example 8000 | 16000 | 22050 | 24000 | 32000 | 44100 + */ + sampleRate?: number + /** + * Audio bitrate + * @example 32000 | 64000 | 128000 | 256000 + */ + bitrate?: number + /** + * Audio format + * @example "mp3" | "pcm" | "flac" | "wav" + * @default "mp3" + */ + format?: 'mp3' | 'pcm' | 'flac' | 'wav' + /** + * Audio channel + * @example 1 | 2 + * @default 1 + */ + channel?: number +} + +/** + * MiniMax TTS models + * @see https://platform.minimaxi.com/docs/guides/speech-t2a-http#model + */ +export type MinimaxModel = + | 'speech-2.8-hd' + | 'speech-2.8-turbo' + | 'speech-2.6-hd' + | 'speech-2.6-turbo' + | 'speech-02-hd' + | 'speech-02-turbo' + | 'speech-01-hd' + | 'speech-01-turbo' + +/** + * [MiniMax](https://platform.minimaxi.com/) provider for [UnSpeech](https://github.com/moeru-ai/unspeech) + * + * @param apiKey - MiniMax API Key + * @param baseURL - UnSpeech Instance URL + * @returns SpeechProviderWithExtraOptions & VoiceProviderWithExtraOptions + */ +export function createUnMinimax(apiKey: string, baseURL = 'http://localhost:5933/v1/') { + const toUnSpeechOptions = ({ + speed, + vol, + pitch, + emotion, + stream, + sampleRate, + bitrate, + format, + channel, + }: UnMinimaxOptions): UnSpeechOptions => ({ + extraBody: objCamelToSnake({ + speed, + vol, + pitch, + emotion, + stream, + sampleRate, + bitrate, + format, + channel, + }), + }) + + const speechProvider: SpeechProviderWithExtraOptions< + `minimax/${MinimaxModel}`, + UnMinimaxOptions + > = { + speech: (model, options) => ({ + ...(options ? toUnSpeechOptions(options) : {}), + apiKey, + baseURL, + model: `minimax/${model}`, + }), + } + + const voiceProvider: VoiceProviderWithExtraOptions< + UnMinimaxOptions + > = { + voice: (options) => { + let adjustedBaseURL = baseURL + if (adjustedBaseURL.endsWith('v1/')) { + adjustedBaseURL = adjustedBaseURL.slice(0, -3) + } + else if (adjustedBaseURL.endsWith('v1')) { + adjustedBaseURL = adjustedBaseURL.slice(0, -2) + } + + return { + query: 'provider=minimax', + ...(options ? toUnSpeechOptions(options) : {}), + apiKey, + baseURL: adjustedBaseURL, + } + }, + } + + return merge( + speechProvider, + voiceProvider, + ) +} diff --git a/sdk/typescript/src/index.ts b/sdk/typescript/src/index.ts index 0a43b7b..3e41b5f 100644 --- a/sdk/typescript/src/index.ts +++ b/sdk/typescript/src/index.ts @@ -4,6 +4,7 @@ export { createUnDeepgram, createUnElevenLabs, createUnMicrosoft, + createUnMinimax, createUnSpeech, createUnVolcengine, } from './backend'