From 080920ee66b118bb98d24cfde99a754baeadf9cf Mon Sep 17 00:00:00 2001 From: jhandsome Date: Wed, 11 Mar 2026 03:10:45 +0800 Subject: [PATCH 1/9] feat(minimax): add minimax package with type definitions --- pkg/backend/minimax/speech.go | 64 +++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 pkg/backend/minimax/speech.go diff --git a/pkg/backend/minimax/speech.go b/pkg/backend/minimax/speech.go new file mode 100644 index 0000000..566182f --- /dev/null +++ b/pkg/backend/minimax/speech.go @@ -0,0 +1,64 @@ +// Package minimax provides MiniMax TTS integration +package minimax + +// VoiceSetting MiniMax 音色设置 +type VoiceSetting struct { + VoiceID string `json:"voice_id"` + Speed float64 `json:"speed,omitempty"` + Vol float64 `json:"vol,omitempty"` + Pitch float64 `json:"pitch,omitempty"` + Emotion string `json:"emotion,omitempty"` + TextNormalization string `json:"text_normalization,omitempty"` + LatexRead string `json:"latex_read,omitempty"` +} + +// AudioSetting MiniMax 音频设置 +type AudioSetting struct { + SampleRate int `json:"sample_rate,omitempty"` + Bitrate int `json:"bitrate,omitempty"` + Format string `json:"format,omitempty"` + Channel int `json:"channel,omitempty"` +} + +// TTSRequest MiniMax TTS 请求 +type TTSRequest struct { + Model string `json:"model"` + Text string `json:"text"` + Stream bool `json:"stream,omitempty"` + VoiceSetting *VoiceSetting `json:"voice_setting,omitempty"` + AudioSetting *AudioSetting `json:"audio_setting,omitempty"` + OutputFormat string `json:"output_format,omitempty"` +} + +// TTSResponseData MiniMax TTS 响应数据 +type TTSResponseData struct { + Audio string `json:"audio"` + SubtitleFile string `json:"subtitle_file,omitempty"` + Status int `json:"status"` +} + +// TTSResponseExtraInfo MiniMax TTS 响应额外信息 +type TTSResponseExtraInfo struct { + AudioLength int `json:"audio_length"` + AudioSampleRate int `json:"audio_sample_rate"` + AudioSize int `json:"audio_size"` + Bitrate int `json:"bitrate"` + AudioFormat string `json:"audio_format"` + AudioChannel int `json:"audio_channel"` + WordCount int `json:"word_count"` + UsageCharacters int `json:"usage_characters"` +} + +// TTSResponseBaseResp MiniMax TTS 响应基础信息 +type TTSResponseBaseResp struct { + StatusCode int `json:"status_code"` + StatusMsg string `json:"status_msg"` +} + +// TTSResponse MiniMax TTS 响应 +type TTSResponse struct { + Data TTSResponseData `json:"data"` + TraceID string `json:"trace_id"` + ExtraInfo TTSResponseExtraInfo `json:"extra_info"` + BaseResp TTSResponseBaseResp `json:"base_resp"` +} From 51e53eaf34237695ed1942253d67b13e01d6d46f Mon Sep 17 00:00:00 2001 From: jhandsome Date: Wed, 11 Mar 2026 03:13:00 +0800 Subject: [PATCH 2/9] feat(minimax): implement HandleSpeech with non-streaming and streaming TTS --- pkg/backend/minimax/speech.go | 347 +++++++++++++++++++++++++++++++++- 1 file changed, 346 insertions(+), 1 deletion(-) diff --git a/pkg/backend/minimax/speech.go b/pkg/backend/minimax/speech.go index 566182f..d328549 100644 --- a/pkg/backend/minimax/speech.go +++ b/pkg/backend/minimax/speech.go @@ -1,6 +1,21 @@ -// Package minimax provides MiniMax TTS integration package minimax +import ( + "bytes" + "encoding/hex" + "encoding/json" + "log/slog" + "net/http" + "strings" + + "github.com/labstack/echo/v4" + "github.com/moeru-ai/unspeech/pkg/apierrors" + "github.com/moeru-ai/unspeech/pkg/backend/types" + "github.com/moeru-ai/unspeech/pkg/utils" + "github.com/samber/lo" + "github.com/samber/mo" +) + // VoiceSetting MiniMax 音色设置 type VoiceSetting struct { VoiceID string `json:"voice_id"` @@ -62,3 +77,333 @@ type TTSResponse struct { ExtraInfo TTSResponseExtraInfo `json:"extra_info"` BaseResp TTSResponseBaseResp `json:"base_resp"` } + +// HandleSpeech 处理 MiniMax TTS 请求 +func HandleSpeech(c echo.Context, options mo.Option[types.SpeechRequestOptions]) mo.Result[any] { + opts := options.MustGet() + + // 获取 token + token := strings.TrimPrefix(c.Request().Header.Get("Authorization"), "Bearer ") + + // 从 ExtraBody 获取 stream 参数 + stream := utils.GetByJSONPath[bool](opts.ExtraBody, "{ .stream }") + + // 如果是流式请求,使用流式处理 + if stream { + return handleStreamingSpeech(c, token, opts) + } + + // 构建 MiniMax 请求 + reqBody := TTSRequest{ + Model: opts.Model, + Text: opts.Input, + Stream: false, + OutputFormat: "hex", + } + + // 设置 voice_id(从用户传入的 voice 字段) + voiceID := opts.Voice + if voiceID != "" { + reqBody.VoiceSetting = &VoiceSetting{ + VoiceID: voiceID, + } + } + + // 从 ExtraBody 获取其他参数 + if speed := utils.GetByJSONPath[*float64](opts.ExtraBody, "{ .speed }"); speed != nil { + if reqBody.VoiceSetting == nil { + reqBody.VoiceSetting = &VoiceSetting{} + } + reqBody.VoiceSetting.Speed = *speed + } + + if vol := utils.GetByJSONPath[*float64](opts.ExtraBody, "{ .vol }"); vol != nil { + if reqBody.VoiceSetting == nil { + reqBody.VoiceSetting = &VoiceSetting{} + } + reqBody.VoiceSetting.Vol = *vol + } + + if pitch := utils.GetByJSONPath[*float64](opts.ExtraBody, "{ .pitch }"); pitch != nil { + if reqBody.VoiceSetting == nil { + reqBody.VoiceSetting = &VoiceSetting{} + } + reqBody.VoiceSetting.Pitch = *pitch + } + + if emotion := utils.GetByJSONPath[*string](opts.ExtraBody, "{ .emotion }"); emotion != nil { + if reqBody.VoiceSetting == nil { + reqBody.VoiceSetting = &VoiceSetting{} + } + reqBody.VoiceSetting.Emotion = *emotion + } + + // 音频设置 + if sampleRate := utils.GetByJSONPath[*int](opts.ExtraBody, "{ .sample_rate }"); sampleRate != nil { + reqBody.AudioSetting = &AudioSetting{} + reqBody.AudioSetting.SampleRate = *sampleRate + } + + if bitrate := utils.GetByJSONPath[*int](opts.ExtraBody, "{ .bitrate }"); bitrate != nil { + if reqBody.AudioSetting == nil { + reqBody.AudioSetting = &AudioSetting{} + } + reqBody.AudioSetting.Bitrate = *bitrate + } + + if format := utils.GetByJSONPath[*string](opts.ExtraBody, "{ .format }"); format != nil { + if reqBody.AudioSetting == nil { + reqBody.AudioSetting = &AudioSetting{} + } + reqBody.AudioSetting.Format = *format + } + + // 序列化请求体 + jsonBytes, err := json.Marshal(reqBody) + if err != nil { + return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithCaller()) + } + + // 创建请求 + req, err := http.NewRequestWithContext( + c.Request().Context(), + http.MethodPost, + "https://api.minimaxi.com/v1/t2a_v2", + bytes.NewBuffer(jsonBytes), + ) + if err != nil { + return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithCaller()) + } + + // 设置请求头 + req.Header.Set("Authorization", "Bearer "+token) + req.Header.Set("Content-Type", "application/json") + + // 发送请求 + resp, err := http.DefaultClient.Do(req) + if err != nil { + return mo.Err[any](apierrors.NewErrBadGateway().WithDetail(err.Error()).WithError(err).WithCaller()) + } + + defer func() { _ = resp.Body.Close() }() + + // 检查 HTTP 状态码 + if resp.StatusCode >= 400 && resp.StatusCode < 600 { + switch { + case strings.HasPrefix(resp.Header.Get("Content-Type"), "application/json"): + return mo.Err[any](apierrors. + NewUpstreamError(resp.StatusCode). + WithDetail(utils.NewJSONResponseError(resp.StatusCode, resp.Body).OrEmpty().Error())) + case strings.HasPrefix(resp.Header.Get("Content-Type"), "text/"): + return mo.Err[any](apierrors. + NewUpstreamError(resp.StatusCode). + WithDetail(utils.NewTextResponseError(resp.StatusCode, resp.Body).OrEmpty().Error())) + default: + slog.Warn("unknown upstream error with unknown Content-Type", + slog.Int("status", resp.StatusCode), + slog.String("content_type", resp.Header.Get("Content-Type")), + ) + } + } + + // 解析响应 + var ttsResp TTSResponse + err = json.NewDecoder(resp.Body).Decode(&ttsResp) + if err != nil { + return mo.Err[any](apierrors.NewErrBadGateway().WithDetail(err.Error()).WithError(err).WithCaller()) + } + + // 检查业务状态码 + if ttsResp.BaseResp.StatusCode != 0 { + return mo.Err[any](handleMinimaxError(ttsResp.BaseResp.StatusCode, ttsResp.BaseResp.StatusMsg)) + } + + // 解码 hex 音频 + audioBytes, err := hex.DecodeString(ttsResp.Data.Audio) + if err != nil { + return mo.Err[any](apierrors.NewErrInternal().WithDetail("failed to decode hex audio: "+err.Error()).WithError(err).WithCaller()) + } + + // 确定 Content-Type + contentType := "audio/mp3" + if reqBody.AudioSetting != nil && reqBody.AudioSetting.Format != "" { + contentType = lo.Ternary(reqBody.AudioSetting.Format == "pcm", "audio/pcm", + lo.Ternary(reqBody.AudioSetting.Format == "wav", "audio/wav", + lo.Ternary(reqBody.AudioSetting.Format == "flac", "audio/flac", "audio/mp3"))) + } + + return mo.Ok[any](c.Blob(http.StatusOK, contentType, audioBytes)) +} + +// handleMinimaxError 处理 MiniMax 错误码 +func handleMinimaxError(code int, msg string) *apierrors.Error { + var httpStatus int + switch code { + case 1004: // 鉴权失败 + httpStatus = http.StatusUnauthorized + case 1002, 1039: // 限流 + httpStatus = http.StatusTooManyRequests + case 1042, 2013: // 参数错误 + httpStatus = http.StatusBadRequest + case 1001: // 超时 + httpStatus = http.StatusGatewayTimeout + default: + httpStatus = http.StatusBadGateway + } + return apierrors.NewUpstreamError(httpStatus).WithDetailf("minimax error: %d - %s", code, msg) +} + +// handleStreamingSpeech 处理流式 TTS 请求 +func handleStreamingSpeech(c echo.Context, token string, opts types.SpeechRequestOptions) mo.Result[any] { + // 构建 MiniMax 请求 + reqBody := TTSRequest{ + Model: opts.Model, + Text: opts.Input, + Stream: true, + OutputFormat: "hex", + } + + // 设置 voice_id + voiceID := opts.Voice + if voiceID != "" { + reqBody.VoiceSetting = &VoiceSetting{ + VoiceID: voiceID, + } + } + + // 从 ExtraBody 获取其他参数 + if speed := utils.GetByJSONPath[*float64](opts.ExtraBody, "{ .speed }"); speed != nil { + if reqBody.VoiceSetting == nil { + reqBody.VoiceSetting = &VoiceSetting{} + } + reqBody.VoiceSetting.Speed = *speed + } + + if vol := utils.GetByJSONPath[*float64](opts.ExtraBody, "{ .vol }"); vol != nil { + if reqBody.VoiceSetting == nil { + reqBody.VoiceSetting = &VoiceSetting{} + } + reqBody.VoiceSetting.Vol = *vol + } + + if pitch := utils.GetByJSONPath[*float64](opts.ExtraBody, "{ .pitch }"); pitch != nil { + if reqBody.VoiceSetting == nil { + reqBody.VoiceSetting = &VoiceSetting{} + } + reqBody.VoiceSetting.Pitch = *pitch + } + + if emotion := utils.GetByJSONPath[*string](opts.ExtraBody, "{ .emotion }"); emotion != nil { + if reqBody.VoiceSetting == nil { + reqBody.VoiceSetting = &VoiceSetting{} + } + reqBody.VoiceSetting.Emotion = *emotion + } + + // 音频设置 + if sampleRate := utils.GetByJSONPath[*int](opts.ExtraBody, "{ .sample_rate }"); sampleRate != nil { + reqBody.AudioSetting = &AudioSetting{} + reqBody.AudioSetting.SampleRate = *sampleRate + } + + if bitrate := utils.GetByJSONPath[*int](opts.ExtraBody, "{ .bitrate }"); bitrate != nil { + if reqBody.AudioSetting == nil { + reqBody.AudioSetting = &AudioSetting{} + } + reqBody.AudioSetting.Bitrate = *bitrate + } + + if format := utils.GetByJSONPath[*string](opts.ExtraBody, "{ .format }"); format != nil { + if reqBody.AudioSetting == nil { + reqBody.AudioSetting = &AudioSetting{} + } + reqBody.AudioSetting.Format = *format + } + + // 序列化请求体 + jsonBytes, err := json.Marshal(reqBody) + if err != nil { + return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithCaller()) + } + + // 创建请求 + req, err := http.NewRequestWithContext( + c.Request().Context(), + http.MethodPost, + "https://api.minimaxi.com/v1/t2a_v2", + bytes.NewBuffer(jsonBytes), + ) + if err != nil { + return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithCaller()) + } + + // 设置请求头 + req.Header.Set("Authorization", "Bearer "+token) + req.Header.Set("Content-Type", "application/json") + + // 发送请求 + resp, err := http.DefaultClient.Do(req) + if err != nil { + return mo.Err[any](apierrors.NewErrBadGateway().WithDetail(err.Error()).WithError(err).WithCaller()) + } + + defer func() { _ = resp.Body.Close() }() + + // 检查 HTTP 状态码 + if resp.StatusCode >= 400 && resp.StatusCode < 600 { + switch { + case strings.HasPrefix(resp.Header.Get("Content-Type"), "application/json"): + return mo.Err[any](apierrors. + NewUpstreamError(resp.StatusCode). + WithDetail(utils.NewJSONResponseError(resp.StatusCode, resp.Body).OrEmpty().Error())) + default: + slog.Warn("unknown upstream error with unknown Content-Type", + slog.Int("status", resp.StatusCode), + slog.String("content_type", resp.Header.Get("Content-Type")), + ) + } + } + + // 流式处理:持续读取响应直到 status == 2 + decoder := json.NewDecoder(resp.Body) + audioHex := new(strings.Builder) + + for { + var ttsResp TTSResponse + if err := decoder.Decode(&ttsResp); err != nil { + if err.Error() == "EOF" { + break + } + return mo.Err[any](apierrors.NewErrBadGateway().WithDetail(err.Error()).WithError(err).WithCaller()) + } + + // 检查业务状态码 + if ttsResp.BaseResp.StatusCode != 0 { + return mo.Err[any](handleMinimaxError(ttsResp.BaseResp.StatusCode, ttsResp.BaseResp.StatusMsg)) + } + + // 追加音频数据 + audioHex.WriteString(ttsResp.Data.Audio) + + // status == 2 表示合成结束 + if ttsResp.Data.Status == 2 { + break + } + } + + // 解码 hex 音频 + audioBytes, err := hex.DecodeString(audioHex.String()) + if err != nil { + return mo.Err[any](apierrors.NewErrInternal().WithDetail("failed to decode hex audio: "+err.Error()).WithError(err).WithCaller()) + } + + // 确定 Content-Type + contentType := "audio/mp3" + if reqBody.AudioSetting != nil && reqBody.AudioSetting.Format != "" { + contentType = lo.Ternary(reqBody.AudioSetting.Format == "pcm", "audio/pcm", + lo.Ternary(reqBody.AudioSetting.Format == "wav", "audio/wav", + lo.Ternary(reqBody.AudioSetting.Format == "flac", "audio/flac", "audio/mp3"))) + } + + return mo.Ok[any](c.Blob(http.StatusOK, contentType, audioBytes)) +} From b24e974ef360cb890649ff783c61b11e30762dbc Mon Sep 17 00:00:00 2001 From: jhandsome Date: Wed, 11 Mar 2026 03:14:11 +0800 Subject: [PATCH 3/9] feat(minimax): implement HandleVoices for voice list --- pkg/backend/minimax/voices.go | 190 ++++++++++++++++++++++++++++++++++ 1 file changed, 190 insertions(+) create mode 100644 pkg/backend/minimax/voices.go diff --git a/pkg/backend/minimax/voices.go b/pkg/backend/minimax/voices.go new file mode 100644 index 0000000..9603c72 --- /dev/null +++ b/pkg/backend/minimax/voices.go @@ -0,0 +1,190 @@ +package minimax + +import ( + "encoding/json" + "log/slog" + "net/http" + "strings" + + "github.com/labstack/echo/v4" + "github.com/moeru-ai/unspeech/pkg/apierrors" + "github.com/moeru-ai/unspeech/pkg/backend/types" + "github.com/moeru-ai/unspeech/pkg/utils" + "github.com/samber/mo" +) + +// GetVoiceReq 获取音色列表请求 +type GetVoiceReq struct { + VoiceType string `json:"voice_type"` +} + +// SystemVoice 系统音色 +type SystemVoice struct { + VoiceID string `json:"voice_id"` + VoiceName string `json:"voice_name"` + Description []string `json:"description"` +} + +// VoiceCloning 快速复刻音色 +type VoiceCloning struct { + VoiceID string `json:"voice_id"` + Description []string `json:"description"` + CreatedTime string `json:"created_time"` +} + +// VoiceGeneration 文生音色 +type VoiceGeneration struct { + VoiceID string `json:"voice_id"` + Description []string `json:"description"` + CreatedTime string `json:"created_time"` +} + +// BaseResp 基础响应 +type BaseResp struct { + StatusCode int `json:"status_code"` + StatusMsg string `json:"status_msg"` +} + +// GetVoiceResp 获取音色列表响应 +type GetVoiceResp struct { + SystemVoice []SystemVoice `json:"system_voice"` + VoiceCloning []VoiceCloning `json:"voice_cloning"` + VoiceGeneration []VoiceGeneration `json:"voice_generation"` + BaseResp BaseResp `json:"base_resp"` +} + +var ( + // 支持的音频格式 + formats = []types.VoiceFormat{ + {Name: "MP3", Extension: ".mp3", MimeType: "audio/mpeg"}, + {Name: "PCM", Extension: ".pcm", MimeType: "audio/pcm"}, + {Name: "FLAC", Extension: ".flac", MimeType: "audio/flac"}, + {Name: "WAV", Extension: ".wav", MimeType: "audio/wav"}, + } +) + +// HandleVoices 处理获取音色列表请求 +func HandleVoices(c echo.Context, options mo.Option[types.VoicesRequestOptions]) mo.Result[any] { + // 获取 token + token := strings.TrimPrefix(c.Request().Header.Get("Authorization"), "Bearer ") + + // 构建请求 + reqBody := GetVoiceReq{ + VoiceType: "all", + } + + jsonBytes, err := json.Marshal(reqBody) + if err != nil { + return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithCaller()) + } + + // 创建请求 + req, err := http.NewRequestWithContext( + c.Request().Context(), + http.MethodPost, + "https://api.minimaxi.com/v1/get_voice", + strings.NewReader(string(jsonBytes)), + ) + if err != nil { + return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithCaller()) + } + + // 设置请求头 + req.Header.Set("Authorization", "Bearer "+token) + req.Header.Set("Content-Type", "application/json") + + // 发送请求 + resp, err := http.DefaultClient.Do(req) + if err != nil { + return mo.Err[any](apierrors.NewErrBadGateway().WithDetail(err.Error()).WithError(err).WithCaller()) + } + + defer func() { _ = resp.Body.Close() }() + + // 检查 HTTP 状态码 + if resp.StatusCode >= 400 && resp.StatusCode < 600 { + switch { + case strings.HasPrefix(resp.Header.Get("Content-Type"), "application/json"): + return mo.Err[any](apierrors. + NewUpstreamError(resp.StatusCode). + WithDetail(utils.NewJSONResponseError(resp.StatusCode, resp.Body).OrEmpty().Error())) + default: + slog.Warn("unknown upstream error with unknown Content-Type", + slog.Int("status", resp.StatusCode), + slog.String("content_type", resp.Header.Get("Content-Type")), + ) + } + } + + // 解析响应 + var voiceResp GetVoiceResp + err = json.NewDecoder(resp.Body).Decode(&voiceResp) + if err != nil { + return mo.Err[any](apierrors.NewErrBadGateway().WithDetail(err.Error()).WithError(err).WithCaller()) + } + + // 检查业务状态码 + if voiceResp.BaseResp.StatusCode != 0 { + return mo.Err[any](handleMinimaxError(voiceResp.BaseResp.StatusCode, voiceResp.BaseResp.StatusMsg)) + } + + // 转换音色列表 + voices := make([]types.Voice, 0, len(voiceResp.SystemVoice)+len(voiceResp.VoiceCloning)+len(voiceResp.VoiceGeneration)) + + // 添加系统音色 + for _, v := range voiceResp.SystemVoice { + voices = append(voices, types.Voice{ + ID: v.VoiceID, + Name: v.VoiceName, + Description: strings.Join(v.Description, ", "), + Labels: map[string]any{ + "type": "system", + }, + Tags: []string{"system"}, + Languages: []types.VoiceLanguage{{Title: "Auto", Code: "auto"}}, + Formats: formats, + CompatibleModels: []string{"speech-2.8-turbo", "speech-2.8-hd", "speech-2.6-turbo", "speech-2.6-hd"}, + PredefinedOptions: map[string]any{}, + }) + } + + // 添加快速复刻音色 + for _, v := range voiceResp.VoiceCloning { + voices = append(voices, types.Voice{ + ID: v.VoiceID, + Name: v.VoiceID, + Description: strings.Join(v.Description, ", "), + Labels: map[string]any{ + "type": "voice_cloning", + "createdTime": v.CreatedTime, + }, + Tags: []string{"voice_cloning"}, + Languages: []types.VoiceLanguage{{Title: "Auto", Code: "auto"}}, + Formats: formats, + CompatibleModels: []string{"speech-2.8-turbo", "speech-2.8-hd"}, + PredefinedOptions: map[string]any{}, + }) + } + + // 添加文生音色 + for _, v := range voiceResp.VoiceGeneration { + voices = append(voices, types.Voice{ + ID: v.VoiceID, + Name: v.VoiceID, + Description: strings.Join(v.Description, ", "), + Labels: map[string]any{ + "type": "voice_generation", + "createdTime": v.CreatedTime, + }, + Tags: []string{"voice_generation"}, + Languages: []types.VoiceLanguage{{Title: "Auto", Code: "auto"}}, + Formats: formats, + CompatibleModels: []string{"speech-2.8-turbo", "speech-2.8-hd"}, + PredefinedOptions: map[string]any{}, + }) + } + + return mo.Ok[any](types.ListVoicesResponse{ + Voices: voices, + }) +} From 9bca8fcca9bd40690332c2db3c8a196270a35020 Mon Sep 17 00:00:00 2001 From: jhandsome Date: Wed, 11 Mar 2026 03:15:20 +0800 Subject: [PATCH 4/9] feat(backend): add minimax routing --- pkg/backend/backend.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pkg/backend/backend.go b/pkg/backend/backend.go index bd93b1d..355e291 100644 --- a/pkg/backend/backend.go +++ b/pkg/backend/backend.go @@ -10,6 +10,7 @@ import ( "github.com/moeru-ai/unspeech/pkg/backend/elevenlabs" "github.com/moeru-ai/unspeech/pkg/backend/koemotion" "github.com/moeru-ai/unspeech/pkg/backend/microsoft" + "github.com/moeru-ai/unspeech/pkg/backend/minimax" "github.com/moeru-ai/unspeech/pkg/backend/openai" "github.com/moeru-ai/unspeech/pkg/backend/types" "github.com/moeru-ai/unspeech/pkg/backend/volcengine" @@ -37,6 +38,8 @@ func Speech(c echo.Context) mo.Result[any] { return volcengine.HandleSpeech(c, utils.ResultToOption(options)) case "ali", "aliyun", "alibaba", "bailian", "alibaba-model-studio": return alibaba.HandleSpeech(c, utils.ResultToOption(options)) + case "minimax", "minimax-tts": + return minimax.HandleSpeech(c, utils.ResultToOption(options)) default: return mo.Err[any](apierrors.NewErrBadRequest().WithDetail("unsupported backend")) } @@ -63,6 +66,8 @@ func Voices(c echo.Context) mo.Result[any] { return volcengine.HandleVoices(c, utils.ResultToOption(options)) case "ali", "aliyun", "alibaba", "bailian", "alibaba-model-studio": return alibaba.HandleVoices(c, utils.ResultToOption(options)) + case "minimax", "minimax-tts": + return minimax.HandleVoices(c, utils.ResultToOption(options)) default: return mo.Err[any](apierrors.NewErrBadRequest().WithDetail("unsupported backend")) } From 6994a581b94b3dcc324754ed49d7e9ed4e1823c3 Mon Sep 17 00:00:00 2001 From: jhandsome Date: Wed, 11 Mar 2026 03:48:05 +0800 Subject: [PATCH 5/9] feat(sdk): add MiniMax TypeScript SDK support --- sdk/typescript/README.md | 2 + sdk/typescript/src/backend/index.ts | 4 +- sdk/typescript/src/backend/minimax.ts | 145 ++++++++++++++++++++++++++ 3 files changed, 150 insertions(+), 1 deletion(-) create mode 100644 sdk/typescript/src/backend/minimax.ts diff --git a/sdk/typescript/README.md b/sdk/typescript/README.md index 0892294..e76c998 100644 --- a/sdk/typescript/README.md +++ b/sdk/typescript/README.md @@ -51,6 +51,7 @@ import { createUnAlibabaCloud, createUnElevenLabs, createUnMicrosoft, + createUnMinimax, createUnSpeech, createUnVolcengine, } from 'unspeech' @@ -62,6 +63,7 @@ When using - [Alibaba Cloud Model Studio / 阿里云百炼 / CosyVoice](https://www.alibabacloud.com/en/product/modelstudio) - [Volcano Engine / 火山引擎语音技术](https://www.volcengine.com/product/voice-tech) - [ElevenLabs](https://elevenlabs.io/docs/api-reference/text-to-speech/convert) +- [MiniMax](https://platform.minimaxi.com/docs/guides/speech-t2a-http) providers, [SSML](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup) is supported to control in fine grain level for pitch, volume, rate, etc. diff --git a/sdk/typescript/src/backend/index.ts b/sdk/typescript/src/backend/index.ts index 1ad2965..58b3abe 100644 --- a/sdk/typescript/src/backend/index.ts +++ b/sdk/typescript/src/backend/index.ts @@ -7,6 +7,7 @@ export * from './alibabacloud' export * from './deepgram' export * from './elevenlabs' export * from './microsoft' +export * from './minimax' export * from './volcengine' /** @see {@link https://github.com/moeru-ai/unspeech} */ @@ -26,7 +27,7 @@ export function createUnSpeech(apiKey: string, baseURL = 'http://localhost:5933/ | 'ali' | 'alibaba' | 'alibaba-model-studio' - | 'aliyun' | 'bailian' | 'deepgram' | 'elevenlabs' | 'koemotion' | 'openai' + | 'aliyun' | 'bailian' | 'deepgram' | 'elevenlabs' | 'koemotion' | 'minimax' | 'openai' } > = { voice: (options) => { @@ -60,6 +61,7 @@ export function createUnSpeech(apiKey: string, baseURL = 'http://localhost:5933/ | `deepgram/${string}` | `elevenlabs/${string}` | `koemotion/${string}` + | `minimax/${string}` | `openai/${string}` | `volcano/${string}` | `volcengine/${string}`, diff --git a/sdk/typescript/src/backend/minimax.ts b/sdk/typescript/src/backend/minimax.ts new file mode 100644 index 0000000..7246dbf --- /dev/null +++ b/sdk/typescript/src/backend/minimax.ts @@ -0,0 +1,145 @@ +import type { SpeechProviderWithExtraOptions } from '@xsai-ext/providers/utils' + +import type { UnSpeechOptions, VoiceProviderWithExtraOptions } from '../types' + +import { merge } from '@xsai-ext/providers/utils' +import { objCamelToSnake } from '@xsai/shared' + +/** + * MiniMax TTS API options + * @see https://platform.minimaxi.com/docs/guides/speech-t2a-http + */ +export interface UnMinimaxOptions { + /** + * Speech speed. Range: 0.5-2.0 + * @default 1.0 + */ + speed?: number + /** + * Volume. Range: 0-10 + * @default 1.0 + */ + vol?: number + /** + * Pitch adjustment. Range: -12 to 12 + * @default 0 + */ + pitch?: number + /** + * Emotion setting + * @see https://platform.minimaxi.com/docs/guides/speech-t2a-http#emotion + * @example "happy" | "sad" | "angry" | "fearful" | "disgusted" | "surprised" | "calm" | "fluent" | "whisper" + */ + emotion?: string + /** + * Enable streaming output + * @default false + */ + stream?: boolean + /** + * Sample rate for audio output + * @example 8000 | 16000 | 22050 | 24000 | 32000 | 44100 + */ + sampleRate?: number + /** + * Audio bitrate + * @example 32000 | 64000 | 128000 | 256000 + */ + bitrate?: number + /** + * Audio format + * @example "mp3" | "pcm" | "flac" | "wav" + * @default "mp3" + */ + format?: 'mp3' | 'pcm' | 'flac' | 'wav' + /** + * Audio channel + * @example 1 | 2 + * @default 1 + */ + channel?: number +} + +/** + * MiniMax TTS models + * @see https://platform.minimaxi.com/docs/guides/speech-t2a-http#model + */ +export type MinimaxModel = + | 'speech-2.8-hd' + | 'speech-2.8-turbo' + | 'speech-2.6-hd' + | 'speech-2.6-turbo' + | 'speech-02-hd' + | 'speech-02-turbo' + | 'speech-01-hd' + | 'speech-01-turbo' + +/** + * [MiniMax](https://platform.minimaxi.com/) provider for [UnSpeech](https://github.com/moeru-ai/unspeech) + * + * @param apiKey - MiniMax API Key + * @param baseURL - UnSpeech Instance URL + * @returns SpeechProviderWithExtraOptions & VoiceProviderWithExtraOptions + */ +export function createUnMinimax(apiKey: string, baseURL = 'http://localhost:5933/v1/') { + const toUnSpeechOptions = ({ + speed, + vol, + pitch, + emotion, + stream, + sampleRate, + bitrate, + format, + channel, + }: UnMinimaxOptions): UnSpeechOptions => ({ + extraBody: objCamelToSnake({ + speed, + vol, + pitch, + emotion, + stream, + sampleRate, + bitrate, + format, + channel, + }), + }) + + const speechProvider: SpeechProviderWithExtraOptions< + `minimax/${MinimaxModel}`, + UnMinimaxOptions + > = { + speech: (model, options) => ({ + ...(options ? toUnSpeechOptions(options) : {}), + apiKey, + baseURL, + model: `minimax/${model}`, + }), + } + + const voiceProvider: VoiceProviderWithExtraOptions< + UnMinimaxOptions + > = { + voice: (options) => { + if (baseURL.endsWith('v1/')) { + baseURL = baseURL.slice(0, -3) + } + else if (baseURL.endsWith('v1')) { + baseURL = baseURL.slice(0, -2) + } + + return { + query: 'provider=minimax', + ...(options ? toUnSpeechOptions(options) : {}), + apiKey, + baseURL, + } + }, + } + + return merge( + speechProvider, + voiceProvider, + ) +} From 118fc313c466f29c26da1e680d0c200be822e15c Mon Sep 17 00:00:00 2001 From: jhandsome Date: Wed, 11 Mar 2026 03:50:56 +0800 Subject: [PATCH 6/9] docs: add MiniMax to supported providers list --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index c3c570a..3ee26d3 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ unSpeech lets you use various online TTS with OpenAI-compatible API. - [Alibaba Cloud Model Studio / 阿里云百炼 / CosyVoice](https://www.alibabacloud.com/en/product/modelstudio) - [Volcano Engine / 火山引擎语音技术](https://www.volcengine.com/product/voice-tech) - [ElevenLabs](https://elevenlabs.io/docs/api-reference/text-to-speech/convert) +- [MiniMax](https://platform.minimaxi.com/docs/guides/speech-t2a-http) - [Koemotion (by Rinna)](https://koemotion.rinna.co.jp/) ## Getting Started From 2cdca2e5fe3e3c3d420dae70b947b57cddfc61ae Mon Sep 17 00:00:00 2001 From: jhandsome Date: Wed, 11 Mar 2026 04:01:28 +0800 Subject: [PATCH 7/9] fix: change Speed field type from int to float64 The OpenAI API accepts speed values from 0.25 to 4.0, which requires a float type. Changed the Speed field to float64 to match the API specification and allow decimal values. Co-Authored-By: Claude Opus 4.6 --- pkg/backend/types/types.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/backend/types/types.go b/pkg/backend/types/types.go index db2bcb3..f5e0f20 100644 --- a/pkg/backend/types/types.go +++ b/pkg/backend/types/types.go @@ -27,7 +27,7 @@ type OpenAISpeechRequestOptions struct { // The speed of the generated audio. // Select a value from 0.25 to 4.0. // 1.0 is the default. - Speed int `json:"speed,omitempty"` + Speed float64 `json:"speed,omitempty"` // Extension: allows you to add custom content to body. ExtraBody map[string]any `json:"extra_body,omitempty"` From ce021af4410e44567fad79bed67fd84e959efc75 Mon Sep 17 00:00:00 2001 From: jhandsome Date: Wed, 11 Mar 2026 04:38:44 +0800 Subject: [PATCH 8/9] fix: address code review comments - Translate Chinese comments to English - Extract common functions (buildVoiceSettings, buildAudioSettings, getContentType, handleHTTPError) - Fix TypeScript baseURL side effect using local variable - Replace string() conversion with bytes.NewReader --- pkg/backend/minimax/speech.go | 304 ++++++++++++-------------- pkg/backend/minimax/voices.go | 43 ++-- sdk/typescript/src/backend/minimax.ts | 11 +- 3 files changed, 164 insertions(+), 194 deletions(-) diff --git a/pkg/backend/minimax/speech.go b/pkg/backend/minimax/speech.go index d328549..d3ce5ad 100644 --- a/pkg/backend/minimax/speech.go +++ b/pkg/backend/minimax/speech.go @@ -4,6 +4,7 @@ import ( "bytes" "encoding/hex" "encoding/json" + "io" "log/slog" "net/http" "strings" @@ -12,11 +13,10 @@ import ( "github.com/moeru-ai/unspeech/pkg/apierrors" "github.com/moeru-ai/unspeech/pkg/backend/types" "github.com/moeru-ai/unspeech/pkg/utils" - "github.com/samber/lo" "github.com/samber/mo" ) -// VoiceSetting MiniMax 音色设置 +// VoiceSetting MiniMax voice settings type VoiceSetting struct { VoiceID string `json:"voice_id"` Speed float64 `json:"speed,omitempty"` @@ -27,7 +27,7 @@ type VoiceSetting struct { LatexRead string `json:"latex_read,omitempty"` } -// AudioSetting MiniMax 音频设置 +// AudioSetting MiniMax audio settings type AudioSetting struct { SampleRate int `json:"sample_rate,omitempty"` Bitrate int `json:"bitrate,omitempty"` @@ -35,7 +35,7 @@ type AudioSetting struct { Channel int `json:"channel,omitempty"` } -// TTSRequest MiniMax TTS 请求 +// TTSRequest MiniMax TTS request type TTSRequest struct { Model string `json:"model"` Text string `json:"text"` @@ -45,14 +45,14 @@ type TTSRequest struct { OutputFormat string `json:"output_format,omitempty"` } -// TTSResponseData MiniMax TTS 响应数据 +// TTSResponseData MiniMax TTS response data type TTSResponseData struct { Audio string `json:"audio"` SubtitleFile string `json:"subtitle_file,omitempty"` Status int `json:"status"` } -// TTSResponseExtraInfo MiniMax TTS 响应额外信息 +// TTSResponseExtraInfo MiniMax TTS response extra info type TTSResponseExtraInfo struct { AudioLength int `json:"audio_length"` AudioSampleRate int `json:"audio_sample_rate"` @@ -64,13 +64,13 @@ type TTSResponseExtraInfo struct { UsageCharacters int `json:"usage_characters"` } -// TTSResponseBaseResp MiniMax TTS 响应基础信息 +// TTSResponseBaseResp MiniMax TTS response base info type TTSResponseBaseResp struct { StatusCode int `json:"status_code"` StatusMsg string `json:"status_msg"` } -// TTSResponse MiniMax TTS 响应 +// TTSResponse MiniMax TTS response type TTSResponse struct { Data TTSResponseData `json:"data"` TraceID string `json:"trace_id"` @@ -78,22 +78,22 @@ type TTSResponse struct { BaseResp TTSResponseBaseResp `json:"base_resp"` } -// HandleSpeech 处理 MiniMax TTS 请求 +// HandleSpeech handles MiniMax TTS requests func HandleSpeech(c echo.Context, options mo.Option[types.SpeechRequestOptions]) mo.Result[any] { opts := options.MustGet() - // 获取 token + // Get token token := strings.TrimPrefix(c.Request().Header.Get("Authorization"), "Bearer ") - // 从 ExtraBody 获取 stream 参数 + // Get stream parameter from ExtraBody stream := utils.GetByJSONPath[bool](opts.ExtraBody, "{ .stream }") - // 如果是流式请求,使用流式处理 + // If streaming, use streaming handler if stream { return handleStreamingSpeech(c, token, opts) } - // 构建 MiniMax 请求 + // Build MiniMax request reqBody := TTSRequest{ Model: opts.Model, Text: opts.Input, @@ -101,70 +101,26 @@ func HandleSpeech(c echo.Context, options mo.Option[types.SpeechRequestOptions]) OutputFormat: "hex", } - // 设置 voice_id(从用户传入的 voice 字段) - voiceID := opts.Voice - if voiceID != "" { + // Set voice_id from user input + if opts.Voice != "" { reqBody.VoiceSetting = &VoiceSetting{ - VoiceID: voiceID, + VoiceID: opts.Voice, } } - // 从 ExtraBody 获取其他参数 - if speed := utils.GetByJSONPath[*float64](opts.ExtraBody, "{ .speed }"); speed != nil { - if reqBody.VoiceSetting == nil { - reqBody.VoiceSetting = &VoiceSetting{} - } - reqBody.VoiceSetting.Speed = *speed - } + // Build voice settings from ExtraBody + buildVoiceSettings(opts.ExtraBody, &reqBody) - if vol := utils.GetByJSONPath[*float64](opts.ExtraBody, "{ .vol }"); vol != nil { - if reqBody.VoiceSetting == nil { - reqBody.VoiceSetting = &VoiceSetting{} - } - reqBody.VoiceSetting.Vol = *vol - } + // Build audio settings from ExtraBody + buildAudioSettings(opts.ExtraBody, &reqBody) - if pitch := utils.GetByJSONPath[*float64](opts.ExtraBody, "{ .pitch }"); pitch != nil { - if reqBody.VoiceSetting == nil { - reqBody.VoiceSetting = &VoiceSetting{} - } - reqBody.VoiceSetting.Pitch = *pitch - } - - if emotion := utils.GetByJSONPath[*string](opts.ExtraBody, "{ .emotion }"); emotion != nil { - if reqBody.VoiceSetting == nil { - reqBody.VoiceSetting = &VoiceSetting{} - } - reqBody.VoiceSetting.Emotion = *emotion - } - - // 音频设置 - if sampleRate := utils.GetByJSONPath[*int](opts.ExtraBody, "{ .sample_rate }"); sampleRate != nil { - reqBody.AudioSetting = &AudioSetting{} - reqBody.AudioSetting.SampleRate = *sampleRate - } - - if bitrate := utils.GetByJSONPath[*int](opts.ExtraBody, "{ .bitrate }"); bitrate != nil { - if reqBody.AudioSetting == nil { - reqBody.AudioSetting = &AudioSetting{} - } - reqBody.AudioSetting.Bitrate = *bitrate - } - - if format := utils.GetByJSONPath[*string](opts.ExtraBody, "{ .format }"); format != nil { - if reqBody.AudioSetting == nil { - reqBody.AudioSetting = &AudioSetting{} - } - reqBody.AudioSetting.Format = *format - } - - // 序列化请求体 + // Serialize request body jsonBytes, err := json.Marshal(reqBody) if err != nil { return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithCaller()) } - // 创建请求 + // Create request req, err := http.NewRequestWithContext( c.Request().Context(), http.MethodPost, @@ -175,11 +131,11 @@ func HandleSpeech(c echo.Context, options mo.Option[types.SpeechRequestOptions]) return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithCaller()) } - // 设置请求头 + // Set request headers req.Header.Set("Authorization", "Bearer "+token) req.Header.Set("Content-Type", "application/json") - // 发送请求 + // Send request resp, err := http.DefaultClient.Do(req) if err != nil { return mo.Err[any](apierrors.NewErrBadGateway().WithDetail(err.Error()).WithError(err).WithCaller()) @@ -187,146 +143,173 @@ func HandleSpeech(c echo.Context, options mo.Option[types.SpeechRequestOptions]) defer func() { _ = resp.Body.Close() }() - // 检查 HTTP 状态码 + // Check HTTP status code if resp.StatusCode >= 400 && resp.StatusCode < 600 { - switch { - case strings.HasPrefix(resp.Header.Get("Content-Type"), "application/json"): - return mo.Err[any](apierrors. - NewUpstreamError(resp.StatusCode). - WithDetail(utils.NewJSONResponseError(resp.StatusCode, resp.Body).OrEmpty().Error())) - case strings.HasPrefix(resp.Header.Get("Content-Type"), "text/"): - return mo.Err[any](apierrors. - NewUpstreamError(resp.StatusCode). - WithDetail(utils.NewTextResponseError(resp.StatusCode, resp.Body).OrEmpty().Error())) - default: - slog.Warn("unknown upstream error with unknown Content-Type", - slog.Int("status", resp.StatusCode), - slog.String("content_type", resp.Header.Get("Content-Type")), - ) - } + return handleHTTPError(resp) } - // 解析响应 + // Parse response var ttsResp TTSResponse err = json.NewDecoder(resp.Body).Decode(&ttsResp) if err != nil { return mo.Err[any](apierrors.NewErrBadGateway().WithDetail(err.Error()).WithError(err).WithCaller()) } - // 检查业务状态码 + // Check business status code if ttsResp.BaseResp.StatusCode != 0 { return mo.Err[any](handleMinimaxError(ttsResp.BaseResp.StatusCode, ttsResp.BaseResp.StatusMsg)) } - // 解码 hex 音频 + // Decode hex audio audioBytes, err := hex.DecodeString(ttsResp.Data.Audio) if err != nil { return mo.Err[any](apierrors.NewErrInternal().WithDetail("failed to decode hex audio: "+err.Error()).WithError(err).WithCaller()) } - // 确定 Content-Type - contentType := "audio/mp3" - if reqBody.AudioSetting != nil && reqBody.AudioSetting.Format != "" { - contentType = lo.Ternary(reqBody.AudioSetting.Format == "pcm", "audio/pcm", - lo.Ternary(reqBody.AudioSetting.Format == "wav", "audio/wav", - lo.Ternary(reqBody.AudioSetting.Format == "flac", "audio/flac", "audio/mp3"))) - } + // Determine content type + contentType := getContentType(reqBody.AudioSetting) return mo.Ok[any](c.Blob(http.StatusOK, contentType, audioBytes)) } -// handleMinimaxError 处理 MiniMax 错误码 -func handleMinimaxError(code int, msg string) *apierrors.Error { - var httpStatus int - switch code { - case 1004: // 鉴权失败 - httpStatus = http.StatusUnauthorized - case 1002, 1039: // 限流 - httpStatus = http.StatusTooManyRequests - case 1042, 2013: // 参数错误 - httpStatus = http.StatusBadRequest - case 1001: // 超时 - httpStatus = http.StatusGatewayTimeout - default: - httpStatus = http.StatusBadGateway - } - return apierrors.NewUpstreamError(httpStatus).WithDetailf("minimax error: %d - %s", code, msg) -} - -// handleStreamingSpeech 处理流式 TTS 请求 -func handleStreamingSpeech(c echo.Context, token string, opts types.SpeechRequestOptions) mo.Result[any] { - // 构建 MiniMax 请求 - reqBody := TTSRequest{ - Model: opts.Model, - Text: opts.Input, - Stream: true, - OutputFormat: "hex", - } - - // 设置 voice_id - voiceID := opts.Voice - if voiceID != "" { - reqBody.VoiceSetting = &VoiceSetting{ - VoiceID: voiceID, - } - } - - // 从 ExtraBody 获取其他参数 - if speed := utils.GetByJSONPath[*float64](opts.ExtraBody, "{ .speed }"); speed != nil { +// buildVoiceSettings builds VoiceSetting from ExtraBody +func buildVoiceSettings(extraBody map[string]any, reqBody *TTSRequest) { + if speed := utils.GetByJSONPath[*float64](extraBody, "{ .speed }"); speed != nil { if reqBody.VoiceSetting == nil { reqBody.VoiceSetting = &VoiceSetting{} } reqBody.VoiceSetting.Speed = *speed } - if vol := utils.GetByJSONPath[*float64](opts.ExtraBody, "{ .vol }"); vol != nil { + if vol := utils.GetByJSONPath[*float64](extraBody, "{ .vol }"); vol != nil { if reqBody.VoiceSetting == nil { reqBody.VoiceSetting = &VoiceSetting{} } reqBody.VoiceSetting.Vol = *vol } - if pitch := utils.GetByJSONPath[*float64](opts.ExtraBody, "{ .pitch }"); pitch != nil { + if pitch := utils.GetByJSONPath[*float64](extraBody, "{ .pitch }"); pitch != nil { if reqBody.VoiceSetting == nil { reqBody.VoiceSetting = &VoiceSetting{} } reqBody.VoiceSetting.Pitch = *pitch } - if emotion := utils.GetByJSONPath[*string](opts.ExtraBody, "{ .emotion }"); emotion != nil { + if emotion := utils.GetByJSONPath[*string](extraBody, "{ .emotion }"); emotion != nil { if reqBody.VoiceSetting == nil { reqBody.VoiceSetting = &VoiceSetting{} } reqBody.VoiceSetting.Emotion = *emotion } +} - // 音频设置 - if sampleRate := utils.GetByJSONPath[*int](opts.ExtraBody, "{ .sample_rate }"); sampleRate != nil { +// buildAudioSettings builds AudioSetting from ExtraBody +func buildAudioSettings(extraBody map[string]any, reqBody *TTSRequest) { + if sampleRate := utils.GetByJSONPath[*int](extraBody, "{ .sample_rate }"); sampleRate != nil { reqBody.AudioSetting = &AudioSetting{} reqBody.AudioSetting.SampleRate = *sampleRate } - if bitrate := utils.GetByJSONPath[*int](opts.ExtraBody, "{ .bitrate }"); bitrate != nil { + if bitrate := utils.GetByJSONPath[*int](extraBody, "{ .bitrate }"); bitrate != nil { if reqBody.AudioSetting == nil { reqBody.AudioSetting = &AudioSetting{} } reqBody.AudioSetting.Bitrate = *bitrate } - if format := utils.GetByJSONPath[*string](opts.ExtraBody, "{ .format }"); format != nil { + if format := utils.GetByJSONPath[*string](extraBody, "{ .format }"); format != nil { if reqBody.AudioSetting == nil { reqBody.AudioSetting = &AudioSetting{} } reqBody.AudioSetting.Format = *format } +} - // 序列化请求体 +// getContentType returns MIME type based on audio format +func getContentType(audioSetting *AudioSetting) string { + if audioSetting == nil || audioSetting.Format == "" { + return "audio/mp3" + } + + contentTypes := map[string]string{ + "pcm": "audio/pcm", + "wav": "audio/wav", + "flac": "audio/flac", + "mp3": "audio/mp3", + } + + if ct, ok := contentTypes[audioSetting.Format]; ok { + return ct + } + return "audio/mp3" +} + +// handleHTTPError handles HTTP errors from upstream +func handleHTTPError(resp *http.Response) mo.Result[any] { + switch { + case strings.HasPrefix(resp.Header.Get("Content-Type"), "application/json"): + return mo.Err[any](apierrors. + NewUpstreamError(resp.StatusCode). + WithDetail(utils.NewJSONResponseError(resp.StatusCode, resp.Body).OrEmpty().Error())) + case strings.HasPrefix(resp.Header.Get("Content-Type"), "text/"): + return mo.Err[any](apierrors. + NewUpstreamError(resp.StatusCode). + WithDetail(utils.NewTextResponseError(resp.StatusCode, resp.Body).OrEmpty().Error())) + default: + slog.Warn("unknown upstream error with unknown Content-Type", + slog.Int("status", resp.StatusCode), + slog.String("content_type", resp.Header.Get("Content-Type")), + ) + } + return mo.Err[any](apierrors.NewUpstreamError(resp.StatusCode)) +} + +// handleMinimaxError handles MiniMax error codes +func handleMinimaxError(code int, msg string) *apierrors.Error { + var httpStatus int + switch code { + case 1004: // Auth failed + httpStatus = http.StatusUnauthorized + case 1002, 1039: // Rate limit + httpStatus = http.StatusTooManyRequests + case 1042, 2013: // Invalid parameter + httpStatus = http.StatusBadRequest + case 1001: // Timeout + httpStatus = http.StatusGatewayTimeout + default: + httpStatus = http.StatusBadGateway + } + return apierrors.NewUpstreamError(httpStatus).WithDetailf("minimax error: %d - %s", code, msg) +} + +// handleStreamingSpeech handles streaming TTS requests +func handleStreamingSpeech(c echo.Context, token string, opts types.SpeechRequestOptions) mo.Result[any] { + // Build MiniMax request + reqBody := TTSRequest{ + Model: opts.Model, + Text: opts.Input, + Stream: true, + OutputFormat: "hex", + } + + // Set voice_id + if opts.Voice != "" { + reqBody.VoiceSetting = &VoiceSetting{ + VoiceID: opts.Voice, + } + } + + // Build settings from ExtraBody + buildVoiceSettings(opts.ExtraBody, &reqBody) + buildAudioSettings(opts.ExtraBody, &reqBody) + + // Serialize request body jsonBytes, err := json.Marshal(reqBody) if err != nil { return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithCaller()) } - // 创建请求 + // Create request req, err := http.NewRequestWithContext( c.Request().Context(), http.MethodPost, @@ -337,11 +320,11 @@ func handleStreamingSpeech(c echo.Context, token string, opts types.SpeechReques return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithCaller()) } - // 设置请求头 + // Set request headers req.Header.Set("Authorization", "Bearer "+token) req.Header.Set("Content-Type", "application/json") - // 发送请求 + // Send request resp, err := http.DefaultClient.Do(req) if err != nil { return mo.Err[any](apierrors.NewErrBadGateway().WithDetail(err.Error()).WithError(err).WithCaller()) @@ -349,61 +332,46 @@ func handleStreamingSpeech(c echo.Context, token string, opts types.SpeechReques defer func() { _ = resp.Body.Close() }() - // 检查 HTTP 状态码 + // Check HTTP status code if resp.StatusCode >= 400 && resp.StatusCode < 600 { - switch { - case strings.HasPrefix(resp.Header.Get("Content-Type"), "application/json"): - return mo.Err[any](apierrors. - NewUpstreamError(resp.StatusCode). - WithDetail(utils.NewJSONResponseError(resp.StatusCode, resp.Body).OrEmpty().Error())) - default: - slog.Warn("unknown upstream error with unknown Content-Type", - slog.Int("status", resp.StatusCode), - slog.String("content_type", resp.Header.Get("Content-Type")), - ) - } + return handleHTTPError(resp) } - // 流式处理:持续读取响应直到 status == 2 + // Streaming: read response until status == 2 decoder := json.NewDecoder(resp.Body) audioHex := new(strings.Builder) for { var ttsResp TTSResponse if err := decoder.Decode(&ttsResp); err != nil { - if err.Error() == "EOF" { + if err == io.EOF { break } return mo.Err[any](apierrors.NewErrBadGateway().WithDetail(err.Error()).WithError(err).WithCaller()) } - // 检查业务状态码 + // Check business status code if ttsResp.BaseResp.StatusCode != 0 { return mo.Err[any](handleMinimaxError(ttsResp.BaseResp.StatusCode, ttsResp.BaseResp.StatusMsg)) } - // 追加音频数据 + // Append audio data audioHex.WriteString(ttsResp.Data.Audio) - // status == 2 表示合成结束 + // status == 2 means synthesis complete if ttsResp.Data.Status == 2 { break } } - // 解码 hex 音频 + // Decode hex audio audioBytes, err := hex.DecodeString(audioHex.String()) if err != nil { return mo.Err[any](apierrors.NewErrInternal().WithDetail("failed to decode hex audio: "+err.Error()).WithError(err).WithCaller()) } - // 确定 Content-Type - contentType := "audio/mp3" - if reqBody.AudioSetting != nil && reqBody.AudioSetting.Format != "" { - contentType = lo.Ternary(reqBody.AudioSetting.Format == "pcm", "audio/pcm", - lo.Ternary(reqBody.AudioSetting.Format == "wav", "audio/wav", - lo.Ternary(reqBody.AudioSetting.Format == "flac", "audio/flac", "audio/mp3"))) - } + // Determine content type + contentType := getContentType(reqBody.AudioSetting) return mo.Ok[any](c.Blob(http.StatusOK, contentType, audioBytes)) } diff --git a/pkg/backend/minimax/voices.go b/pkg/backend/minimax/voices.go index 9603c72..6e37afd 100644 --- a/pkg/backend/minimax/voices.go +++ b/pkg/backend/minimax/voices.go @@ -1,6 +1,7 @@ package minimax import ( + "bytes" "encoding/json" "log/slog" "net/http" @@ -13,39 +14,39 @@ import ( "github.com/samber/mo" ) -// GetVoiceReq 获取音色列表请求 +// GetVoiceReq Request for getting voice list type GetVoiceReq struct { VoiceType string `json:"voice_type"` } -// SystemVoice 系统音色 +// SystemVoice System voice type SystemVoice struct { VoiceID string `json:"voice_id"` VoiceName string `json:"voice_name"` Description []string `json:"description"` } -// VoiceCloning 快速复刻音色 +// VoiceCloning Voice cloning type VoiceCloning struct { VoiceID string `json:"voice_id"` Description []string `json:"description"` CreatedTime string `json:"created_time"` } -// VoiceGeneration 文生音色 +// VoiceGeneration Voice generation type VoiceGeneration struct { VoiceID string `json:"voice_id"` Description []string `json:"description"` CreatedTime string `json:"created_time"` } -// BaseResp 基础响应 +// BaseResp Base response type BaseResp struct { StatusCode int `json:"status_code"` StatusMsg string `json:"status_msg"` } -// GetVoiceResp 获取音色列表响应 +// GetVoiceResp Response for getting voice list type GetVoiceResp struct { SystemVoice []SystemVoice `json:"system_voice"` VoiceCloning []VoiceCloning `json:"voice_cloning"` @@ -54,7 +55,7 @@ type GetVoiceResp struct { } var ( - // 支持的音频格式 + // Supported audio formats formats = []types.VoiceFormat{ {Name: "MP3", Extension: ".mp3", MimeType: "audio/mpeg"}, {Name: "PCM", Extension: ".pcm", MimeType: "audio/pcm"}, @@ -63,12 +64,12 @@ var ( } ) -// HandleVoices 处理获取音色列表请求 +// HandleVoices handles getting voice list requests func HandleVoices(c echo.Context, options mo.Option[types.VoicesRequestOptions]) mo.Result[any] { - // 获取 token + // Get token token := strings.TrimPrefix(c.Request().Header.Get("Authorization"), "Bearer ") - // 构建请求 + // Build request reqBody := GetVoiceReq{ VoiceType: "all", } @@ -78,22 +79,22 @@ func HandleVoices(c echo.Context, options mo.Option[types.VoicesRequestOptions]) return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithCaller()) } - // 创建请求 + // Create request req, err := http.NewRequestWithContext( c.Request().Context(), http.MethodPost, "https://api.minimaxi.com/v1/get_voice", - strings.NewReader(string(jsonBytes)), + bytes.NewReader(jsonBytes), ) if err != nil { return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithCaller()) } - // 设置请求头 + // Set request headers req.Header.Set("Authorization", "Bearer "+token) req.Header.Set("Content-Type", "application/json") - // 发送请求 + // Send request resp, err := http.DefaultClient.Do(req) if err != nil { return mo.Err[any](apierrors.NewErrBadGateway().WithDetail(err.Error()).WithError(err).WithCaller()) @@ -101,7 +102,7 @@ func HandleVoices(c echo.Context, options mo.Option[types.VoicesRequestOptions]) defer func() { _ = resp.Body.Close() }() - // 检查 HTTP 状态码 + // Check HTTP status code if resp.StatusCode >= 400 && resp.StatusCode < 600 { switch { case strings.HasPrefix(resp.Header.Get("Content-Type"), "application/json"): @@ -116,22 +117,22 @@ func HandleVoices(c echo.Context, options mo.Option[types.VoicesRequestOptions]) } } - // 解析响应 + // Parse response var voiceResp GetVoiceResp err = json.NewDecoder(resp.Body).Decode(&voiceResp) if err != nil { return mo.Err[any](apierrors.NewErrBadGateway().WithDetail(err.Error()).WithError(err).WithCaller()) } - // 检查业务状态码 + // Check business status code if voiceResp.BaseResp.StatusCode != 0 { return mo.Err[any](handleMinimaxError(voiceResp.BaseResp.StatusCode, voiceResp.BaseResp.StatusMsg)) } - // 转换音色列表 + // Convert voice list voices := make([]types.Voice, 0, len(voiceResp.SystemVoice)+len(voiceResp.VoiceCloning)+len(voiceResp.VoiceGeneration)) - // 添加系统音色 + // Add system voices for _, v := range voiceResp.SystemVoice { voices = append(voices, types.Voice{ ID: v.VoiceID, @@ -148,7 +149,7 @@ func HandleVoices(c echo.Context, options mo.Option[types.VoicesRequestOptions]) }) } - // 添加快速复刻音色 + // Add voice cloning voices for _, v := range voiceResp.VoiceCloning { voices = append(voices, types.Voice{ ID: v.VoiceID, @@ -166,7 +167,7 @@ func HandleVoices(c echo.Context, options mo.Option[types.VoicesRequestOptions]) }) } - // 添加文生音色 + // Add voice generation voices for _, v := range voiceResp.VoiceGeneration { voices = append(voices, types.Voice{ ID: v.VoiceID, diff --git a/sdk/typescript/src/backend/minimax.ts b/sdk/typescript/src/backend/minimax.ts index 7246dbf..482553e 100644 --- a/sdk/typescript/src/backend/minimax.ts +++ b/sdk/typescript/src/backend/minimax.ts @@ -122,18 +122,19 @@ export function createUnMinimax(apiKey: string, baseURL = 'http://localhost:5933 UnMinimaxOptions > = { voice: (options) => { - if (baseURL.endsWith('v1/')) { - baseURL = baseURL.slice(0, -3) + let adjustedBaseURL = baseURL + if (adjustedBaseURL.endsWith('v1/')) { + adjustedBaseURL = adjustedBaseURL.slice(0, -3) } - else if (baseURL.endsWith('v1')) { - baseURL = baseURL.slice(0, -2) + else if (adjustedBaseURL.endsWith('v1')) { + adjustedBaseURL = adjustedBaseURL.slice(0, -2) } return { query: 'provider=minimax', ...(options ? toUnSpeechOptions(options) : {}), apiKey, - baseURL, + baseURL: adjustedBaseURL, } }, } From 21b9a2792921e75dff8f66b38f244ea0f0d30213 Mon Sep 17 00:00:00 2001 From: jhandsome Date: Wed, 11 Mar 2026 20:41:33 +0800 Subject: [PATCH 9/9] feat(sdk): export createUnMinimax from TypeScript SDK Co-Authored-By: Claude Opus 4.6 --- sdk/typescript/src/index.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/sdk/typescript/src/index.ts b/sdk/typescript/src/index.ts index 0a43b7b..3e41b5f 100644 --- a/sdk/typescript/src/index.ts +++ b/sdk/typescript/src/index.ts @@ -4,6 +4,7 @@ export { createUnDeepgram, createUnElevenLabs, createUnMicrosoft, + createUnMinimax, createUnSpeech, createUnVolcengine, } from './backend'