Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions pkg/backend/backend.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"github.com/moeru-ai/unspeech/pkg/apierrors"
"github.com/moeru-ai/unspeech/pkg/backend/alibaba"
"github.com/moeru-ai/unspeech/pkg/backend/deepgram"
"github.com/moeru-ai/unspeech/pkg/backend/doubao"
"github.com/moeru-ai/unspeech/pkg/backend/elevenlabs"
"github.com/moeru-ai/unspeech/pkg/backend/koemotion"
"github.com/moeru-ai/unspeech/pkg/backend/microsoft"
Expand Down Expand Up @@ -37,6 +38,8 @@ func Speech(c echo.Context) mo.Result[any] {
return volcengine.HandleSpeech(c, utils.ResultToOption(options))
case "ali", "aliyun", "alibaba", "bailian", "alibaba-model-studio":
return alibaba.HandleSpeech(c, utils.ResultToOption(options))
case "doubao", "bytedance", "volcengine-doubao", "doubao-tts":
return doubao.HandleSpeech(c, utils.ResultToOption(options))
default:
return mo.Err[any](apierrors.NewErrBadRequest().WithDetail("unsupported backend"))
}
Expand All @@ -63,6 +66,8 @@ func Voices(c echo.Context) mo.Result[any] {
return volcengine.HandleVoices(c, utils.ResultToOption(options))
case "ali", "aliyun", "alibaba", "bailian", "alibaba-model-studio":
return alibaba.HandleVoices(c, utils.ResultToOption(options))
case "doubao", "bytedance", "volcengine-doubao", "doubao-tts":
return doubao.HandleVoices(c, utils.ResultToOption(options))
default:
return mo.Err[any](apierrors.NewErrBadRequest().WithDetail("unsupported backend"))
}
Expand Down
210 changes: 210 additions & 0 deletions pkg/backend/doubao/speech.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
package doubao

import (
"bytes"
"encoding/base64"
"encoding/json"
"log/slog"
"net/http"
"strings"

"github.com/google/uuid"
"github.com/labstack/echo/v4"
"github.com/moeru-ai/unspeech/pkg/apierrors"
"github.com/moeru-ai/unspeech/pkg/backend/types"
"github.com/moeru-ai/unspeech/pkg/utils"
"github.com/samber/lo"
"github.com/samber/mo"
)

// DoubaoSpeechRequest is the top-level JSON payload posted to the Doubao TTS
// API (protocol v3).
//
// Reference: https://www.volcengine.com/docs/6561/1329505
//
// Note: although the protocol is v3, the endpoint URL remains /api/v1/tts.
//
// Every section is a pointer tagged omitempty, so a nil section is left out of
// the serialized request entirely.
type DoubaoSpeechRequest struct {
	// App carries the application credentials and cluster selection.
	App *DoubaoSpeechRequestApp `json:"app,omitempty"`
	// User carries the user configuration.
	User *DoubaoSpeechRequestUser `json:"user,omitempty"`
	// Audio carries the audio synthesis parameters.
	Audio *DoubaoSpeechRequestAudio `json:"audio,omitempty"`
	// Request carries the request metadata, including the text to synthesize.
	Request *DoubaoSpeechRequestRequest `json:"request,omitempty"`
}

// DoubaoSpeechRequestApp is the "app" section of a DoubaoSpeechRequest and
// holds the application credentials.
//
// All fields are tagged omitempty, so empty strings are not serialized.
type DoubaoSpeechRequestApp struct {
	// AppID is serialized as "appid".
	AppID string `json:"appid,omitempty"`
	// Token is serialized as "token".
	Token string `json:"token,omitempty"`
	// Cluster is serialized as "cluster".
	Cluster string `json:"cluster,omitempty"`
}

// DoubaoSpeechRequestUser is the "user" section of a DoubaoSpeechRequest and
// holds the user information.
type DoubaoSpeechRequestUser struct {
	// UserID is serialized as "uid" and omitted when empty.
	UserID string `json:"uid,omitempty"`
}

// DoubaoSpeechRequestAudio is the "audio" section of a DoubaoSpeechRequest and
// holds the audio synthesis parameters.
//
// Apart from VoiceType, every field is an optional pointer: a nil pointer is
// omitted from the JSON, which lets callers distinguish "not set" from an
// explicit zero value.
type DoubaoSpeechRequestAudio struct {
	// VoiceType is serialized as "voice_type" and omitted when empty.
	VoiceType string `json:"voice_type,omitempty"`

	// Output format settings.
	Encoding *string `json:"encoding,omitempty"`
	Rate     *int    `json:"rate,omitempty"`
	BitRate  *int    `json:"bit_rate,omitempty"`

	// Prosody settings.
	SpeedRatio    *float64 `json:"speed_ratio,omitempty"`
	LoudnessRatio *float64 `json:"loudness_ratio,omitempty"`
	Pitch         *float64 `json:"pitch,omitempty"`
	Emotion       *string  `json:"emotion,omitempty"`

	// Language hints.
	ExplicitLanguage *string `json:"explicit_language,omitempty"`
	ContextLanguage  *string `json:"context_language,omitempty"`

	// ResourceID is serialized as "resource_id".
	ResourceID *string `json:"resource_id,omitempty"`
}

// DoubaoSpeechRequestRequest is the "request" section of a DoubaoSpeechRequest
// and holds the request metadata together with the text to synthesize.
//
// Text is the only field without omitempty and is therefore always present in
// the serialized JSON, even when empty.
type DoubaoSpeechRequestRequest struct {
	// RequestID is serialized as "reqid" and omitted when empty.
	RequestID string `json:"reqid,omitempty"`
	// Text is always serialized, including the empty string.
	Text string `json:"text"`

	// Optional settings; nil pointers are omitted from the JSON.
	TextType              *string  `json:"text_type,omitempty"`
	SilenceDuration       *float64 `json:"silence_duration,omitempty"`
	WithTimestamp         *string  `json:"with_timestamp,omitempty"`
	Operation             *string  `json:"operation,omitempty"`
	ExtraParam            *string  `json:"extra_param,omitempty"`
	DisableMarkdownFilter *bool    `json:"disable_markdown_filter,omitempty"`
	// EnableLatexTone is serialized under the key "enable_latex_tn"; note that
	// the Go field name and the wire name differ.
	EnableLatexTone *bool `json:"enable_latex_tn,omitempty"`

	// Cache settings; a nil or empty map and a nil pointer are omitted.
	CacheConfig map[string]any `json:"cache_config,omitempty"`
	UseCache    *bool          `json:"use_cache,omitempty"`
}

// HandleSpeech handles the speech synthesis request for Doubao TTS
func HandleSpeech(c echo.Context, options mo.Option[types.SpeechRequestOptions]) mo.Result[any] {
opts := options.MustGet()

// Extract token from Authorization header
token := strings.TrimPrefix(c.Request().Header.Get("Authorization"), "Bearer ")

// Doubao TTS uses doubao_tts cluster by default
cluster := utils.GetByJSONPath[string](opts.ExtraBody, "{ .app.cluster }")
if cluster == "" {
cluster = "doubao_tts"
}

// Generate or get user ID
userID := utils.GetByJSONPath[string](opts.ExtraBody, "{ .user.uid }")
if userID == "" {
userID = uuid.New().String()
}

// Generate or get request ID
requestID := utils.GetByJSONPath[string](opts.ExtraBody, "{ .request.reqid }")
if requestID == "" {
requestID = uuid.New().String()
}

// Get operation type, default to "query"
operation := utils.GetByJSONPath[*string](opts.ExtraBody, "{ .request.operation }")
if operation == nil || *operation == "" {
operation = lo.ToPtr("query")
}

// Get speed ratio, default to 1.0
speedRatio := utils.GetByJSONPath[*float64](opts.ExtraBody, "{ .audio.speed_ratio }")
if speedRatio == nil || *speedRatio == 0 {
speedRatio = lo.ToPtr(1.0)
}

// Build the request payload
reqBody := &DoubaoSpeechRequest{
App: &DoubaoSpeechRequestApp{
AppID: utils.GetByJSONPath[string](opts.ExtraBody, "{ .app.appid }"),
Token: token,
Cluster: cluster,
},
User: &DoubaoSpeechRequestUser{
UserID: userID,
},
Audio: &DoubaoSpeechRequestAudio{
VoiceType: opts.Voice,
Encoding: lo.Ternary(opts.ResponseFormat != "", lo.ToPtr(opts.ResponseFormat), lo.ToPtr("mp3")),
SpeedRatio: speedRatio,
Rate: utils.GetByJSONPath[*int](opts.ExtraBody, "{ .audio.rate }"),
BitRate: utils.GetByJSONPath[*int](opts.ExtraBody, "{ .audio.bit_rate }"),
ExplicitLanguage: utils.GetByJSONPath[*string](opts.ExtraBody, "{ .audio.explicit_language }"),
ContextLanguage: utils.GetByJSONPath[*string](opts.ExtraBody, "{ .audio.context_language }"),
LoudnessRatio: utils.GetByJSONPath[*float64](opts.ExtraBody, "{ .audio.loudness_ratio }"),
Pitch: utils.GetByJSONPath[*float64](opts.ExtraBody, "{ .audio.pitch }"),
Emotion: utils.GetByJSONPath[*string](opts.ExtraBody, "{ .audio.emotion }"),
ResourceID: utils.GetByJSONPath[*string](opts.ExtraBody, "{ .audio.resource_id }"),
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The PR description states that resource_id is required for Seed-TTS 2.0 and should default to seed-tts-2.0 if not specified. Currently, the code only passes ResourceID if it's present in opts.ExtraBody. If it's a mandatory field, it should be explicitly defaulted when missing to prevent potential API errors.

Suggested change
ResourceID: utils.GetByJSONPath[*string](opts.ExtraBody, "{ .audio.resource_id }"),
ResourceID: lo.Ternary(utils.GetByJSONPath[*string](opts.ExtraBody, "{ .audio.resource_id }") != nil, utils.GetByJSONPath[*string](opts.ExtraBody, "{ .audio.resource_id }"), lo.ToPtr("seed-tts-2.0")),

},
Request: &DoubaoSpeechRequestRequest{
RequestID: requestID,
Text: opts.Input,
TextType: utils.GetByJSONPath[*string](opts.ExtraBody, "{ .request.text_type }"),
SilenceDuration: utils.GetByJSONPath[*float64](opts.ExtraBody, "{ .request.silence_duration }"),
WithTimestamp: utils.GetByJSONPath[*string](opts.ExtraBody, "{ .request.with_timestamp }"),
Operation: operation,
ExtraParam: utils.GetByJSONPath[*string](opts.ExtraBody, "{ .request.extra_param }"),
DisableMarkdownFilter: utils.GetByJSONPath[*bool](opts.ExtraBody, "{ .request.disable_markdown_filter }"),
EnableLatexTone: utils.GetByJSONPath[*bool](opts.ExtraBody, "{ .request.enable_latex_tn }"),
CacheConfig: utils.GetByJSONPath[map[string]any](opts.ExtraBody, "{ .request.cache_config }"),
},
}

jsonBytes, err := json.Marshal(reqBody)
if err != nil {
return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithCaller())
}

req, err := http.NewRequestWithContext(c.Request().Context(), http.MethodPost, "https://openspeech.bytedance.com/api/v1/tts", bytes.NewBuffer(jsonBytes))
if err != nil {
return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithCaller())
}

// Set authorization header - Doubao uses Bearer;token format
req.Header.Set("Authorization", "Bearer;"+token)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The Authorization header is constructed using Bearer; + token. Standard Bearer token authentication typically uses a space (Bearer ) as a separator. The strings.TrimPrefix on line 80 also uses Bearer (with a space). This inconsistency might lead to authentication failures with the upstream Doubao API if it expects the standard format.

Suggested change
req.Header.Set("Authorization", "Bearer;"+token)
req.Header.Set("Authorization", "Bearer "+token)

req.Header.Set("Content-Type", "application/json")

resp, err := http.DefaultClient.Do(req)
if err != nil {
return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithCaller())
}

defer func() { _ = resp.Body.Close() }()

// Handle error responses
if resp.StatusCode >= 400 && resp.StatusCode < 600 {
switch {
case strings.HasPrefix(resp.Header.Get("Content-Type"), "application/json"):
return mo.Err[any](apierrors.
NewUpstreamError(resp.StatusCode).
WithDetail(utils.NewJSONResponseError(resp.StatusCode, resp.Body).OrEmpty().Error()))
case strings.HasPrefix(resp.Header.Get("Content-Type"), "text/"):
return mo.Err[any](apierrors.
NewUpstreamError(resp.StatusCode).
WithDetail(utils.NewTextResponseError(resp.StatusCode, resp.Body).OrEmpty().Error()))
default:
slog.Warn("unknown upstream error with unknown Content-Type",
slog.Int("status", resp.StatusCode),
slog.String("content_type", resp.Header.Get("Content-Type")),
slog.String("content_length", resp.Header.Get("Content-Length")),
)
}
}

var resBody map[string]any

err = json.NewDecoder(resp.Body).Decode(&resBody)
if err != nil {
return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithError(err).WithCaller())
}

// Extract base64 encoded audio data from response
audioBase64String := utils.GetByJSONPath[string](resBody, "{ .data }")
if audioBase64String == "" {
return mo.Err[any](apierrors.NewErrInternal().WithDetail("upstream returned empty audio base64 string").WithCaller())
}

// Decode base64 audio to binary
audioBytes, err := base64.StdEncoding.DecodeString(audioBase64String)
if err != nil {
return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithError(err).WithCaller())
}

return mo.Ok[any](c.Blob(http.StatusOK, "audio/mp3", audioBytes))
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The c.Blob function hardcodes the Content-Type header to audio/mp3. However, the Encoding field in the request payload (DoubaoSpeechRequestAudio) is dynamically set based on opts.ResponseFormat, allowing clients to request different audio formats (e.g., WAV, PCM). If a client requests a format other than MP3, the response header will incorrectly state audio/mp3, which can cause issues for the client trying to process the audio.

Suggested change
return mo.Ok[any](c.Blob(http.StatusOK, "audio/mp3", audioBytes))
return mo.Ok[any](c.Blob(http.StatusOK, "audio/"+lo.FromPtr(reqBody.Audio.Encoding), audioBytes))

}
85 changes: 85 additions & 0 deletions pkg/backend/doubao/voices.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
package doubao

import (
_ "embed"
"encoding/json"

"github.com/labstack/echo/v4"
"github.com/moeru-ai/unspeech/pkg/apierrors"
"github.com/moeru-ai/unspeech/pkg/backend/types"
"github.com/samber/mo"
)

// voicesJSON holds the raw contents of the voices.json file that sits next to
// this source file; the go:embed directive below fills it in at build time.
// The directive must stay directly above the variable it applies to.
var (
//go:embed voices.json
voicesJSON string
)

// VoicesResponseItem represents a single voice entry in the embedded
// voices.json file.
//
// NOTE(review): a reviewer pointed out that Bitrate and Format are decoded
// from voices.json but not used when HandleVoices builds its
// types.VoiceFormat list, and suggested removing them. They are kept here so
// that the exported struct stays backward-compatible and entries that carry
// these keys continue to decode into named fields.
type VoicesResponseItem struct {
	// Name is decoded from "name".
	Name string `json:"name"`
	// PreviewAudioURL is decoded from "preview_audio_url".
	PreviewAudioURL string `json:"preview_audio_url"`
	// Model is decoded from "model".
	Model string `json:"model"`
	// Voice is decoded from "voice".
	Voice string `json:"voice"`
	// Scenarios is decoded from "scenarios".
	Scenarios []string `json:"scenarios"`
	// Language is decoded from "language".
	Language string `json:"language"`
	// Bitrate is decoded from "bitrate" (a string in the JSON file).
	Bitrate string `json:"bitrate"`
	// Format is decoded from "format".
	Format string `json:"format"`
}

// HandleVoices handles the voices list request for Doubao TTS.
//
// The list is not fetched from the upstream: it is decoded from the embedded
// voices.json on every call and converted entry by entry into types.Voice.
// Neither c nor options is consulted.
//
// It returns mo.Ok wrapping a types.ListVoicesResponse, or mo.Err with an
// internal error when the embedded JSON cannot be decoded.
func HandleVoices(c echo.Context, options mo.Option[types.VoicesRequestOptions]) mo.Result[any] {
	var catalog []VoicesResponseItem
	if err := json.Unmarshal([]byte(voicesJSON), &catalog); err != nil {
		return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithCaller())
	}

	// One output voice per catalog entry, in catalog order.
	voices := make([]types.Voice, len(catalog))

	for i := range catalog {
		entry := &catalog[i]

		voices[i] = types.Voice{
			ID:   entry.Voice,
			Name: entry.Name,
			// The catalog has no separate description, so the name is reused.
			Description: entry.Name,
			Labels: map[string]any{
				"tailoredScenarios": entry.Scenarios,
			},
			// Deliberately a non-nil empty slice.
			Tags:             []string{},
			Formats:          supportedVoiceFormats(),
			CompatibleModels: []string{entry.Model},
			PreviewAudioURL:  entry.PreviewAudioURL,
			Languages: []types.VoiceLanguage{
				// The catalog language string serves as both title and code.
				{Title: entry.Language, Code: entry.Language},
			},
		}
	}

	return mo.Ok[any](types.ListVoicesResponse{Voices: voices})
}

// supportedVoiceFormats returns the format list advertised for every Doubao
// voice: MP3, PCM and WAV, each at six sample rates. A fresh slice is built on
// every call so that no two voices share a backing array.
//
// https://www.volcengine.com/docs/6561/1257584
func supportedVoiceFormats() []types.VoiceFormat {
	return []types.VoiceFormat{
		// MP3
		{Name: "MP3", Extension: ".mp3", MimeType: "audio/mp3", SampleRate: 8000, Bitrate: 16, FormatCode: "mp3"},
		{Name: "MP3", Extension: ".mp3", MimeType: "audio/mp3", SampleRate: 16000, Bitrate: 16, FormatCode: "mp3"},
		{Name: "MP3", Extension: ".mp3", MimeType: "audio/mp3", SampleRate: 22050, Bitrate: 16, FormatCode: "mp3"},
		{Name: "MP3", Extension: ".mp3", MimeType: "audio/mp3", SampleRate: 24000, Bitrate: 16, FormatCode: "mp3"},
		{Name: "MP3", Extension: ".mp3", MimeType: "audio/mp3", SampleRate: 44100, Bitrate: 16, FormatCode: "mp3"},
		{Name: "MP3", Extension: ".mp3", MimeType: "audio/mp3", SampleRate: 48000, Bitrate: 16, FormatCode: "mp3"},
		// PCM
		{Name: "PCM", Extension: ".pcm", MimeType: "audio/pcm", SampleRate: 8000, Bitrate: 16, FormatCode: "pcm"},
		{Name: "PCM", Extension: ".pcm", MimeType: "audio/pcm", SampleRate: 16000, Bitrate: 16, FormatCode: "pcm"},
		{Name: "PCM", Extension: ".pcm", MimeType: "audio/pcm", SampleRate: 22050, Bitrate: 16, FormatCode: "pcm"},
		{Name: "PCM", Extension: ".pcm", MimeType: "audio/pcm", SampleRate: 24000, Bitrate: 16, FormatCode: "pcm"},
		{Name: "PCM", Extension: ".pcm", MimeType: "audio/pcm", SampleRate: 44100, Bitrate: 16, FormatCode: "pcm"},
		{Name: "PCM", Extension: ".pcm", MimeType: "audio/pcm", SampleRate: 48000, Bitrate: 16, FormatCode: "pcm"},
		// WAV
		{Name: "WAV", Extension: ".wav", MimeType: "audio/wav", SampleRate: 8000, Bitrate: 16, FormatCode: "wav"},
		{Name: "WAV", Extension: ".wav", MimeType: "audio/wav", SampleRate: 16000, Bitrate: 16, FormatCode: "wav"},
		{Name: "WAV", Extension: ".wav", MimeType: "audio/wav", SampleRate: 22050, Bitrate: 16, FormatCode: "wav"},
		{Name: "WAV", Extension: ".wav", MimeType: "audio/wav", SampleRate: 24000, Bitrate: 16, FormatCode: "wav"},
		{Name: "WAV", Extension: ".wav", MimeType: "audio/wav", SampleRate: 44100, Bitrate: 16, FormatCode: "wav"},
		{Name: "WAV", Extension: ".wav", MimeType: "audio/wav", SampleRate: 48000, Bitrate: 16, FormatCode: "wav"},
	}
}
Loading