moeru-ai · chainsaid · Mar 4, 2026 · gemini-code-assist · Mar 4, 2026 · gemini-code-assist
diff --git a/pkg/backend/backend.go b/pkg/backend/backend.go
@@ -7,6 +7,7 @@ import (
 	"github.com/moeru-ai/unspeech/pkg/apierrors"
 	"github.com/moeru-ai/unspeech/pkg/backend/alibaba"
 	"github.com/moeru-ai/unspeech/pkg/backend/deepgram"
+	"github.com/moeru-ai/unspeech/pkg/backend/doubao"
 	"github.com/moeru-ai/unspeech/pkg/backend/elevenlabs"
 	"github.com/moeru-ai/unspeech/pkg/backend/koemotion"
 	"github.com/moeru-ai/unspeech/pkg/backend/microsoft"
@@ -37,6 +38,8 @@ func Speech(c echo.Context) mo.Result[any] {
 		return volcengine.HandleSpeech(c, utils.ResultToOption(options))
 	case "ali", "aliyun", "alibaba", "bailian", "alibaba-model-studio":
 		return alibaba.HandleSpeech(c, utils.ResultToOption(options))
+	case "doubao", "bytedance", "volcengine-doubao", "doubao-tts":
+		return doubao.HandleSpeech(c, utils.ResultToOption(options))
 	default:
 		return mo.Err[any](apierrors.NewErrBadRequest().WithDetail("unsupported backend"))
 	}
@@ -63,6 +66,8 @@ func Voices(c echo.Context) mo.Result[any] {
 		return volcengine.HandleVoices(c, utils.ResultToOption(options))
 	case "ali", "aliyun", "alibaba", "bailian", "alibaba-model-studio":
 		return alibaba.HandleVoices(c, utils.ResultToOption(options))
+	case "doubao", "bytedance", "volcengine-doubao", "doubao-tts":
+		return doubao.HandleVoices(c, utils.ResultToOption(options))
 	default:
 		return mo.Err[any](apierrors.NewErrBadRequest().WithDetail("unsupported backend"))
 	}

diff --git a/pkg/backend/doubao/speech.go b/pkg/backend/doubao/speech.go
@@ -0,0 +1,210 @@
+package doubao
+
+import (
+	"bytes"
+	"encoding/base64"
+	"encoding/json"
+	"log/slog"
+	"net/http"
+	"strings"
+
+	"github.com/google/uuid"
+	"github.com/labstack/echo/v4"
+	"github.com/moeru-ai/unspeech/pkg/apierrors"
+	"github.com/moeru-ai/unspeech/pkg/backend/types"
+	"github.com/moeru-ai/unspeech/pkg/utils"
+	"github.com/samber/lo"
+	"github.com/samber/mo"
+)
+
+// DoubaoSpeechRequest represents the request structure for Doubao TTS API v3; HandleSpeech marshals it to JSON and POSTs it upstream.
+// Reference: https://www.volcengine.com/docs/6561/1329505
+// Note: Although the protocol is v3, the endpoint URL remains /api/v1/tts
+type DoubaoSpeechRequest struct {
+	App *DoubaoSpeechRequestApp `json:"app,omitempty"` // Application credentials and target cluster
+	// User configuration (only the user ID); like every section here it is a pointer tagged omitempty, so nil leaves it out of the payload
+	User *DoubaoSpeechRequestUser `json:"user,omitempty"`
+	// Audio synthesis parameters (voice, encoding and optional tuning values)
+	Audio *DoubaoSpeechRequestAudio `json:"audio,omitempty"`
+	// Request metadata, including the text to synthesize
+	Request *DoubaoSpeechRequestRequest `json:"request,omitempty"`
+}
+
+// DoubaoSpeechRequestApp contains application credentials; each field is left out of the JSON when empty (omitempty).
+type DoubaoSpeechRequestApp struct {
+	AppID   string `json:"appid,omitempty"`   // HandleSpeech reads this from opts.ExtraBody at .app.appid
+	Token   string `json:"token,omitempty"`   // HandleSpeech fills this with the bearer token of the incoming Authorization header
+	Cluster string `json:"cluster,omitempty"` // HandleSpeech falls back to "doubao_tts" when the value read from .app.cluster is empty
+}
+
+// DoubaoSpeechRequestUser contains user information, currently only the user ID.
+type DoubaoSpeechRequestUser struct {
+	UserID string `json:"uid,omitempty"` // Serialized as "uid"; HandleSpeech generates a random UUID when the value read from .user.uid is empty
+}
+
+// DoubaoSpeechRequestAudio contains audio synthesis parameters; pointer fields are left out of the JSON when nil, VoiceType when empty.
+type DoubaoSpeechRequestAudio struct {
+	VoiceType        string   `json:"voice_type,omitempty"`        // set by HandleSpeech from opts.Voice
+	Encoding         *string  `json:"encoding,omitempty"`          // set by HandleSpeech from opts.ResponseFormat, "mp3" when that is empty
+	SpeedRatio       *float64 `json:"speed_ratio,omitempty"`       // HandleSpeech substitutes 1.0 when the value read from .audio.speed_ratio is nil or 0
+	Rate             *int     `json:"rate,omitempty"`              // this and the fields below are read by HandleSpeech from opts.ExtraBody at .audio.<json name>
+	BitRate          *int     `json:"bit_rate,omitempty"`
+	ExplicitLanguage *string  `json:"explicit_language,omitempty"`
+	ContextLanguage  *string  `json:"context_language,omitempty"`
+	LoudnessRatio    *float64 `json:"loudness_ratio,omitempty"`
+	Pitch            *float64 `json:"pitch,omitempty"`
+	Emotion          *string  `json:"emotion,omitempty"`
+	ResourceID       *string  `json:"resource_id,omitempty"`
+}
+
+// DoubaoSpeechRequestRequest contains request metadata together with the text to synthesize.
+type DoubaoSpeechRequestRequest struct {
+	RequestID             string         `json:"reqid,omitempty"`                   // HandleSpeech generates a random UUID when the value read from .request.reqid is empty
+	Text                  string         `json:"text"`                              // no omitempty, so always serialized; set by HandleSpeech from opts.Input
+	TextType              *string        `json:"text_type,omitempty"`               // fields without their own note are read by HandleSpeech from opts.ExtraBody at .request.<json name>
+	SilenceDuration       *float64       `json:"silence_duration,omitempty"`
+	WithTimestamp         *string        `json:"with_timestamp,omitempty"`
+	Operation             *string        `json:"operation,omitempty"`               // HandleSpeech substitutes "query" when the value read from .request.operation is nil or empty
+	ExtraParam            *string        `json:"extra_param,omitempty"`
+	DisableMarkdownFilter *bool          `json:"disable_markdown_filter,omitempty"`
+	EnableLatexTone       *bool          `json:"enable_latex_tn,omitempty"`         // note the JSON name is enable_latex_tn, not enable_latex_tone
+	CacheConfig           map[string]any `json:"cache_config,omitempty"`
+	UseCache              *bool          `json:"use_cache,omitempty"`               // declared for the payload but not populated by HandleSpeech
+}
+
+// HandleSpeech synthesizes speech through the Doubao TTS HTTP API and writes
+// the decoded audio to the client.
+//
+// The incoming options are mapped onto a DoubaoSpeechRequest: Voice becomes
+// audio.voice_type, Input becomes request.text and ResponseFormat becomes
+// audio.encoding ("mp3" when empty). Every other upstream field is read from
+// opts.ExtraBody using the same nesting as the upstream payload (for example
+// .app.appid or .audio.pitch). Missing values default as follows: cluster
+// "doubao_tts", operation "query", speed_ratio 1.0, and freshly generated
+// UUIDs for user.uid and request.reqid.
+//
+// The bearer token of the incoming Authorization header is forwarded both in
+// app.token and in the upstream Authorization header.
+//
+// options must hold a value: MustGet panics on an empty option.
+//
+// The result is an error for marshalling, transport, upstream-status and
+// decoding failures. On success it wraps whatever c.Blob returns after the
+// audio has been written with an "audio/<encoding>" content type.
+func HandleSpeech(c echo.Context, options mo.Option[types.SpeechRequestOptions]) mo.Result[any] {
+	opts := options.MustGet()
+
+	// The client's bearer token doubles as the Doubao access token.
+	token := strings.TrimPrefix(c.Request().Header.Get("Authorization"), "Bearer ")
+
+	// Doubao TTS uses the doubao_tts cluster unless the caller overrides it.
+	cluster := utils.GetByJSONPath[string](opts.ExtraBody, "{ .app.cluster }")
+	if cluster == "" {
+		cluster = "doubao_tts"
+	}
+
+	// Use the caller's user ID when given, otherwise generate one.
+	userID := utils.GetByJSONPath[string](opts.ExtraBody, "{ .user.uid }")
+	if userID == "" {
+		userID = uuid.New().String()
+	}
+
+	// Use the caller's request ID when given, otherwise generate one.
+	requestID := utils.GetByJSONPath[string](opts.ExtraBody, "{ .request.reqid }")
+	if requestID == "" {
+		requestID = uuid.New().String()
+	}
+
+	// Operation type, "query" by default.
+	operation := utils.GetByJSONPath[*string](opts.ExtraBody, "{ .request.operation }")
+	if operation == nil || *operation == "" {
+		operation = lo.ToPtr("query")
+	}
+
+	// Speed ratio, 1.0 by default; an explicit 0 is treated as unset.
+	speedRatio := utils.GetByJSONPath[*float64](opts.ExtraBody, "{ .audio.speed_ratio }")
+	if speedRatio == nil || *speedRatio == 0 {
+		speedRatio = lo.ToPtr(1.0)
+	}
+
+	// Resolve the encoding once: it is sent upstream and also determines the
+	// Content-Type of the response, so the two can never disagree.
+	encoding := opts.ResponseFormat
+	if encoding == "" {
+		encoding = "mp3"
+	}
+
+	// Build the request payload.
+	reqBody := &DoubaoSpeechRequest{
+		App: &DoubaoSpeechRequestApp{
+			AppID:   utils.GetByJSONPath[string](opts.ExtraBody, "{ .app.appid }"),
+			Token:   token,
+			Cluster: cluster,
+		},
+		User: &DoubaoSpeechRequestUser{
+			UserID: userID,
+		},
+		Audio: &DoubaoSpeechRequestAudio{
+			VoiceType:        opts.Voice,
+			Encoding:         lo.ToPtr(encoding),
+			SpeedRatio:       speedRatio,
+			Rate:             utils.GetByJSONPath[*int](opts.ExtraBody, "{ .audio.rate }"),
+			BitRate:          utils.GetByJSONPath[*int](opts.ExtraBody, "{ .audio.bit_rate }"),
+			ExplicitLanguage: utils.GetByJSONPath[*string](opts.ExtraBody, "{ .audio.explicit_language }"),
+			ContextLanguage:  utils.GetByJSONPath[*string](opts.ExtraBody, "{ .audio.context_language }"),
+			LoudnessRatio:    utils.GetByJSONPath[*float64](opts.ExtraBody, "{ .audio.loudness_ratio }"),
+			Pitch:            utils.GetByJSONPath[*float64](opts.ExtraBody, "{ .audio.pitch }"),
+			Emotion:          utils.GetByJSONPath[*string](opts.ExtraBody, "{ .audio.emotion }"),
+			// Passed through untouched; a key may appear only once in a struct literal.
+			ResourceID: utils.GetByJSONPath[*string](opts.ExtraBody, "{ .audio.resource_id }"),
+		},
+		Request: &DoubaoSpeechRequestRequest{
+			RequestID:             requestID,
+			Text:                  opts.Input,
+			TextType:              utils.GetByJSONPath[*string](opts.ExtraBody, "{ .request.text_type }"),
+			SilenceDuration:       utils.GetByJSONPath[*float64](opts.ExtraBody, "{ .request.silence_duration }"),
+			WithTimestamp:         utils.GetByJSONPath[*string](opts.ExtraBody, "{ .request.with_timestamp }"),
+			Operation:             operation,
+			ExtraParam:            utils.GetByJSONPath[*string](opts.ExtraBody, "{ .request.extra_param }"),
+			DisableMarkdownFilter: utils.GetByJSONPath[*bool](opts.ExtraBody, "{ .request.disable_markdown_filter }"),
+			EnableLatexTone:       utils.GetByJSONPath[*bool](opts.ExtraBody, "{ .request.enable_latex_tn }"),
+			CacheConfig:           utils.GetByJSONPath[map[string]any](opts.ExtraBody, "{ .request.cache_config }"),
+		},
+	}
+
+	jsonBytes, err := json.Marshal(reqBody)
+	if err != nil {
+		return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithCaller())
+	}
+
+	// Tie the upstream call to the client's request context so it is cancelled
+	// when the client goes away.
+	req, err := http.NewRequestWithContext(c.Request().Context(), http.MethodPost, "https://openspeech.bytedance.com/api/v1/tts", bytes.NewReader(jsonBytes))
+	if err != nil {
+		return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithCaller())
+	}
+
+	// Doubao expects the non-standard "Bearer;<token>" form (semicolon, no
+	// space), so the header is rebuilt instead of being forwarded verbatim.
+	req.Header.Set("Authorization", "Bearer;"+token)
+	req.Header.Set("Content-Type", "application/json")
+
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithCaller())
+	}
+
+	defer func() { _ = resp.Body.Close() }()
+
+	// Translate upstream 4xx/5xx responses into upstream errors.
+	if resp.StatusCode >= 400 && resp.StatusCode < 600 {
+		switch {
+		case strings.HasPrefix(resp.Header.Get("Content-Type"), "application/json"):
+			return mo.Err[any](apierrors.
+				NewUpstreamError(resp.StatusCode).
+				WithDetail(utils.NewJSONResponseError(resp.StatusCode, resp.Body).OrEmpty().Error()))
+		case strings.HasPrefix(resp.Header.Get("Content-Type"), "text/"):
+			return mo.Err[any](apierrors.
+				NewUpstreamError(resp.StatusCode).
+				WithDetail(utils.NewTextResponseError(resp.StatusCode, resp.Body).OrEmpty().Error()))
+		default:
+			// Unrecognized body type: log it and fall through to the JSON
+			// decoding below, which reports its own error if the body is unusable.
+			slog.Warn("unknown upstream error with unknown Content-Type",
+				slog.Int("status", resp.StatusCode),
+				slog.String("content_type", resp.Header.Get("Content-Type")),
+				slog.String("content_length", resp.Header.Get("Content-Length")),
+			)
+		}
+	}
+
+	var resBody map[string]any
+
+	err = json.NewDecoder(resp.Body).Decode(&resBody)
+	if err != nil {
+		return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithError(err).WithCaller())
+	}
+
+	// The audio is read from the "data" field as a base64 string.
+	audioBase64String := utils.GetByJSONPath[string](resBody, "{ .data }")
+	if audioBase64String == "" {
+		return mo.Err[any](apierrors.NewErrInternal().WithDetail("upstream returned empty audio base64 string").WithCaller())
+	}
+
+	// Decode base64 audio to binary.
+	audioBytes, err := base64.StdEncoding.DecodeString(audioBase64String)
+	if err != nil {
+		return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithError(err).WithCaller())
+	}
+
+	// Label the payload with the encoding that was actually requested instead
+	// of always claiming mp3.
+	return mo.Ok[any](c.Blob(http.StatusOK, "audio/"+encoding, audioBytes))
+}
diff --git a/pkg/backend/doubao/voices.go b/pkg/backend/doubao/voices.go
@@ -0,0 +1,85 @@
+package doubao
+
+import (
+	_ "embed"
+	"encoding/json"
+
+	"github.com/labstack/echo/v4"
+	"github.com/moeru-ai/unspeech/pkg/apierrors"
+	"github.com/moeru-ai/unspeech/pkg/backend/types"
+	"github.com/samber/mo"
+)
+
+var (
+	//go:embed voices.json
+	voicesJSON string
+)
+
+// VoicesResponseItem represents one voice entry of the embedded voices.json file, as decoded by HandleVoices.
+type VoicesResponseItem struct {
+	Name            string   `json:"name"`              // used by HandleVoices as both the voice name and its description
+	PreviewAudioURL string   `json:"preview_audio_url"` // copied to the voice's PreviewAudioURL
+	Model           string   `json:"model"`             // becomes the single CompatibleModels entry
+	Voice           string   `json:"voice"`             // becomes the voice ID
+	Scenarios       []string `json:"scenarios"`         // exposed under the "tailoredScenarios" label
+	Language        string   `json:"language"`          // used as both the language title and the language code
+	Bitrate         string   `json:"bitrate"`           // decoded but not read by HandleVoices
+	Format          string   `json:"format"`            // decoded but not read by HandleVoices
+}
+
+// HandleVoices lists the Doubao TTS voices bundled in the embedded voices.json.
+//
+// The catalogue is decoded on every call and each entry is converted into a
+// types.Voice that advertises MP3, PCM and WAV at six sample rates each.
+// Neither c nor options is consulted. A decoding failure is returned as an
+// internal error.
+func HandleVoices(c echo.Context, options mo.Option[types.VoicesRequestOptions]) mo.Result[any] {
+	var catalogue []VoicesResponseItem
+	if err := json.Unmarshal([]byte(voicesJSON), &catalogue); err != nil {
+		return mo.Err[any](apierrors.NewErrInternal().WithDetail(err.Error()).WithCaller())
+	}
+
+	// Supported containers and sample rates, see
+	// https://www.volcengine.com/docs/6561/1257584
+	codecs := []struct{ label, code string }{
+		{label: "MP3", code: "mp3"},
+		{label: "PCM", code: "pcm"},
+		{label: "WAV", code: "wav"},
+	}
+	// Templates carrying only the numeric fields; the codec-specific strings
+	// are filled in per codec below.
+	rates := []types.VoiceFormat{
+		{SampleRate: 8000, Bitrate: 16},
+		{SampleRate: 16000, Bitrate: 16},
+		{SampleRate: 22050, Bitrate: 16},
+		{SampleRate: 24000, Bitrate: 16},
+		{SampleRate: 44100, Bitrate: 16},
+		{SampleRate: 48000, Bitrate: 16},
+	}
+
+	result := make([]types.Voice, 0, len(catalogue))
+
+	for i := range catalogue {
+		item := &catalogue[i]
+
+		// Every voice gets its own slice, ordered codec by codec and, within a
+		// codec, by ascending sample rate.
+		formats := make([]types.VoiceFormat, 0, len(codecs)*len(rates))
+		for _, codec := range codecs {
+			for _, format := range rates {
+				format.Name = codec.label
+				format.Extension = "." + codec.code
+				format.MimeType = "audio/" + codec.code
+				format.FormatCode = codec.code
+				formats = append(formats, format)
+			}
+		}
+
+		entry := types.Voice{
+			ID:               item.Voice,
+			Name:             item.Name,
+			Description:      item.Name,
+			Labels:           map[string]any{"tailoredScenarios": item.Scenarios},
+			Tags:             make([]string, 0),
+			Formats:          formats,
+			CompatibleModels: []string{item.Model},
+			PreviewAudioURL:  item.PreviewAudioURL,
+			Languages: []types.VoiceLanguage{
+				{Title: item.Language, Code: item.Language},
+			},
+		}
+		result = append(result, entry)
+	}
+
+	response := types.ListVoicesResponse{Voices: result}
+
+	return mo.Ok[any](response)
+}