From 5041507da770d4ca6a454586a9d339a96fab88a6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hanabi=20=28=E8=8A=B1=E7=81=AB=29?=
Date: Wed, 18 Mar 2026 23:25:31 +0800
Subject: [PATCH 1/2] feat(tts): add MiniMax TTS provider support

Add MiniMax as a new TTS provider option for OpenClaw voice synthesis.

Features:
- Support for MiniMax Speech API (speech-01-turbo, speech-01-hd, speech-02, speech-02-hd)
- Configurable voice_id, speed, volume, and pitch parameters
- Support for both audio-file and voice-note synthesis targets
- Telephony synthesis support (for Discord voice channel integration)
- Environment variable support via MINIMAX_API_KEY

Configuration example (messages.tts.minimax):

  minimax: {
    apiKey: "...",            // or set MINIMAX_API_KEY instead
    model: "speech-01-turbo",
    voiceId: "female-shaonv",
    speed: 1.0,
    volume: 1.0,
    pitch: 0
  }

This enables Discord voice channel users to use MiniMax's high-quality
Chinese TTS voices for text-to-speech playback.
---
NOTE(review): minimaxTTS() returns the raw HTTP response body as MP3 bytes.
MiniMax's T2A v2 reference appears to describe a JSON envelope instead, with
hex-encoded audio in `data.audio` and API-level failures reported through
`base_resp` -- confirm against the API docs before merging. The 24000 Hz
"default sample rate" reported by synthesizeTelephony should be verified at
the same time.

 src/config/types.tts.ts       |  10 ++
 src/config/zod-schema.core.ts |  13 +++
 src/tts/provider-registry.ts  |   2 +
 src/tts/providers/minimax.ts  | 170 ++++++++++++++++++++++++++++++++++
 src/tts/tts.ts                |  26 +++++-
 5 files changed, 220 insertions(+), 1 deletion(-)
 create mode 100644 src/tts/providers/minimax.ts

diff --git a/src/config/types.tts.ts b/src/config/types.tts.ts
index 4703f43ae12..448d1871111 100644
--- a/src/config/types.tts.ts
+++ b/src/config/types.tts.ts
@@ -93,6 +93,16 @@ export type TtsConfig = {
     proxy?: string;
     timeoutMs?: number;
   };
+  /** MiniMax speech configuration. */
+  minimax?: {
+    apiKey?: SecretInput;
+    baseUrl?: string;
+    model?: string;
+    voiceId?: string;
+    speed?: number;
+    volume?: number;
+    pitch?: number;
+  };
   /** Optional path for local TTS user preferences JSON. */
   prefsPath?: string;
   /** Hard cap for text sent to TTS (chars). */
diff --git a/src/config/zod-schema.core.ts b/src/config/zod-schema.core.ts
index 22c589c8490..fe38f2840c1 100644
--- a/src/config/zod-schema.core.ts
+++ b/src/config/zod-schema.core.ts
@@ -392,6 +392,18 @@ const TtsMicrosoftConfigSchema = z
   })
   .strict()
   .optional();
+const TtsMiniMaxConfigSchema = z
+  .object({
+    apiKey: SecretInputSchema.optional().register(sensitive),
+    baseUrl: z.string().optional(),
+    model: z.string().optional(),
+    voiceId: z.string().optional(),
+    speed: z.number().min(0.5).max(2).optional(),
+    volume: z.number().min(0).max(2).optional(),
+    pitch: z.number().min(-24).max(24).optional(),
+  })
+  .strict()
+  .optional();
 export const TtsConfigSchema = z
   .object({
     auto: TtsAutoSchema.optional(),
@@ -447,6 +459,7 @@ export const TtsConfigSchema = z
       .optional(),
     edge: TtsMicrosoftConfigSchema,
     microsoft: TtsMicrosoftConfigSchema,
+    minimax: TtsMiniMaxConfigSchema,
     prefsPath: z.string().optional(),
     maxTextLength: z.number().int().min(1).optional(),
     timeoutMs: z.number().int().min(1000).max(120000).optional(),
diff --git a/src/tts/provider-registry.ts b/src/tts/provider-registry.ts
index d1462880a99..9285b8da136 100644
--- a/src/tts/provider-registry.ts
+++ b/src/tts/provider-registry.ts
@@ -5,12 +5,14 @@ import type { SpeechProviderPlugin } from "../plugins/types.js";
 import type { SpeechProviderId } from "./provider-types.js";
 import { buildElevenLabsSpeechProvider } from "./providers/elevenlabs.js";
 import { buildMicrosoftSpeechProvider } from "./providers/microsoft.js";
+import { buildMiniMaxSpeechProvider } from "./providers/minimax.js";
 import { buildOpenAISpeechProvider } from "./providers/openai.js";
 
 const BUILTIN_SPEECH_PROVIDER_BUILDERS = [
   buildOpenAISpeechProvider,
   buildElevenLabsSpeechProvider,
   buildMicrosoftSpeechProvider,
+  buildMiniMaxSpeechProvider,
 ] as const satisfies readonly (() => SpeechProviderPlugin)[];
 
 function trimToUndefined(value: string | undefined): string | undefined {
diff --git a/src/tts/providers/minimax.ts b/src/tts/providers/minimax.ts
new file mode 100644
index 00000000000..658056eb25c
--- /dev/null
+++ b/src/tts/providers/minimax.ts
@@ -0,0 +1,170 @@
+import type { SpeechProviderPlugin } from "../../plugins/types.js";
+import type { SpeechVoiceOption } from "../provider-types.js";
+
+const MINIMAX_TTS_MODELS = [
+  "speech-01-turbo",
+  "speech-01-hd",
+  "speech-02-hd",
+  "speech-02",
+] as const;
+
+// Popular MiniMax voice IDs
+const MINIMAX_VOICE_IDS = [
+  "female-shaonv",
+  "male-baijia",
+  "male-yunyang",
+  "female-tianmei",
+  "male-john",
+  "female-emma",
+] as const;
+
+const DEFAULT_MINIMAX_BASE_URL = "https://api.minimaxi.com";
+const DEFAULT_MINIMAX_MODEL = "speech-01-turbo";
+const DEFAULT_MINIMAX_VOICE = "female-shaonv";
+
+function normalizeMiniMaxBaseUrl(baseUrl: string | undefined): string {
+  const trimmed = baseUrl?.trim();
+  return trimmed?.replace(/\/+$/, "") || DEFAULT_MINIMAX_BASE_URL;
+}
+
+export async function minimaxTTS(params: {
+  text: string;
+  apiKey: string;
+  baseUrl?: string;
+  model?: string;
+  voiceId?: string;
+  speed?: number;
+  volume?: number;
+  pitch?: number;
+  timeoutMs?: number;
+}): Promise<Buffer> {
+  const {
+    text,
+    apiKey,
+    baseUrl,
+    model = DEFAULT_MINIMAX_MODEL,
+    voiceId = DEFAULT_MINIMAX_VOICE,
+    speed = 1.0,
+    volume = 1.0,
+    pitch = 0,
+    timeoutMs = 30_000,
+  } = params;
+
+  const controller = new AbortController();
+  const timeout = setTimeout(() => controller.abort(), timeoutMs);
+
+  try {
+    const response = await fetch(`${normalizeMiniMaxBaseUrl(baseUrl)}/v1/t2a_v2`, {
+      method: "POST",
+      headers: {
+        Authorization: `Bearer ${apiKey}`,
+        "Content-Type": "application/json",
+      },
+      body: JSON.stringify({
+        model,
+        text,
+        voice_setting: {
+          voice_id: voiceId,
+          speed: Math.round(speed * 100) / 100,
+          vol: Math.round(volume * 100) / 100,
+          pitch,
+        },
+      }),
+      signal: controller.signal,
+    });
+
+    if (!response.ok) {
+      const error = await response.text().catch(() => "Unknown error");
+      throw new Error(`MiniMax TTS API error (${response.status}): ${error}`);
+    }
+
+    return Buffer.from(await response.arrayBuffer());
+  } finally {
+    clearTimeout(timeout);
+  }
+}
+
+export async function listMiniMaxVoices(params: {
+  apiKey: string;
+  baseUrl?: string;
+}): Promise<SpeechVoiceOption[]> {
+  // MiniMax doesn't have a public list voices API, so we return common voices
+  // Users can use custom voice IDs from their MiniMax dashboard
+  return MINIMAX_VOICE_IDS.map((voiceId) => ({
+    id: voiceId,
+    name: voiceId.replace(/-/g, " ").replace(/\b\w/g, (c) => c.toUpperCase()),
+  }));
+}
+
+export function buildMiniMaxSpeechProvider(): SpeechProviderPlugin {
+  return {
+    id: "minimax",
+    label: "MiniMax",
+    models: MINIMAX_TTS_MODELS,
+    listVoices: async (req) => {
+      const apiKey =
+        req.apiKey ||
+        req.config?.minimax.apiKey ||
+        process.env.MINIMAX_API_KEY;
+      if (!apiKey) {
+        throw new Error("MiniMax API key missing");
+      }
+      return listMiniMaxVoices({
+        apiKey,
+        baseUrl: req.baseUrl ?? req.config?.minimax.baseUrl,
+      });
+    },
+    isConfigured: ({ config }) =>
+      Boolean(config.minimax?.apiKey || process.env.MINIMAX_API_KEY),
+    synthesize: async (req) => {
+      const apiKey =
+        req.config.minimax?.apiKey || process.env.MINIMAX_API_KEY;
+      if (!apiKey) {
+        throw new Error("MiniMax API key missing");
+      }
+      const audioBuffer = await minimaxTTS({
+        text: req.text,
+        apiKey,
+        baseUrl: req.config.minimax?.baseUrl,
+        model: req.config.minimax?.model ?? DEFAULT_MINIMAX_MODEL,
+        voiceId: req.config.minimax?.voiceId ?? DEFAULT_MINIMAX_VOICE,
+        speed: req.config.minimax?.speed,
+        volume: req.config.minimax?.volume,
+        pitch: req.config.minimax?.pitch,
+        timeoutMs: req.config.timeoutMs,
+      });
+      return {
+        audioBuffer,
+        outputFormat: "mp3",
+        fileExtension: ".mp3",
+        voiceCompatible: req.target === "voice-note",
+      };
+    },
+    synthesizeTelephony: async (req) => {
+      // MiniMax doesn't natively support telephony formats
+      // For Discord voice, we'd need to convert MP3 to PCM/Opus
+      // This is handled by the voice-call extension's audio pipeline
+      const apiKey =
+        req.config.minimax?.apiKey || process.env.MINIMAX_API_KEY;
+      if (!apiKey) {
+        throw new Error("MiniMax API key missing");
+      }
+      const audioBuffer = await minimaxTTS({
+        text: req.text,
+        apiKey,
+        baseUrl: req.config.minimax?.baseUrl,
+        model: req.config.minimax?.model ?? DEFAULT_MINIMAX_MODEL,
+        voiceId: req.config.minimax?.voiceId ?? DEFAULT_MINIMAX_VOICE,
+        speed: req.config.minimax?.speed,
+        volume: req.config.minimax?.volume,
+        pitch: req.config.minimax?.pitch,
+        timeoutMs: req.config.timeoutMs,
+      });
+      return {
+        audioBuffer,
+        outputFormat: "mp3",
+        sampleRate: 24000, // MiniMax default sample rate
+      };
+    },
+  };
+}
diff --git a/src/tts/tts.ts b/src/tts/tts.ts
index 7d48dfb8e07..7a6a8e5d976 100644
--- a/src/tts/tts.ts
+++ b/src/tts/tts.ts
@@ -129,6 +129,15 @@ export type ResolvedTtsConfig = {
     proxy?: string;
     timeoutMs?: number;
   };
+  minimax: {
+    apiKey?: string;
+    baseUrl: string;
+    model: string;
+    voiceId: string;
+    speed?: number;
+    volume?: number;
+    pitch?: number;
+  };
   prefsPath?: string;
   maxTextLength: number;
   timeoutMs: number;
@@ -319,6 +328,18 @@ export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig {
       proxy: rawMicrosoft.proxy?.trim() || undefined,
       timeoutMs: rawMicrosoft.timeoutMs,
     },
+    minimax: {
+      apiKey: normalizeResolvedSecretInputString({
+        value: raw.minimax?.apiKey,
+        path: "messages.tts.minimax.apiKey",
+      }),
+      baseUrl: (raw.minimax?.baseUrl?.trim() || "https://api.minimaxi.com").replace(/\/+$/, ""),
+      model: raw.minimax?.model || "speech-01-turbo",
+      voiceId: raw.minimax?.voiceId || "female-shaonv",
+      speed: raw.minimax?.speed,
+      volume: raw.minimax?.volume,
+      pitch: raw.minimax?.pitch,
+    },
     prefsPath: raw.prefsPath,
     maxTextLength: raw.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH,
     timeoutMs: raw.timeoutMs ?? DEFAULT_TIMEOUT_MS,
@@ -526,10 +547,13 @@ export function resolveTtsApiKey(
   if (normalizedProvider === "openai") {
     return config.openai.apiKey || process.env.OPENAI_API_KEY;
   }
+  if (normalizedProvider === "minimax") {
+    return config.minimax.apiKey || process.env.MINIMAX_API_KEY;
+  }
   return undefined;
 }
 
-export const TTS_PROVIDERS = ["openai", "elevenlabs", "microsoft"] as const;
+export const TTS_PROVIDERS = ["openai", "elevenlabs", "microsoft", "minimax"] as const;
 
 export function resolveTtsProviderOrder(primary: TtsProvider, cfg?: OpenClawConfig): TtsProvider[] {
   const normalizedPrimary = normalizeSpeechProviderId(primary) ?? primary;

From def16119667670b39037cea088ec82feb774153c Mon Sep 17 00:00:00 2001
From: ViccRondo
Date: Thu, 19 Mar 2026 18:48:53 +0800
Subject: [PATCH 2/2] fix(tts): add MiniMax to auto-detection chain and fix listVoices

- Add MiniMax to getTtsProvider() auto-detection, after elevenlabs
- Make listMiniMaxVoices params optional (no API call needed for static voice list)
- Remove unnecessary API key check from listVoices callback

Fixes PR feedback from greptile-apps and Codex reviewers
---
 src/tts/providers/minimax.ts | 19 +++----------------
 src/tts/tts.ts               |  3 +++
 2 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/src/tts/providers/minimax.ts b/src/tts/providers/minimax.ts
index 658056eb25c..948acca6b93 100644
--- a/src/tts/providers/minimax.ts
+++ b/src/tts/providers/minimax.ts
@@ -84,10 +84,7 @@ export async function minimaxTTS(params: {
   }
 }
 
-export async function listMiniMaxVoices(params: {
-  apiKey: string;
-  baseUrl?: string;
-}): Promise<SpeechVoiceOption[]> {
+export async function listMiniMaxVoices(): Promise<SpeechVoiceOption[]> {
   // MiniMax doesn't have a public list voices API, so we return common voices
   // Users can use custom voice IDs from their MiniMax dashboard
   return MINIMAX_VOICE_IDS.map((voiceId) => ({
@@ -101,18 +98,8 @@ export function buildMiniMaxSpeechProvider(): SpeechProviderPlugin {
     id: "minimax",
     label: "MiniMax",
     models: MINIMAX_TTS_MODELS,
-    listVoices: async (req) => {
-      const apiKey =
-        req.apiKey ||
-        req.config?.minimax.apiKey ||
-        process.env.MINIMAX_API_KEY;
-      if (!apiKey) {
-        throw new Error("MiniMax API key missing");
-      }
-      return listMiniMaxVoices({
-        apiKey,
-        baseUrl: req.baseUrl ?? req.config?.minimax.baseUrl,
-      });
+    listVoices: async (_req) => {
+      return listMiniMaxVoices();
     },
     isConfigured: ({ config }) =>
       Boolean(config.minimax?.apiKey || process.env.MINIMAX_API_KEY),
diff --git a/src/tts/tts.ts b/src/tts/tts.ts
index 7a6a8e5d976..5e23d779bec 100644
--- a/src/tts/tts.ts
+++ b/src/tts/tts.ts
@@ -479,6 +479,9 @@ export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): Tt
   if (resolveTtsApiKey(config, "elevenlabs")) {
     return "elevenlabs";
   }
+  if (resolveTtsApiKey(config, "minimax")) {
+    return "minimax";
+  }
   return "microsoft";
 }
 