diff --git a/src/config/types.tts.ts b/src/config/types.tts.ts
index 4703f43ae12..448d1871111 100644
--- a/src/config/types.tts.ts
+++ b/src/config/types.tts.ts
@@ -93,6 +93,16 @@ export type TtsConfig = {
     proxy?: string;
     timeoutMs?: number;
   };
+  /** MiniMax speech configuration. */
+  minimax?: {
+    apiKey?: SecretInput;
+    baseUrl?: string;
+    model?: string;
+    voiceId?: string;
+    speed?: number;
+    volume?: number;
+    pitch?: number;
+  };
   /** Optional path for local TTS user preferences JSON. */
   prefsPath?: string;
   /** Hard cap for text sent to TTS (chars). */
diff --git a/src/config/zod-schema.core.ts b/src/config/zod-schema.core.ts
index 22c589c8490..fe38f2840c1 100644
--- a/src/config/zod-schema.core.ts
+++ b/src/config/zod-schema.core.ts
@@ -392,6 +392,18 @@ const TtsMicrosoftConfigSchema = z
   })
   .strict()
   .optional();
+const TtsMiniMaxConfigSchema = z
+  .object({
+    apiKey: SecretInputSchema.optional().register(sensitive),
+    baseUrl: z.string().optional(),
+    model: z.string().optional(),
+    voiceId: z.string().optional(),
+    speed: z.number().min(0.5).max(2).optional(),
+    volume: z.number().gt(0).max(2).optional(),
+    pitch: z.number().int().min(-12).max(12).optional(),
+  })
+  .strict()
+  .optional();
 export const TtsConfigSchema = z
   .object({
     auto: TtsAutoSchema.optional(),
@@ -447,6 +459,7 @@ export const TtsConfigSchema = z
       .optional(),
     edge: TtsMicrosoftConfigSchema,
     microsoft: TtsMicrosoftConfigSchema,
+    minimax: TtsMiniMaxConfigSchema,
     prefsPath: z.string().optional(),
     maxTextLength: z.number().int().min(1).optional(),
     timeoutMs: z.number().int().min(1000).max(120000).optional(),
diff --git a/src/tts/provider-registry.ts b/src/tts/provider-registry.ts
index d1462880a99..9285b8da136 100644
--- a/src/tts/provider-registry.ts
+++ b/src/tts/provider-registry.ts
@@ -5,12 +5,14 @@ import type { SpeechProviderPlugin } from "../plugins/types.js";
 import type { SpeechProviderId } from "./provider-types.js";
 import { buildElevenLabsSpeechProvider } from "./providers/elevenlabs.js";
 import { buildMicrosoftSpeechProvider } from "./providers/microsoft.js";
+import { buildMiniMaxSpeechProvider } from "./providers/minimax.js";
 import { buildOpenAISpeechProvider } from "./providers/openai.js";
 
 const BUILTIN_SPEECH_PROVIDER_BUILDERS = [
   buildOpenAISpeechProvider,
   buildElevenLabsSpeechProvider,
   buildMicrosoftSpeechProvider,
+  buildMiniMaxSpeechProvider,
 ] as const satisfies readonly (() => SpeechProviderPlugin)[];
 
 function trimToUndefined(value: string | undefined): string | undefined {
diff --git a/src/tts/providers/minimax.ts b/src/tts/providers/minimax.ts
new file mode 100644
index 00000000000..948acca6b93
--- /dev/null
+++ b/src/tts/providers/minimax.ts
@@ -0,0 +1,234 @@
+import type { SpeechProviderPlugin } from "../../plugins/types.js";
+import type { SpeechVoiceOption } from "../provider-types.js";
+
+const MINIMAX_TTS_MODELS = [
+  "speech-01-turbo",
+  "speech-01-hd",
+  "speech-02-hd",
+  "speech-02",
+] as const;
+
+// Popular MiniMax voice IDs
+const MINIMAX_VOICE_IDS = [
+  "female-shaonv",
+  "male-baijia",
+  "male-yunyang",
+  "female-tianmei",
+  "male-john",
+  "female-emma",
+] as const;
+
+const DEFAULT_MINIMAX_BASE_URL = "https://api.minimaxi.com";
+const DEFAULT_MINIMAX_MODEL = "speech-01-turbo";
+const DEFAULT_MINIMAX_VOICE = "female-shaonv";
+// Sample rate requested for telephony so the reported rate matches the audio.
+const MINIMAX_TELEPHONY_SAMPLE_RATE = 24_000;
+
+/** Subset of the non-streaming T2A v2 response envelope that this module reads. */
+type MiniMaxT2AResponse = {
+  data?: { audio?: string } | null;
+  base_resp?: { status_code?: number; status_msg?: string };
+};
+
+/** Slice of the resolved TTS request that the MiniMax provider consumes. */
+type MiniMaxSynthesisRequest = {
+  text: string;
+  config: {
+    minimax?: {
+      apiKey?: string;
+      baseUrl?: string;
+      model?: string;
+      voiceId?: string;
+      speed?: number;
+      volume?: number;
+      pitch?: number;
+    };
+    timeoutMs?: number;
+  };
+};
+
+/** Trim the configured base URL, drop trailing slashes, and fall back to the default host. */
+function normalizeMiniMaxBaseUrl(baseUrl: string | undefined): string {
+  const trimmed = baseUrl?.trim();
+  return trimmed?.replace(/\/+$/, "") || DEFAULT_MINIMAX_BASE_URL;
+}
+
+/**
+ * Decode the JSON envelope returned by a non-streaming T2A v2 call.
+ *
+ * The endpoint reports failures inside `base_resp` (which can accompany an HTTP 200) and
+ * delivers the audio as a hex string in `data.audio`, so both are checked here.
+ *
+ * @throws Error when the body is not JSON, reports a non-zero status, or has no valid audio.
+ */
+function decodeMiniMaxAudio(body: string): Buffer {
+  let payload: MiniMaxT2AResponse | null;
+  try {
+    payload = JSON.parse(body) as MiniMaxT2AResponse | null;
+  } catch {
+    throw new Error("MiniMax TTS API returned a non-JSON response");
+  }
+
+  const statusCode = payload?.base_resp?.status_code;
+  if (typeof statusCode === "number" && statusCode !== 0) {
+    const statusMsg = payload?.base_resp?.status_msg ?? "Unknown error";
+    throw new Error(`MiniMax TTS API error (${statusCode}): ${statusMsg}`);
+  }
+
+  const audioHex = payload?.data?.audio;
+  if (typeof audioHex !== "string" || audioHex.length === 0) {
+    throw new Error("MiniMax TTS API returned no audio data");
+  }
+
+  const audio = Buffer.from(audioHex, "hex");
+  // Buffer.from stops at the first invalid hex pair instead of throwing.
+  if (audio.length * 2 !== audioHex.length) {
+    throw new Error("MiniMax TTS API returned malformed audio data");
+  }
+  return audio;
+}
+
+/**
+ * Synthesize `text` with the MiniMax T2A v2 endpoint and return the decoded MP3 bytes.
+ *
+ * Omitted options fall back to the module defaults (model, voice) or neutral values
+ * (speed 1.0, volume 1.0, pitch 0). `sampleRate` is only sent when provided; otherwise the
+ * service default applies. The request is aborted after `timeoutMs` milliseconds.
+ *
+ * @throws Error on a non-2xx HTTP status, an API-level error, or a missing/invalid payload.
+ */
+export async function minimaxTTS(params: {
+  text: string;
+  apiKey: string;
+  baseUrl?: string;
+  model?: string;
+  voiceId?: string;
+  speed?: number;
+  volume?: number;
+  pitch?: number;
+  sampleRate?: number;
+  timeoutMs?: number;
+}): Promise<Buffer> {
+  const {
+    text,
+    apiKey,
+    baseUrl,
+    model = DEFAULT_MINIMAX_MODEL,
+    voiceId = DEFAULT_MINIMAX_VOICE,
+    speed = 1.0,
+    volume = 1.0,
+    pitch = 0,
+    sampleRate,
+    timeoutMs = 30_000,
+  } = params;
+
+  const controller = new AbortController();
+  const timeout = setTimeout(() => controller.abort(), timeoutMs);
+
+  try {
+    const response = await fetch(`${normalizeMiniMaxBaseUrl(baseUrl)}/v1/t2a_v2`, {
+      method: "POST",
+      headers: {
+        Authorization: `Bearer ${apiKey}`,
+        "Content-Type": "application/json",
+      },
+      body: JSON.stringify({
+        model,
+        text,
+        stream: false,
+        voice_setting: {
+          voice_id: voiceId,
+          speed: Math.round(speed * 100) / 100,
+          vol: Math.round(volume * 100) / 100,
+          // The API only accepts whole-number pitch offsets.
+          pitch: Math.round(pitch),
+        },
+        // JSON.stringify drops sample_rate when it is undefined.
+        audio_setting: { format: "mp3", sample_rate: sampleRate },
+      }),
+      signal: controller.signal,
+    });
+
+    if (!response.ok) {
+      const error = await response.text().catch(() => "Unknown error");
+      throw new Error(`MiniMax TTS API error (${response.status}): ${error}`);
+    }
+
+    return decodeMiniMaxAudio(await response.text());
+  } finally {
+    clearTimeout(timeout);
+  }
+}
+
+/**
+ * List the built-in voice presets.
+ *
+ * MiniMax doesn't have a public list voices API, so this returns common voices;
+ * users can still configure custom voice IDs from their MiniMax dashboard.
+ */
+export async function listMiniMaxVoices(): Promise<SpeechVoiceOption[]> {
+  return MINIMAX_VOICE_IDS.map((voiceId) => ({
+    id: voiceId,
+    // "female-shaonv" -> "Female Shaonv"
+    name: voiceId.replace(/-/g, " ").replace(/\b\w/g, (c) => c.toUpperCase()),
+  }));
+}
+
+/** Resolve credentials and voice settings from the request, then call the MiniMax API. */
+async function synthesizeFromRequest(
+  req: MiniMaxSynthesisRequest,
+  sampleRate?: number,
+): Promise<Buffer> {
+  const settings = req.config.minimax;
+  const apiKey = settings?.apiKey || process.env.MINIMAX_API_KEY;
+  if (!apiKey) {
+    throw new Error("MiniMax API key missing");
+  }
+  return minimaxTTS({
+    text: req.text,
+    apiKey,
+    baseUrl: settings?.baseUrl,
+    model: settings?.model ?? DEFAULT_MINIMAX_MODEL,
+    voiceId: settings?.voiceId ?? DEFAULT_MINIMAX_VOICE,
+    speed: settings?.speed,
+    volume: settings?.volume,
+    pitch: settings?.pitch,
+    sampleRate,
+    timeoutMs: req.config.timeoutMs,
+  });
+}
+
+/** Build the built-in MiniMax speech provider plugin. */
+export function buildMiniMaxSpeechProvider(): SpeechProviderPlugin {
+  return {
+    id: "minimax",
+    label: "MiniMax",
+    models: MINIMAX_TTS_MODELS,
+    listVoices: async (_req) => {
+      return listMiniMaxVoices();
+    },
+    isConfigured: ({ config }) =>
+      Boolean(config.minimax?.apiKey || process.env.MINIMAX_API_KEY),
+    synthesize: async (req) => {
+      const audioBuffer = await synthesizeFromRequest(req);
+      return {
+        audioBuffer,
+        outputFormat: "mp3",
+        fileExtension: ".mp3",
+        voiceCompatible: req.target === "voice-note",
+      };
+    },
+    synthesizeTelephony: async (req) => {
+      // MiniMax doesn't natively support telephony formats
+      // For Discord voice, we'd need to convert MP3 to PCM/Opus
+      // This is handled by the voice-call extension's audio pipeline
+      const audioBuffer = await synthesizeFromRequest(req, MINIMAX_TELEPHONY_SAMPLE_RATE);
+      return {
+        audioBuffer,
+        outputFormat: "mp3",
+        // Matches the sample rate requested from the API above.
+        sampleRate: MINIMAX_TELEPHONY_SAMPLE_RATE,
+      };
+    },
+  };
+}
diff --git a/src/tts/tts.ts b/src/tts/tts.ts
index 17a7c2fc981..71b2131c4ed 100644
--- a/src/tts/tts.ts
+++ b/src/tts/tts.ts
@@ -130,6 +130,15 @@ export type ResolvedTtsConfig = {
     proxy?: string;
     timeoutMs?: number;
   };
+  minimax: {
+    apiKey?: string;
+    baseUrl: string;
+    model: string;
+    voiceId: string;
+    speed?: number;
+    volume?: number;
+    pitch?: number;
+  };
   prefsPath?: string;
   maxTextLength: number;
   timeoutMs: number;
@@ -337,6 +346,18 @@ export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig {
       proxy: rawMicrosoft.proxy?.trim() || undefined,
       timeoutMs: rawMicrosoft.timeoutMs,
     },
+    minimax: {
+      apiKey: normalizeResolvedSecretInputString({
+        value: raw.minimax?.apiKey,
+        path: "messages.tts.minimax.apiKey",
+      }),
+      baseUrl: raw.minimax?.baseUrl?.trim().replace(/\/+$/, "") || "https://api.minimaxi.com",
+      model: raw.minimax?.model || "speech-01-turbo",
+      voiceId: raw.minimax?.voiceId || "female-shaonv",
+      speed: raw.minimax?.speed,
+      volume: raw.minimax?.volume,
+      pitch: raw.minimax?.pitch,
+    },
     prefsPath: raw.prefsPath,
     maxTextLength: raw.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH,
     timeoutMs: raw.timeoutMs ?? DEFAULT_TIMEOUT_MS,
@@ -476,6 +497,9 @@ export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): Tt
   if (resolveTtsApiKey(config, "elevenlabs")) {
     return "elevenlabs";
   }
+  if (resolveTtsApiKey(config, "minimax")) {
+    return "minimax";
+  }
   return "microsoft";
 }
 
@@ -544,10 +568,13 @@ export function resolveTtsApiKey(
   if (normalizedProvider === "openai") {
     return config.openai.apiKey || process.env.OPENAI_API_KEY;
   }
+  if (normalizedProvider === "minimax") {
+    return config.minimax.apiKey || process.env.MINIMAX_API_KEY;
+  }
   return undefined;
 }
 
-export const TTS_PROVIDERS = ["openai", "elevenlabs", "microsoft"] as const;
+export const TTS_PROVIDERS = ["openai", "elevenlabs", "microsoft", "minimax"] as const;
 
 export function resolveTtsProviderOrder(primary: TtsProvider, cfg?: OpenClawConfig): TtsProvider[] {
   const normalizedPrimary = normalizeSpeechProviderId(primary) ?? primary;