Merge def16119667670b39037cea088ec82feb774153c into 598f1826d8b2bc969aace2c6459824737667218c

2026-03-20 21:26:51 -07:00 · 2026-03-20 21:26:51 -07:00 · c38e2aaea1
commit c38e2aaea1
parent 598f1826d8 def1611966
5 changed files with 210 additions and 1 deletions
--- a/src/config/types.tts.ts
+++ b/src/config/types.tts.ts
@ -93,6 +93,16 @@ export type TtsConfig = {
    proxy?: string;
    timeoutMs?: number;
  };
+  /** MiniMax speech configuration. */
+  minimax?: {
+    apiKey?: SecretInput;
+    baseUrl?: string;
+    model?: string;
+    voiceId?: string;
+    speed?: number;
+    volume?: number;
+    pitch?: number;
+  };
  /** Optional path for local TTS user preferences JSON. */
  prefsPath?: string;
  /** Hard cap for text sent to TTS (chars). */
--- a/src/config/zod-schema.core.ts
+++ b/src/config/zod-schema.core.ts
@ -392,6 +392,18 @@ const TtsMicrosoftConfigSchema = z
  })
  .strict()
  .optional();
+// MiniMax speech settings. The numeric bounds mirror the ranges MiniMax
+// documents for `voice_setting`, so out-of-range values are rejected when the
+// config is parsed instead of failing later at request time:
+//   speed  – 0.5 to 2
+//   volume – greater than 0, up to 10
+//   pitch  – whole number from -12 to 12
+const TtsMiniMaxConfigSchema = z
+  .object({
+    apiKey: SecretInputSchema.optional().register(sensitive),
+    baseUrl: z.string().optional(),
+    model: z.string().optional(),
+    voiceId: z.string().optional(),
+    speed: z.number().min(0.5).max(2).optional(),
+    volume: z.number().gt(0).max(10).optional(),
+    pitch: z.number().int().min(-12).max(12).optional(),
+  })
+  .strict()
+  .optional();
 export const TtsConfigSchema = z
  .object({
    auto: TtsAutoSchema.optional(),
@ -447,6 +459,7 @@ export const TtsConfigSchema = z
      .optional(),
    edge: TtsMicrosoftConfigSchema,
    microsoft: TtsMicrosoftConfigSchema,
+    minimax: TtsMiniMaxConfigSchema,
    prefsPath: z.string().optional(),
    maxTextLength: z.number().int().min(1).optional(),
    timeoutMs: z.number().int().min(1000).max(120000).optional(),
--- a/src/tts/provider-registry.ts
+++ b/src/tts/provider-registry.ts
@ -5,12 +5,14 @@ import type { SpeechProviderPlugin } from "../plugins/types.js";
 import type { SpeechProviderId } from "./provider-types.js";
 import { buildElevenLabsSpeechProvider } from "./providers/elevenlabs.js";
 import { buildMicrosoftSpeechProvider } from "./providers/microsoft.js";
+import { buildMiniMaxSpeechProvider } from "./providers/minimax.js";
 import { buildOpenAISpeechProvider } from "./providers/openai.js";

+/**
+ * Factory functions for the speech providers that ship built in. Each entry
+ * takes no arguments and returns a `SpeechProviderPlugin`.
+ */
 const BUILTIN_SPEECH_PROVIDER_BUILDERS = [
  buildOpenAISpeechProvider,
  buildElevenLabsSpeechProvider,
  buildMicrosoftSpeechProvider,
+  buildMiniMaxSpeechProvider,
 ] as const satisfies readonly (() => SpeechProviderPlugin)[];

 function trimToUndefined(value: string | undefined): string | undefined {
--- a/src/tts/providers/minimax.ts
+++ b/src/tts/providers/minimax.ts
@ -0,0 +1,157 @@
+import type { SpeechProviderPlugin } from "../../plugins/types.js";
+import type { SpeechVoiceOption } from "../provider-types.js";
+
+/** Base URL used when no `baseUrl` is supplied (or it is blank). */
+const DEFAULT_MINIMAX_BASE_URL = "https://api.minimaxi.com";
+
+/** Model sent to MiniMax when the caller does not pick one. */
+const DEFAULT_MINIMAX_MODEL = "speech-01-turbo";
+
+/** Voice sent to MiniMax when the caller does not pick one. */
+const DEFAULT_MINIMAX_VOICE = "female-shaonv";
+
+/** Model identifiers advertised through the provider's `models` field. */
+const MINIMAX_TTS_MODELS = [
+  "speech-01-turbo",
+  "speech-01-hd",
+  "speech-02-hd",
+  "speech-02",
+] as const;
+
+/** Popular MiniMax voice IDs; this is the list returned by `listMiniMaxVoices`. */
+const MINIMAX_VOICE_IDS = [
+  "female-shaonv",
+  "male-baijia",
+  "male-yunyang",
+  "female-tianmei",
+  "male-john",
+  "female-emma",
+] as const;
+
+/**
+ * Produces the base URL to call: the given value with surrounding whitespace
+ * and trailing slashes removed, or the default MiniMax URL when nothing usable
+ * remains.
+ *
+ * @param baseUrl - Optional user-supplied base URL.
+ * @returns A base URL without trailing slashes.
+ */
+function normalizeMiniMaxBaseUrl(baseUrl: string | undefined): string {
+  if (baseUrl == null) {
+    return DEFAULT_MINIMAX_BASE_URL;
+  }
+  const withoutTrailingSlashes = baseUrl.trim().replace(/\/+$/, "");
+  // An empty result (blank input, or input made only of slashes) falls back to the default.
+  return withoutTrailingSlashes === "" ? DEFAULT_MINIMAX_BASE_URL : withoutTrailingSlashes;
+}
+
+/**
+ * Synthesizes speech by POSTing to MiniMax's `/v1/t2a_v2` endpoint.
+ *
+ * The non-streaming endpoint answers with a JSON envelope rather than raw
+ * audio: the audio is hex-encoded in `data.audio`, and failures are reported
+ * through `base_resp.status_code` (possibly alongside an HTTP 2xx status).
+ * This function decodes that envelope. A body that is not JSON is returned
+ * as-is, so a gateway that already yields raw audio bytes keeps working.
+ *
+ * @param params.text - Text to synthesize.
+ * @param params.apiKey - Bearer token sent in the `Authorization` header.
+ * @param params.baseUrl - Optional API base URL; defaults to the MiniMax URL.
+ * @param params.model - Model name; defaults to `DEFAULT_MINIMAX_MODEL`.
+ * @param params.voiceId - Voice ID; defaults to `DEFAULT_MINIMAX_VOICE`.
+ * @param params.speed - Speech rate, rounded to two decimals (default 1.0).
+ * @param params.volume - Volume, rounded to two decimals (default 1.0).
+ * @param params.pitch - Pitch offset (default 0).
+ * @param params.timeoutMs - Abort the request after this many ms (default 30000).
+ * @returns The decoded audio bytes.
+ * @throws Error when the HTTP status is not OK, when the envelope reports a
+ *   non-zero `base_resp.status_code`, or when no valid audio is present.
+ */
+export async function minimaxTTS(params: {
+  text: string;
+  apiKey: string;
+  baseUrl?: string;
+  model?: string;
+  voiceId?: string;
+  speed?: number;
+  volume?: number;
+  pitch?: number;
+  timeoutMs?: number;
+}): Promise<Buffer> {
+  const {
+    text,
+    apiKey,
+    baseUrl,
+    model = DEFAULT_MINIMAX_MODEL,
+    voiceId = DEFAULT_MINIMAX_VOICE,
+    speed = 1.0,
+    volume = 1.0,
+    pitch = 0,
+    timeoutMs = 30_000,
+  } = params;
+
+  const controller = new AbortController();
+  const timeout = setTimeout(() => controller.abort(), timeoutMs);
+
+  try {
+    const response = await fetch(`${normalizeMiniMaxBaseUrl(baseUrl)}/v1/t2a_v2`, {
+      method: "POST",
+      headers: {
+        Authorization: `Bearer ${apiKey}`,
+        "Content-Type": "application/json",
+      },
+      body: JSON.stringify({
+        model,
+        text,
+        voice_setting: {
+          voice_id: voiceId,
+          speed: Math.round(speed * 100) / 100,
+          vol: Math.round(volume * 100) / 100,
+          pitch,
+        },
+      }),
+      signal: controller.signal,
+    });
+
+    if (!response.ok) {
+      const error = await response.text().catch(() => "Unknown error");
+      throw new Error(`MiniMax TTS API error (${response.status}): ${error}`);
+    }
+
+    const body = Buffer.from(await response.arrayBuffer());
+    const contentType = (response.headers.get("content-type") ?? "").toLowerCase();
+    // 0x7b is "{": audio containers never start with it, a JSON envelope does.
+    const looksLikeJson = contentType.includes("json") || body[0] === 0x7b;
+    if (!looksLikeJson) {
+      return body;
+    }
+
+    let payload: unknown;
+    try {
+      payload = JSON.parse(body.toString("utf8"));
+    } catch {
+      throw new Error("MiniMax TTS API returned malformed JSON");
+    }
+    return decodeMiniMaxAudioPayload(payload);
+  } finally {
+    clearTimeout(timeout);
+  }
+}
+
+/** Narrows an unknown JSON value to a plain key/value object. */
+function isMiniMaxJsonObject(value: unknown): value is Record<string, unknown> {
+  return typeof value === "object" && value !== null && !Array.isArray(value);
+}
+
+/** Extracts the hex-encoded audio from a parsed MiniMax response envelope. */
+function decodeMiniMaxAudioPayload(payload: unknown): Buffer {
+  const root = isMiniMaxJsonObject(payload) ? payload : {};
+
+  const baseResp = isMiniMaxJsonObject(root.base_resp) ? root.base_resp : undefined;
+  const statusCode = baseResp?.status_code;
+  if (typeof statusCode === "number" && statusCode !== 0) {
+    const statusMsg =
+      typeof baseResp?.status_msg === "string" ? baseResp.status_msg : "Unknown error";
+    throw new Error(`MiniMax TTS API error (${statusCode}): ${statusMsg}`);
+  }
+
+  const data = isMiniMaxJsonObject(root.data) ? root.data : undefined;
+  const hexAudio = data?.audio;
+  // Buffer.from(..., "hex") silently truncates at the first invalid character,
+  // so the string is validated up front instead of returning clipped audio.
+  if (
+    typeof hexAudio !== "string" ||
+    hexAudio.length === 0 ||
+    hexAudio.length % 2 !== 0 ||
+    !/^[0-9a-fA-F]+$/.test(hexAudio)
+  ) {
+    throw new Error("MiniMax TTS response did not contain audio data");
+  }
+  return Buffer.from(hexAudio, "hex");
+}
+
+/**
+ * Lists the voices offered for MiniMax.
+ *
+ * No remote lookup is made: the result is built from the fixed
+ * `MINIMAX_VOICE_IDS` list (the original author notes MiniMax has no public
+ * voice-listing API, and that custom IDs from the MiniMax dashboard can still
+ * be configured). Display names are derived from the ID by turning hyphens
+ * into spaces and capitalising each word.
+ *
+ * @returns One option per built-in voice ID.
+ */
+export async function listMiniMaxVoices(): Promise<SpeechVoiceOption[]> {
+  const voices: SpeechVoiceOption[] = [];
+  for (const id of MINIMAX_VOICE_IDS) {
+    const spaced = id.replace(/-/g, " ");
+    const name = spaced.replace(/\b\w/g, (firstLetter) => firstLetter.toUpperCase());
+    voices.push({ id, name });
+  }
+  return voices;
+}
+
+/**
+ * Builds the MiniMax speech provider plugin.
+ *
+ * The API key comes from `config.minimax.apiKey`, falling back to the
+ * `MINIMAX_API_KEY` environment variable. Both synthesis entry points issue
+ * the same `minimaxTTS` request and label the result as MP3.
+ *
+ * @returns A provider with id `"minimax"`.
+ */
+export function buildMiniMaxSpeechProvider(): SpeechProviderPlugin {
+  // Only the request fields this provider actually reads.
+  type MiniMaxSynthesisRequest = {
+    text: string;
+    config: {
+      timeoutMs?: number;
+      minimax?: {
+        apiKey?: string;
+        baseUrl?: string;
+        model?: string;
+        voiceId?: string;
+        speed?: number;
+        volume?: number;
+        pitch?: number;
+      };
+    };
+  };
+
+  // Shared by both synthesis handlers: resolve the key, then call MiniMax.
+  const requestAudio = async ({ text, config }: MiniMaxSynthesisRequest): Promise<Buffer> => {
+    const settings = config.minimax;
+    const apiKey = settings?.apiKey || process.env.MINIMAX_API_KEY;
+    if (!apiKey) {
+      throw new Error("MiniMax API key missing");
+    }
+    return minimaxTTS({
+      text,
+      apiKey,
+      baseUrl: settings?.baseUrl,
+      model: settings?.model ?? DEFAULT_MINIMAX_MODEL,
+      voiceId: settings?.voiceId ?? DEFAULT_MINIMAX_VOICE,
+      speed: settings?.speed,
+      volume: settings?.volume,
+      pitch: settings?.pitch,
+      timeoutMs: config.timeoutMs,
+    });
+  };
+
+  return {
+    id: "minimax",
+    label: "MiniMax",
+    models: MINIMAX_TTS_MODELS,
+    listVoices: async (_req) => listMiniMaxVoices(),
+    isConfigured: ({ config }) =>
+      Boolean(config.minimax?.apiKey || process.env.MINIMAX_API_KEY),
+    synthesize: async (req) => {
+      const audioBuffer = await requestAudio(req);
+      return {
+        audioBuffer,
+        outputFormat: "mp3",
+        fileExtension: ".mp3",
+        voiceCompatible: req.target === "voice-note",
+      };
+    },
+    synthesizeTelephony: async (req) => {
+      // Original author's note: MiniMax has no native telephony format, and the
+      // MP3-to-PCM/Opus conversion is expected to happen in the voice-call
+      // extension's audio pipeline.
+      // NOTE(review): that conversion is not visible from this file — confirm.
+      const audioBuffer = await requestAudio(req);
+      return {
+        audioBuffer,
+        outputFormat: "mp3",
+        // NOTE(review): described by the author as MiniMax's default sample
+        // rate; the request sets no sample rate explicitly — TODO confirm.
+        sampleRate: 24000,
+      };
+    },
+  };
+}
--- a/src/tts/tts.ts
+++ b/src/tts/tts.ts
@ -130,6 +130,15 @@ export type ResolvedTtsConfig = {
    proxy?: string;
    timeoutMs?: number;
  };
+  minimax: {
+    apiKey?: string;
+    baseUrl: string;
+    model: string;
+    voiceId: string;
+    speed?: number;
+    volume?: number;
+    pitch?: number;
+  };
  prefsPath?: string;
  maxTextLength: number;
  timeoutMs: number;
@ -337,6 +346,18 @@ export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig {
      proxy: rawMicrosoft.proxy?.trim() || undefined,
      timeoutMs: rawMicrosoft.timeoutMs,
    },
+    minimax: {
+      apiKey: normalizeResolvedSecretInputString({
+        value: raw.minimax?.apiKey,
+        path: "messages.tts.minimax.apiKey",
+      }),
+      baseUrl: (raw.minimax?.baseUrl?.trim() || "https://api.minimaxi.com").replace(/\/+$/, ""),
+      model: raw.minimax?.model || "speech-01-turbo",
+      voiceId: raw.minimax?.voiceId || "female-shaonv",
+      speed: raw.minimax?.speed,
+      volume: raw.minimax?.volume,
+      pitch: raw.minimax?.pitch,
+    },
    prefsPath: raw.prefsPath,
    maxTextLength: raw.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH,
    timeoutMs: raw.timeoutMs ?? DEFAULT_TIMEOUT_MS,
@ -476,6 +497,9 @@ export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): Tt
  if (resolveTtsApiKey(config, "elevenlabs")) {
    return "elevenlabs";
  }
+  if (resolveTtsApiKey(config, "minimax")) {
+    return "minimax";
+  }
  return "microsoft";
 }

@ -544,10 +568,13 @@ export function resolveTtsApiKey(
  if (normalizedProvider === "openai") {
    return config.openai.apiKey || process.env.OPENAI_API_KEY;
  }
+  if (normalizedProvider === "minimax") {
+    return config.minimax.apiKey || process.env.MINIMAX_API_KEY;
+  }
  return undefined;
 }

-export const TTS_PROVIDERS = ["openai", "elevenlabs", "microsoft"] as const;
+/** Identifiers of the TTS providers known to this module. */
+export const TTS_PROVIDERS = ["openai", "elevenlabs", "microsoft", "minimax"] as const;

 export function resolveTtsProviderOrder(primary: TtsProvider, cfg?: OpenClawConfig): TtsProvider[] {
  const normalizedPrimary = normalizeSpeechProviderId(primary) ?? primary;