From 662031a88e5eac1f31eeaf87293241204e6645ef Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 16 Mar 2026 18:49:55 -0700 Subject: [PATCH] feat(plugins): add speech provider registration --- extensions/elevenlabs/index.ts | 14 + extensions/elevenlabs/openclaw.plugin.json | 8 + extensions/elevenlabs/package.json | 12 + extensions/lobster/src/lobster-tool.test.ts | 1 + extensions/microsoft/index.ts | 14 + extensions/microsoft/openclaw.plugin.json | 8 + extensions/microsoft/package.json | 12 + extensions/openai/index.ts | 2 + extensions/test-utils/plugin-api.ts | 1 + extensions/voice-call/index.ts | 2 +- extensions/voice-call/openclaw.plugin.json | 5 +- src/auto-reply/reply/commands-tts.ts | 19 +- src/auto-reply/reply/route-reply.test.ts | 1 + .../channel-setup/plugin-install.test.ts | 1 + src/config/types.tts.ts | 19 +- src/config/zod-schema.core.ts | 34 ++- src/gateway/server-methods/tts.ts | 52 ++-- src/gateway/server-plugins.test.ts | 1 + ...server.agent.gateway-server-agent.mocks.ts | 20 +- src/gateway/test-helpers.mocks.ts | 1 + src/plugin-sdk/core.ts | 1 + src/plugin-sdk/index.ts | 1 + src/plugins/loader.ts | 1 + src/plugins/registry.ts | 48 +++ src/plugins/types.ts | 26 ++ src/test-utils/channel-plugins.ts | 1 + src/test-utils/plugin-registration.ts | 7 + src/tts/provider-registry.ts | 84 ++++++ src/tts/provider-types.ts | 38 +++ src/tts/providers/elevenlabs.ts | 73 +++++ src/tts/providers/microsoft.ts | 60 ++++ src/tts/providers/openai.ts | 56 ++++ src/tts/tts-core.ts | 11 +- src/tts/tts.test.ts | 25 +- src/tts/tts.ts | 285 +++++------------- 35 files changed, 658 insertions(+), 286 deletions(-) create mode 100644 extensions/elevenlabs/index.ts create mode 100644 extensions/elevenlabs/openclaw.plugin.json create mode 100644 extensions/elevenlabs/package.json create mode 100644 extensions/microsoft/index.ts create mode 100644 extensions/microsoft/openclaw.plugin.json create mode 100644 extensions/microsoft/package.json create mode 100644 src/tts/provider-registry.ts create mode 100644 src/tts/provider-types.ts create mode 100644 src/tts/providers/elevenlabs.ts create mode 100644 src/tts/providers/microsoft.ts create mode 100644 src/tts/providers/openai.ts diff --git a/extensions/elevenlabs/index.ts b/extensions/elevenlabs/index.ts new file mode 100644 index 00000000000..49d792df20f --- /dev/null +++ b/extensions/elevenlabs/index.ts @@ -0,0 +1,14 @@ +import { emptyPluginConfigSchema, type OpenClawPluginApi } from "openclaw/plugin-sdk/core"; +import { buildElevenLabsSpeechProvider } from "../../src/tts/providers/elevenlabs.js"; + +const elevenLabsPlugin = { + id: "elevenlabs", + name: "ElevenLabs Speech", + description: "Bundled ElevenLabs speech provider", + configSchema: emptyPluginConfigSchema(), + register(api: OpenClawPluginApi) { + api.registerSpeechProvider(buildElevenLabsSpeechProvider()); + }, +}; + +export default elevenLabsPlugin; diff --git a/extensions/elevenlabs/openclaw.plugin.json b/extensions/elevenlabs/openclaw.plugin.json new file mode 100644 index 00000000000..3015fa282a2 --- /dev/null +++ b/extensions/elevenlabs/openclaw.plugin.json @@ -0,0 +1,8 @@ +{ + "id": "elevenlabs", + "configSchema": { + "type": "object", + "additionalProperties": false, + "properties": {} + } +} diff --git a/extensions/elevenlabs/package.json b/extensions/elevenlabs/package.json new file mode 100644 index 00000000000..d4b5d32f16c --- /dev/null +++ b/extensions/elevenlabs/package.json @@ -0,0 +1,12 @@ +{ + "name": "@openclaw/elevenlabs-speech", + "version": "2026.3.14", + "private": true, + "description": "OpenClaw ElevenLabs speech plugin", + "type": "module", + "openclaw": { + "extensions": [ + "./index.ts" + ] + } +} diff --git a/extensions/lobster/src/lobster-tool.test.ts b/extensions/lobster/src/lobster-tool.test.ts index 21d090846b0..0ed5c0eda97 100644 --- a/extensions/lobster/src/lobster-tool.test.ts +++ b/extensions/lobster/src/lobster-tool.test.ts @@ -44,6 +44,7 @@ function fakeApi(overrides: Partial = {}): OpenClawPluginApi registerCli() {}, registerService() {}, registerProvider() {}, + registerSpeechProvider() {}, registerWebSearchProvider() {}, registerInteractiveHandler() {}, registerHook() {}, diff --git a/extensions/microsoft/index.ts b/extensions/microsoft/index.ts new file mode 100644 index 00000000000..358ea2057a0 --- /dev/null +++ b/extensions/microsoft/index.ts @@ -0,0 +1,14 @@ +import { emptyPluginConfigSchema, type OpenClawPluginApi } from "openclaw/plugin-sdk/core"; +import { buildMicrosoftSpeechProvider } from "../../src/tts/providers/microsoft.js"; + +const microsoftPlugin = { + id: "microsoft", + name: "Microsoft Speech", + description: "Bundled Microsoft speech provider", + configSchema: emptyPluginConfigSchema(), + register(api: OpenClawPluginApi) { + api.registerSpeechProvider(buildMicrosoftSpeechProvider()); + }, +}; + +export default microsoftPlugin; diff --git a/extensions/microsoft/openclaw.plugin.json b/extensions/microsoft/openclaw.plugin.json new file mode 100644 index 00000000000..85a130c463a --- /dev/null +++ b/extensions/microsoft/openclaw.plugin.json @@ -0,0 +1,8 @@ +{ + "id": "microsoft", + "configSchema": { + "type": "object", + "additionalProperties": false, + "properties": {} + } +} diff --git a/extensions/microsoft/package.json b/extensions/microsoft/package.json new file mode 100644 index 00000000000..400095cc1f0 --- /dev/null +++ b/extensions/microsoft/package.json @@ -0,0 +1,12 @@ +{ + "name": "@openclaw/microsoft-speech", + "version": "2026.3.14", + "private": true, + "description": "OpenClaw Microsoft speech plugin", + "type": "module", + "openclaw": { + "extensions": [ + "./index.ts" + ] + } +} diff --git a/extensions/openai/index.ts b/extensions/openai/index.ts index 3a01aad8db9..cd528f72211 100644 --- a/extensions/openai/index.ts +++ b/extensions/openai/index.ts @@ -1,4 +1,5 @@ import { emptyPluginConfigSchema, type OpenClawPluginApi } from "openclaw/plugin-sdk/core"; +import { buildOpenAISpeechProvider } from "../../src/tts/providers/openai.js"; import { buildOpenAICodexProviderPlugin } from "./openai-codex-provider.js"; import { buildOpenAIProvider } from "./openai-provider.js"; @@ -10,6 +11,7 @@ const openAIPlugin = { register(api: OpenClawPluginApi) { api.registerProvider(buildOpenAIProvider()); api.registerProvider(buildOpenAICodexProviderPlugin()); + api.registerSpeechProvider(buildOpenAISpeechProvider()); }, }; diff --git a/extensions/test-utils/plugin-api.ts b/extensions/test-utils/plugin-api.ts index 5c621700602..281e151aeb7 100644 --- a/extensions/test-utils/plugin-api.ts +++ b/extensions/test-utils/plugin-api.ts @@ -15,6 +15,7 @@ export function createTestPluginApi(api: TestPluginApiInput): OpenClawPluginApi registerCli() {}, registerService() {}, registerProvider() {}, + registerSpeechProvider() {}, registerWebSearchProvider() {}, registerInteractiveHandler() {}, registerCommand() {}, diff --git a/extensions/voice-call/index.ts b/extensions/voice-call/index.ts index 7d14270bcf8..f20e2da6674 100644 --- a/extensions/voice-call/index.ts +++ b/extensions/voice-call/index.ts @@ -80,7 +80,7 @@ const voiceCallConfigSchema = { "streaming.streamPath": { label: "Media Stream Path", advanced: true }, "tts.provider": { label: "TTS Provider Override", - help: "Deep-merges with messages.tts (Edge is ignored for calls).", + help: "Deep-merges with messages.tts (Microsoft is ignored for calls).", advanced: true, }, "tts.openai.model": { label: "OpenAI TTS Model", advanced: true }, diff --git a/extensions/voice-call/openclaw.plugin.json b/extensions/voice-call/openclaw.plugin.json index fef3ccc6ad9..ff85a30a947 100644 --- a/extensions/voice-call/openclaw.plugin.json +++ b/extensions/voice-call/openclaw.plugin.json @@ -101,7 +101,7 @@ }, "tts.provider": { "label": "TTS Provider Override", - "help": "Deep-merges with messages.tts (Edge is ignored for calls).", + "help": "Deep-merges with messages.tts (Microsoft is ignored for calls).", "advanced": true }, "tts.openai.model": { @@ -420,8 +420,7 @@ "enum": ["final", "all"] }, "provider": { - "type": "string", - "enum": ["openai", "elevenlabs", "edge"] + "type": "string" }, "summaryModel": { "type": "string" diff --git a/src/auto-reply/reply/commands-tts.ts b/src/auto-reply/reply/commands-tts.ts index a6711d2c643..e635b038831 100644 --- a/src/auto-reply/reply/commands-tts.ts +++ b/src/auto-reply/reply/commands-tts.ts @@ -1,4 +1,5 @@ import { logVerbose } from "../../globals.js"; +import { listSpeechProviders, normalizeSpeechProviderId } from "../../tts/provider-registry.js"; import { getLastTtsAttempt, getTtsMaxLength, @@ -54,7 +55,7 @@ function ttsUsage(): ReplyPayload { `• /tts summary [on|off] — View/change auto-summary\n` + `• /tts audio — Generate audio from text\n\n` + `**Providers:**\n` + - `• edge — Free, fast (default)\n` + + `• microsoft — Microsoft Edge-backed speech (default fallback)\n` + `• openai — High quality (requires API key)\n` + `• elevenlabs — Premium voices (requires API key)\n\n` + `**Text Limit (default: 1500, max: 4096):**\n` + @@ -62,7 +63,7 @@ function ttsUsage(): ReplyPayload { `• Summary ON: AI summarizes, then generates audio\n` + `• Summary OFF: Truncates text, then generates audio\n\n` + `**Examples:**\n` + - `/tts provider edge\n` + + `/tts provider microsoft\n` + `/tts limit 2000\n` + `/tts audio Hello, this is a test!`, }; @@ -161,7 +162,7 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand if (!args.trim()) { const hasOpenAI = Boolean(resolveTtsApiKey(config, "openai")); const hasElevenLabs = Boolean(resolveTtsApiKey(config, "elevenlabs")); - const hasEdge = isTtsProviderConfigured(config, "edge"); + const hasMicrosoft = isTtsProviderConfigured(config, "microsoft", params.cfg); return { shouldContinue: false, reply: { @@ -170,21 +171,23 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand `Primary: ${currentProvider}\n` + `OpenAI key: ${hasOpenAI ? "✅" : "❌"}\n` + `ElevenLabs key: ${hasElevenLabs ? "✅" : "❌"}\n` + - `Edge enabled: ${hasEdge ? "✅" : "❌"}\n` + - `Usage: /tts provider openai | elevenlabs | edge`, + `Microsoft enabled: ${hasMicrosoft ? "✅" : "❌"}\n` + + `Usage: /tts provider openai | elevenlabs | microsoft`, }, }; } const requested = args.trim().toLowerCase(); - if (requested !== "openai" && requested !== "elevenlabs" && requested !== "edge") { + const knownProviders = new Set(listSpeechProviders(params.cfg).map((provider) => provider.id)); + if (requested !== "edge" && !knownProviders.has(requested)) { return { shouldContinue: false, reply: ttsUsage() }; } + const nextProvider = normalizeSpeechProviderId(requested) ?? requested; setTtsProvider(prefsPath, requested); return { shouldContinue: false, - reply: { text: `✅ TTS provider set to ${requested}.` }, + reply: { text: `✅ TTS provider set to ${nextProvider}.` }, }; } @@ -249,7 +252,7 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand if (action === "status") { const enabled = isTtsEnabled(config, prefsPath); const provider = getTtsProvider(config, prefsPath); - const hasKey = isTtsProviderConfigured(config, provider); + const hasKey = isTtsProviderConfigured(config, provider, params.cfg); const maxLength = getTtsMaxLength(prefsPath); const summarize = isSummarizationEnabled(prefsPath); const last = getLastTtsAttempt(); diff --git a/src/auto-reply/reply/route-reply.test.ts b/src/auto-reply/reply/route-reply.test.ts index b7b6cd31e9f..5bf5f5c2cec 100644 --- a/src/auto-reply/reply/route-reply.test.ts +++ b/src/auto-reply/reply/route-reply.test.ts @@ -91,6 +91,7 @@ const createRegistry = (channels: PluginRegistry["channels"]): PluginRegistry => enabled: true, })), providers: [], + speechProviders: [], webSearchProviders: [], gatewayHandlers: {}, httpRoutes: [], diff --git a/src/commands/channel-setup/plugin-install.test.ts b/src/commands/channel-setup/plugin-install.test.ts index 056b2709891..5ad6399fa4a 100644 --- a/src/commands/channel-setup/plugin-install.test.ts +++ b/src/commands/channel-setup/plugin-install.test.ts @@ -337,6 +337,7 @@ describe("ensureChannelSetupPluginInstalled", () => { hookNames: [], channelIds: [], providerIds: [], + speechProviderIds: [], webSearchProviderIds: [], gatewayMethods: [], cliCommands: [], diff --git a/src/config/types.tts.ts b/src/config/types.tts.ts index a6232f9de5a..4703f43ae12 100644 --- a/src/config/types.tts.ts +++ b/src/config/types.tts.ts @@ -1,6 +1,6 @@ import type { SecretInput } from "./types.secrets.js"; -export type TtsProvider = "elevenlabs" | "openai" | "edge"; +export type TtsProvider = string; export type TtsMode = "final" | "all"; @@ -66,9 +66,22 @@ export type TtsConfig = { /** System-level instructions for the TTS model (gpt-4o-mini-tts only). */ instructions?: string; }; - /** Microsoft Edge (node-edge-tts) configuration. */ + /** Legacy alias for Microsoft speech configuration. */ edge?: { - /** Explicitly allow Edge TTS usage (no API key required). */ + /** Explicitly allow Microsoft speech usage (no API key required). */ + enabled?: boolean; + voice?: string; + lang?: string; + outputFormat?: string; + pitch?: string; + rate?: string; + volume?: string; + saveSubtitles?: boolean; + proxy?: string; + timeoutMs?: number; + }; + /** Preferred alias for Microsoft speech configuration. */ + microsoft?: { enabled?: boolean; voice?: string; lang?: string; diff --git a/src/config/zod-schema.core.ts b/src/config/zod-schema.core.ts index 305efab4b26..199637bba52 100644 --- a/src/config/zod-schema.core.ts +++ b/src/config/zod-schema.core.ts @@ -353,9 +353,24 @@ export const MarkdownConfigSchema = z .strict() .optional(); -export const TtsProviderSchema = z.enum(["elevenlabs", "openai", "edge"]); +export const TtsProviderSchema = z.string().min(1); export const TtsModeSchema = z.enum(["final", "all"]); export const TtsAutoSchema = z.enum(["off", "always", "inbound", "tagged"]); +const TtsMicrosoftConfigSchema = z + .object({ + enabled: z.boolean().optional(), + voice: z.string().optional(), + lang: z.string().optional(), + outputFormat: z.string().optional(), + pitch: z.string().optional(), + rate: z.string().optional(), + volume: z.string().optional(), + saveSubtitles: z.boolean().optional(), + proxy: z.string().optional(), + timeoutMs: z.number().int().min(1000).max(120000).optional(), + }) + .strict() + .optional(); export const TtsConfigSchema = z .object({ auto: TtsAutoSchema.optional(), @@ -409,21 +424,8 @@ export const TtsConfigSchema = z }) .strict() .optional(), - edge: z - .object({ - enabled: z.boolean().optional(), - voice: z.string().optional(), - lang: z.string().optional(), - outputFormat: z.string().optional(), - pitch: z.string().optional(), - rate: z.string().optional(), - volume: z.string().optional(), - saveSubtitles: z.boolean().optional(), - proxy: z.string().optional(), - timeoutMs: z.number().int().min(1000).max(120000).optional(), - }) - .strict() - .optional(), + edge: TtsMicrosoftConfigSchema, + microsoft: TtsMicrosoftConfigSchema, prefsPath: z.string().optional(), maxTextLength: z.number().int().min(1).optional(), timeoutMs: z.number().int().min(1000).max(120000).optional(), diff --git a/src/gateway/server-methods/tts.ts b/src/gateway/server-methods/tts.ts index 5e4e8254eba..0f7729bf3b5 100644 --- a/src/gateway/server-methods/tts.ts +++ b/src/gateway/server-methods/tts.ts @@ -1,4 +1,5 @@ import { loadConfig } from "../../config/config.js"; +import { listSpeechProviders, normalizeSpeechProviderId } from "../../tts/provider-registry.js"; import { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, @@ -26,9 +27,9 @@ export const ttsHandlers: GatewayRequestHandlers = { const prefsPath = resolveTtsPrefsPath(config); const provider = getTtsProvider(config, prefsPath); const autoMode = resolveTtsAutoMode({ config, prefsPath }); - const fallbackProviders = resolveTtsProviderOrder(provider) + const fallbackProviders = resolveTtsProviderOrder(provider, cfg) .slice(1) - .filter((candidate) => isTtsProviderConfigured(config, candidate)); + .filter((candidate) => isTtsProviderConfigured(config, candidate, cfg)); respond(true, { enabled: isTtsEnabled(config, prefsPath), auto: autoMode, @@ -38,7 +39,7 @@ export const ttsHandlers: GatewayRequestHandlers = { prefsPath, hasOpenAIKey: Boolean(resolveTtsApiKey(config, "openai")), hasElevenLabsKey: Boolean(resolveTtsApiKey(config, "elevenlabs")), - edgeEnabled: isTtsProviderConfigured(config, "edge"), + microsoftEnabled: isTtsProviderConfigured(config, "microsoft", cfg), }); } catch (err) { respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err))); @@ -99,20 +100,23 @@ export const ttsHandlers: GatewayRequestHandlers = { } }, "tts.setProvider": async ({ params, respond }) => { - const provider = typeof params.provider === "string" ? params.provider.trim() : ""; - if (provider !== "openai" && provider !== "elevenlabs" && provider !== "edge") { + const provider = normalizeSpeechProviderId( + typeof params.provider === "string" ? params.provider.trim() : "", + ); + const cfg = loadConfig(); + const knownProviders = new Set(listSpeechProviders(cfg).map((entry) => entry.id)); + if (!provider || !knownProviders.has(provider)) { respond( false, undefined, errorShape( ErrorCodes.INVALID_REQUEST, - "Invalid provider. Use openai, elevenlabs, or edge.", + "Invalid provider. Use a registered TTS provider id such as openai, elevenlabs, or microsoft.", ), ); return; } try { - const cfg = loadConfig(); const config = resolveTtsConfig(cfg); const prefsPath = resolveTtsPrefsPath(config); setTtsProvider(prefsPath, provider); @@ -127,27 +131,19 @@ export const ttsHandlers: GatewayRequestHandlers = { const config = resolveTtsConfig(cfg); const prefsPath = resolveTtsPrefsPath(config); respond(true, { - providers: [ - { - id: "openai", - name: "OpenAI", - configured: Boolean(resolveTtsApiKey(config, "openai")), - models: [...OPENAI_TTS_MODELS], - voices: [...OPENAI_TTS_VOICES], - }, - { - id: "elevenlabs", - name: "ElevenLabs", - configured: Boolean(resolveTtsApiKey(config, "elevenlabs")), - models: ["eleven_multilingual_v2", "eleven_turbo_v2_5", "eleven_monolingual_v1"], - }, - { - id: "edge", - name: "Edge TTS", - configured: isTtsProviderConfigured(config, "edge"), - models: [], - }, - ], + providers: listSpeechProviders(cfg).map((provider) => ({ + id: provider.id, + name: provider.label, + configured: provider.isConfigured({ cfg, config }), + models: + provider.id === "openai" && provider.models == null + ? [...OPENAI_TTS_MODELS] + : [...(provider.models ?? [])], + voices: + provider.id === "openai" && provider.voices == null + ? [...OPENAI_TTS_VOICES] + : [...(provider.voices ?? [])], + })), active: getTtsProvider(config, prefsPath), }); } catch (err) { diff --git a/src/gateway/server-plugins.test.ts b/src/gateway/server-plugins.test.ts index 8e0d97a1580..58f5c9da4eb 100644 --- a/src/gateway/server-plugins.test.ts +++ b/src/gateway/server-plugins.test.ts @@ -29,6 +29,7 @@ const createRegistry = (diagnostics: PluginDiagnostic[]): PluginRegistry => ({ channelSetups: [], commands: [], providers: [], + speechProviders: [], webSearchProviders: [], gatewayHandlers: {}, httpRoutes: [], diff --git a/src/gateway/server.agent.gateway-server-agent.mocks.ts b/src/gateway/server.agent.gateway-server-agent.mocks.ts index acf507dbde2..f6b29fe041a 100644 --- a/src/gateway/server.agent.gateway-server-agent.mocks.ts +++ b/src/gateway/server.agent.gateway-server-agent.mocks.ts @@ -1,25 +1,9 @@ import { vi } from "vitest"; -import type { PluginRegistry } from "../plugins/registry.js"; +import { createEmptyPluginRegistry, type PluginRegistry } from "../plugins/registry.js"; import { setActivePluginRegistry } from "../plugins/runtime.js"; export const registryState: { registry: PluginRegistry } = { - registry: { - plugins: [], - tools: [], - hooks: [], - typedHooks: [], - channels: [], - channelSetups: [], - providers: [], - webSearchProviders: [], - gatewayHandlers: {}, - httpHandlers: [], - httpRoutes: [], - cliRegistrars: [], - services: [], - commands: [], - diagnostics: [], - } as PluginRegistry, + registry: createEmptyPluginRegistry(), }; export function setRegistry(registry: PluginRegistry) { diff --git a/src/gateway/test-helpers.mocks.ts b/src/gateway/test-helpers.mocks.ts index 4bfb7ef4e4d..e05fcc85320 100644 --- a/src/gateway/test-helpers.mocks.ts +++ b/src/gateway/test-helpers.mocks.ts @@ -146,6 +146,7 @@ const createStubPluginRegistry = (): PluginRegistry => ({ ], channelSetups: [], providers: [], + speechProviders: [], webSearchProviders: [], gatewayHandlers: {}, httpRoutes: [], diff --git a/src/plugin-sdk/core.ts b/src/plugin-sdk/core.ts index 0c521f84122..00621521067 100644 --- a/src/plugin-sdk/core.ts +++ b/src/plugin-sdk/core.ts @@ -21,6 +21,7 @@ export type { ProviderResolveDynamicModelContext, ProviderNormalizeResolvedModelContext, ProviderRuntimeModel, + SpeechProviderPlugin, ProviderThinkingPolicyContext, ProviderWrapStreamFnContext, OpenClawPluginService, diff --git a/src/plugin-sdk/index.ts b/src/plugin-sdk/index.ts index 721e9da25e6..07b51661d2d 100644 --- a/src/plugin-sdk/index.ts +++ b/src/plugin-sdk/index.ts @@ -140,6 +140,7 @@ export type { ProviderResolveDynamicModelContext, ProviderNormalizeResolvedModelContext, ProviderRuntimeModel, + SpeechProviderPlugin, ProviderThinkingPolicyContext, ProviderWrapStreamFnContext, } from "../plugins/types.js"; diff --git a/src/plugins/loader.ts b/src/plugins/loader.ts index e86f846b5d8..a2e05fc06b9 100644 --- a/src/plugins/loader.ts +++ b/src/plugins/loader.ts @@ -494,6 +494,7 @@ function createPluginRecord(params: { hookNames: [], channelIds: [], providerIds: [], + speechProviderIds: [], webSearchProviderIds: [], gatewayMethods: [], cliCommands: [], diff --git a/src/plugins/registry.ts b/src/plugins/registry.ts index fabf9fa1069..231e6f267aa 100644 --- a/src/plugins/registry.ts +++ b/src/plugins/registry.ts @@ -46,6 +46,7 @@ import type { PluginHookName, PluginHookHandlerMap, PluginHookRegistration as TypedPluginHookRegistration, + SpeechProviderPlugin, WebSearchProviderPlugin, } from "./types.js"; @@ -110,6 +111,14 @@ export type PluginWebSearchProviderRegistration = { rootDir?: string; }; +export type PluginSpeechProviderRegistration = { + pluginId: string; + pluginName?: string; + provider: SpeechProviderPlugin; + source: string; + rootDir?: string; +}; + export type PluginHookRegistration = { pluginId: string; entry: HookEntry; @@ -154,6 +163,7 @@ export type PluginRecord = { hookNames: string[]; channelIds: string[]; providerIds: string[]; + speechProviderIds: string[]; webSearchProviderIds: string[]; gatewayMethods: string[]; cliCommands: string[]; @@ -174,6 +184,7 @@ export type PluginRegistry = { channels: PluginChannelRegistration[]; channelSetups: PluginChannelSetupRegistration[]; providers: PluginProviderRegistration[]; + speechProviders: PluginSpeechProviderRegistration[]; webSearchProviders: PluginWebSearchProviderRegistration[]; gatewayHandlers: GatewayRequestHandlers; httpRoutes: PluginHttpRouteRegistration[]; @@ -219,6 +230,7 @@ export function createEmptyPluginRegistry(): PluginRegistry { channels: [], channelSetups: [], providers: [], + speechProviders: [], webSearchProviders: [], gatewayHandlers: {}, httpRoutes: [], @@ -550,6 +562,37 @@ export function createPluginRegistry(registryParams: PluginRegistryParams) { }); }; + const registerSpeechProvider = (record: PluginRecord, provider: SpeechProviderPlugin) => { + const id = provider.id.trim(); + if (!id) { + pushDiagnostic({ + level: "error", + pluginId: record.id, + source: record.source, + message: "speech provider registration missing id", + }); + return; + } + const existing = registry.speechProviders.find((entry) => entry.provider.id === id); + if (existing) { + pushDiagnostic({ + level: "error", + pluginId: record.id, + source: record.source, + message: `speech provider already registered: ${id} (${existing.pluginId})`, + }); + return; + } + record.speechProviderIds.push(id); + registry.speechProviders.push({ + pluginId: record.id, + pluginName: record.name, + provider, + source: record.source, + rootDir: record.rootDir, + }); + }; + const registerWebSearchProvider = (record: PluginRecord, provider: WebSearchProviderPlugin) => { const id = provider.id.trim(); if (!id) { @@ -789,6 +832,10 @@ export function createPluginRegistry(registryParams: PluginRegistryParams) { registerChannel: (registration) => registerChannel(record, registration, registrationMode), registerProvider: registrationMode === "full" ? (provider) => registerProvider(record, provider) : () => {}, + registerSpeechProvider: + registrationMode === "full" + ? (provider) => registerSpeechProvider(record, provider) + : () => {}, registerWebSearchProvider: registrationMode === "full" ? (provider) => registerWebSearchProvider(record, provider) @@ -862,6 +909,7 @@ export function createPluginRegistry(registryParams: PluginRegistryParams) { registerTool, registerChannel, registerProvider, + registerSpeechProvider, registerWebSearchProvider, registerGatewayMethod, registerCli, diff --git a/src/plugins/types.ts b/src/plugins/types.ts index 0c817a99cf8..2a2e2b9fd5f 100644 --- a/src/plugins/types.ts +++ b/src/plugins/types.ts @@ -27,6 +27,14 @@ import type { HookEntry } from "../hooks/types.js"; import type { ProviderUsageSnapshot } from "../infra/provider-usage.types.js"; import type { RuntimeEnv } from "../runtime.js"; import type { RuntimeWebSearchMetadata } from "../secrets/runtime-web-tools.types.js"; +import type { + SpeechProviderConfiguredContext, + SpeechProviderId, + SpeechSynthesisRequest, + SpeechSynthesisResult, + SpeechTelephonySynthesisRequest, + SpeechTelephonySynthesisResult, +} from "../tts/provider-types.js"; import type { WizardPrompter } from "../wizard/prompts.js"; import type { PluginRuntime } from "./runtime/types.js"; @@ -853,6 +861,23 @@ export type PluginWebSearchProviderEntry = WebSearchProviderPlugin & { pluginId: string; }; +export type SpeechProviderPlugin = { + id: SpeechProviderId; + label: string; + aliases?: string[]; + models?: readonly string[]; + voices?: readonly string[]; + isConfigured: (ctx: SpeechProviderConfiguredContext) => boolean; + synthesize: (req: SpeechSynthesisRequest) => Promise; + synthesizeTelephony?: ( + req: SpeechTelephonySynthesisRequest, + ) => Promise; +}; + +export type PluginSpeechProviderEntry = SpeechProviderPlugin & { + pluginId: string; +}; + export type OpenClawPluginGatewayMethod = { method: string; handler: GatewayRequestHandler; @@ -1211,6 +1236,7 @@ export type OpenClawPluginApi = { registerCli: (registrar: OpenClawPluginCliRegistrar, opts?: { commands?: string[] }) => void; registerService: (service: OpenClawPluginService) => void; registerProvider: (provider: ProviderPlugin) => void; + registerSpeechProvider: (provider: SpeechProviderPlugin) => void; registerWebSearchProvider: (provider: WebSearchProviderPlugin) => void; registerInteractiveHandler: (registration: PluginInteractiveHandlerRegistration) => void; /** diff --git a/src/test-utils/channel-plugins.ts b/src/test-utils/channel-plugins.ts index 4f52350f8fc..588c1ca7db6 100644 --- a/src/test-utils/channel-plugins.ts +++ b/src/test-utils/channel-plugins.ts @@ -26,6 +26,7 @@ export const createTestRegistry = (channels: TestChannelRegistration[] = []): Pl enabled: true, })), providers: [], + speechProviders: [], webSearchProviders: [], gatewayHandlers: {}, httpRoutes: [], diff --git a/src/test-utils/plugin-registration.ts b/src/test-utils/plugin-registration.ts index e17e4a2520d..6231dedf17b 100644 --- a/src/test-utils/plugin-registration.ts +++ b/src/test-utils/plugin-registration.ts @@ -2,29 +2,36 @@ import type { AnyAgentTool, OpenClawPluginApi, ProviderPlugin, + SpeechProviderPlugin, WebSearchProviderPlugin, } from "../plugins/types.js"; export type CapturedPluginRegistration = { api: OpenClawPluginApi; providers: ProviderPlugin[]; + speechProviders: SpeechProviderPlugin[]; webSearchProviders: WebSearchProviderPlugin[]; tools: AnyAgentTool[]; }; export function createCapturedPluginRegistration(): CapturedPluginRegistration { const providers: ProviderPlugin[] = []; + const speechProviders: SpeechProviderPlugin[] = []; const webSearchProviders: WebSearchProviderPlugin[] = []; const tools: AnyAgentTool[] = []; return { providers, + speechProviders, webSearchProviders, tools, api: { registerProvider(provider: ProviderPlugin) { providers.push(provider); }, + registerSpeechProvider(provider: SpeechProviderPlugin) { + speechProviders.push(provider); + }, registerWebSearchProvider(provider: WebSearchProviderPlugin) { webSearchProviders.push(provider); }, diff --git a/src/tts/provider-registry.ts b/src/tts/provider-registry.ts new file mode 100644 index 00000000000..ee60764aa4d --- /dev/null +++ b/src/tts/provider-registry.ts @@ -0,0 +1,84 @@ +import type { OpenClawConfig } from "../config/config.js"; +import { loadOpenClawPlugins } from "../plugins/loader.js"; +import { getActivePluginRegistry } from "../plugins/runtime.js"; +import type { SpeechProviderPlugin } from "../plugins/types.js"; +import type { SpeechProviderId } from "./provider-types.js"; +import { buildElevenLabsSpeechProvider } from "./providers/elevenlabs.js"; +import { buildMicrosoftSpeechProvider } from "./providers/microsoft.js"; +import { buildOpenAISpeechProvider } from "./providers/openai.js"; + +const BUILTIN_SPEECH_PROVIDERS: readonly SpeechProviderPlugin[] = [ + buildOpenAISpeechProvider(), + buildElevenLabsSpeechProvider(), + buildMicrosoftSpeechProvider(), +]; + +function trimToUndefined(value: string | undefined): string | undefined { + const trimmed = value?.trim().toLowerCase(); + return trimmed ? trimmed : undefined; +} + +export function normalizeSpeechProviderId( + providerId: string | undefined, +): SpeechProviderId | undefined { + const normalized = trimToUndefined(providerId); + if (!normalized) { + return undefined; + } + return normalized === "edge" ? "microsoft" : normalized; +} + +function resolveSpeechProviderPluginEntries(cfg?: OpenClawConfig): SpeechProviderPlugin[] { + const active = getActivePluginRegistry(); + const registry = + (active?.speechProviders?.length ?? 0) > 0 || !cfg + ? active + : loadOpenClawPlugins({ config: cfg }); + return registry?.speechProviders?.map((entry) => entry.provider) ?? []; +} + +function buildProviderMaps(cfg?: OpenClawConfig): { + canonical: Map; + aliases: Map; +} { + const canonical = new Map(); + const aliases = new Map(); + const register = (provider: SpeechProviderPlugin) => { + const id = normalizeSpeechProviderId(provider.id); + if (!id) { + return; + } + canonical.set(id, provider); + aliases.set(id, provider); + for (const alias of provider.aliases ?? []) { + const normalizedAlias = normalizeSpeechProviderId(alias); + if (normalizedAlias) { + aliases.set(normalizedAlias, provider); + } + } + }; + + for (const provider of BUILTIN_SPEECH_PROVIDERS) { + register(provider); + } + for (const provider of resolveSpeechProviderPluginEntries(cfg)) { + register(provider); + } + + return { canonical, aliases }; +} + +export function listSpeechProviders(cfg?: OpenClawConfig): SpeechProviderPlugin[] { + return [...buildProviderMaps(cfg).canonical.values()]; +} + +export function getSpeechProvider( + providerId: string | undefined, + cfg?: OpenClawConfig, +): SpeechProviderPlugin | undefined { + const normalized = normalizeSpeechProviderId(providerId); + if (!normalized) { + return undefined; + } + return buildProviderMaps(cfg).aliases.get(normalized); +} diff --git a/src/tts/provider-types.ts b/src/tts/provider-types.ts new file mode 100644 index 00000000000..bfbeb38f02a --- /dev/null +++ b/src/tts/provider-types.ts @@ -0,0 +1,38 @@ +import type { OpenClawConfig } from "../config/config.js"; +import type { ResolvedTtsConfig, TtsDirectiveOverrides } from "./tts.js"; + +export type SpeechProviderId = string; + +export type SpeechSynthesisTarget = "audio-file" | "voice-note"; + +export type SpeechProviderConfiguredContext = { + cfg?: OpenClawConfig; + config: ResolvedTtsConfig; +}; + +export type SpeechSynthesisRequest = { + text: string; + cfg: OpenClawConfig; + config: ResolvedTtsConfig; + target: SpeechSynthesisTarget; + overrides?: TtsDirectiveOverrides; +}; + +export type SpeechSynthesisResult = { + audioBuffer: Buffer; + outputFormat: string; + fileExtension: string; + voiceCompatible: boolean; +}; + +export type SpeechTelephonySynthesisRequest = { + text: string; + cfg: OpenClawConfig; + config: ResolvedTtsConfig; +}; + +export type SpeechTelephonySynthesisResult = { + audioBuffer: Buffer; + outputFormat: string; + sampleRate: number; +}; diff --git a/src/tts/providers/elevenlabs.ts b/src/tts/providers/elevenlabs.ts new file mode 100644 index 00000000000..2b6df133edc --- /dev/null +++ b/src/tts/providers/elevenlabs.ts @@ -0,0 +1,73 @@ +import type { SpeechProviderPlugin } from "../../plugins/types.js"; +import { elevenLabsTTS } from "../tts-core.js"; + +const ELEVENLABS_TTS_MODELS = [ + "eleven_multilingual_v2", + "eleven_turbo_v2_5", + "eleven_monolingual_v1", +] as const; + +export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin { + return { + id: "elevenlabs", + label: "ElevenLabs", + models: ELEVENLABS_TTS_MODELS, + isConfigured: ({ config }) => + Boolean(config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY), + synthesize: async (req) => { + const apiKey = + req.config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY; + if (!apiKey) { + throw new Error("ElevenLabs API key missing"); + } + const outputFormat = req.target === "voice-note" ? "opus_48000_64" : "mp3_44100_128"; + const audioBuffer = await elevenLabsTTS({ + text: req.text, + apiKey, + baseUrl: req.config.elevenlabs.baseUrl, + voiceId: req.overrides?.elevenlabs?.voiceId ?? req.config.elevenlabs.voiceId, + modelId: req.overrides?.elevenlabs?.modelId ?? req.config.elevenlabs.modelId, + outputFormat, + seed: req.overrides?.elevenlabs?.seed ?? req.config.elevenlabs.seed, + applyTextNormalization: + req.overrides?.elevenlabs?.applyTextNormalization ?? + req.config.elevenlabs.applyTextNormalization, + languageCode: req.overrides?.elevenlabs?.languageCode ?? req.config.elevenlabs.languageCode, + voiceSettings: { + ...req.config.elevenlabs.voiceSettings, + ...req.overrides?.elevenlabs?.voiceSettings, + }, + timeoutMs: req.config.timeoutMs, + }); + return { + audioBuffer, + outputFormat, + fileExtension: req.target === "voice-note" ? ".opus" : ".mp3", + voiceCompatible: req.target === "voice-note", + }; + }, + synthesizeTelephony: async (req) => { + const apiKey = + req.config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY; + if (!apiKey) { + throw new Error("ElevenLabs API key missing"); + } + const outputFormat = "pcm_22050"; + const sampleRate = 22_050; + const audioBuffer = await elevenLabsTTS({ + text: req.text, + apiKey, + baseUrl: req.config.elevenlabs.baseUrl, + voiceId: req.config.elevenlabs.voiceId, + modelId: req.config.elevenlabs.modelId, + outputFormat, + seed: req.config.elevenlabs.seed, + applyTextNormalization: req.config.elevenlabs.applyTextNormalization, + languageCode: req.config.elevenlabs.languageCode, + voiceSettings: req.config.elevenlabs.voiceSettings, + timeoutMs: req.config.timeoutMs, + }); + return { audioBuffer, outputFormat, sampleRate }; + }, + }; +} diff --git a/src/tts/providers/microsoft.ts b/src/tts/providers/microsoft.ts new file mode 100644 index 00000000000..ee31e35a204 --- /dev/null +++ b/src/tts/providers/microsoft.ts @@ -0,0 +1,60 @@ +import { mkdirSync, mkdtempSync, readFileSync, rmSync } from "node:fs"; +import path from "node:path"; +import { resolvePreferredOpenClawTmpDir } from "../../infra/tmp-openclaw-dir.js"; +import { isVoiceCompatibleAudio } from "../../media/audio.js"; +import type { SpeechProviderPlugin } from "../../plugins/types.js"; +import { edgeTTS, inferEdgeExtension } from "../tts-core.js"; + +const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3"; + +export function buildMicrosoftSpeechProvider(): SpeechProviderPlugin { + return { + id: "microsoft", + label: "Microsoft", + aliases: ["edge"], + isConfigured: ({ config }) => config.edge.enabled, + synthesize: async (req) => { + const tempRoot = resolvePreferredOpenClawTmpDir(); + mkdirSync(tempRoot, { recursive: true, mode: 0o700 }); + const tempDir = mkdtempSync(path.join(tempRoot, "tts-microsoft-")); + let outputFormat = req.config.edge.outputFormat; + const fallbackOutputFormat = + outputFormat !== DEFAULT_EDGE_OUTPUT_FORMAT ? DEFAULT_EDGE_OUTPUT_FORMAT : undefined; + + try { + const runEdge = async (format: string) => { + const fileExtension = inferEdgeExtension(format); + const outputPath = path.join(tempDir, `speech${fileExtension}`); + await edgeTTS({ + text: req.text, + outputPath, + config: { + ...req.config.edge, + outputFormat: format, + }, + timeoutMs: req.config.timeoutMs, + }); + const audioBuffer = readFileSync(outputPath); + return { + audioBuffer, + outputFormat: format, + fileExtension, + voiceCompatible: isVoiceCompatibleAudio({ fileName: outputPath }), + }; + }; + + try { + return await runEdge(outputFormat); + } catch (err) { + if (!fallbackOutputFormat || fallbackOutputFormat === outputFormat) { + throw err; + } + outputFormat = fallbackOutputFormat; + return await runEdge(outputFormat); + } + } finally { + rmSync(tempDir, { recursive: true, force: true }); + } + }, + }; +} diff --git a/src/tts/providers/openai.ts b/src/tts/providers/openai.ts new file mode 100644 index 00000000000..bf52c1644a9 --- /dev/null +++ b/src/tts/providers/openai.ts @@ -0,0 +1,56 @@ +import type { SpeechProviderPlugin } from "../../plugins/types.js"; +import { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, openaiTTS } from "../tts-core.js"; + +export function buildOpenAISpeechProvider(): SpeechProviderPlugin { + return { + id: "openai", + label: "OpenAI", + models: OPENAI_TTS_MODELS, + voices: OPENAI_TTS_VOICES, + isConfigured: ({ config }) => Boolean(config.openai.apiKey || process.env.OPENAI_API_KEY), + synthesize: async (req) => { + const apiKey = req.config.openai.apiKey || process.env.OPENAI_API_KEY; + if (!apiKey) { + throw new Error("OpenAI API key missing"); + } + const responseFormat = req.target === "voice-note" ? "opus" : "mp3"; + const audioBuffer = await openaiTTS({ + text: req.text, + apiKey, + baseUrl: req.config.openai.baseUrl, + model: req.overrides?.openai?.model ?? req.config.openai.model, + voice: req.overrides?.openai?.voice ?? req.config.openai.voice, + speed: req.config.openai.speed, + instructions: req.config.openai.instructions, + responseFormat, + timeoutMs: req.config.timeoutMs, + }); + return { + audioBuffer, + outputFormat: responseFormat, + fileExtension: responseFormat === "opus" ? ".opus" : ".mp3", + voiceCompatible: req.target === "voice-note", + }; + }, + synthesizeTelephony: async (req) => { + const apiKey = req.config.openai.apiKey || process.env.OPENAI_API_KEY; + if (!apiKey) { + throw new Error("OpenAI API key missing"); + } + const outputFormat = "pcm"; + const sampleRate = 24_000; + const audioBuffer = await openaiTTS({ + text: req.text, + apiKey, + baseUrl: req.config.openai.baseUrl, + model: req.config.openai.model, + voice: req.config.openai.voice, + speed: req.config.openai.speed, + instructions: req.config.openai.instructions, + responseFormat: outputFormat, + timeoutMs: req.config.timeoutMs, + }); + return { audioBuffer, outputFormat, sampleRate }; + }, + }; +} diff --git a/src/tts/tts-core.ts b/src/tts/tts-core.ts index 5d3000d7ad3..7bdc8f56288 100644 --- a/src/tts/tts-core.ts +++ b/src/tts/tts-core.ts @@ -156,10 +156,13 @@ export function parseTtsDirectives( if (!policy.allowProvider) { break; } - if (rawValue === "openai" || rawValue === "elevenlabs" || rawValue === "edge") { - overrides.provider = rawValue; - } else { - warnings.push(`unsupported provider "${rawValue}"`); + { + const providerId = rawValue.trim().toLowerCase(); + if (providerId) { + overrides.provider = providerId; + } else { + warnings.push("invalid provider id"); + } } break; case "voice": diff --git a/src/tts/tts.test.ts b/src/tts/tts.test.ts index 8b232ed034d..16b91b6f330 100644 --- a/src/tts/tts.test.ts +++ b/src/tts/tts.test.ts @@ -311,7 +311,7 @@ describe("tts", () => { expect(result.overrides.elevenlabs?.voiceSettings?.speed).toBe(1.1); }); - it("accepts edge as provider override", () => { + it("accepts edge as a legacy microsoft provider override", () => { const policy = resolveModelOverridePolicy({ enabled: true, allowProvider: true }); const input = "Hello [[tts:provider=edge]] world"; const result = parseTtsDirectives(input, policy); @@ -524,8 +524,8 @@ describe("tts", () => { ELEVENLABS_API_KEY: undefined, XI_API_KEY: undefined, }, - prefsPath: "/tmp/tts-prefs-edge.json", - expected: "edge", + prefsPath: "/tmp/tts-prefs-microsoft.json", + expected: "microsoft", }, ] as const; @@ -539,6 +539,25 @@ describe("tts", () => { }); }); + describe("resolveTtsConfig provider normalization", () => { + it("normalizes legacy edge provider ids to microsoft", () => { + const config = resolveTtsConfig({ + agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } }, + messages: { + tts: { + provider: "edge", + edge: { + enabled: true, + }, + }, + }, + }); + + expect(config.provider).toBe("microsoft"); + expect(getTtsProvider(config, "/tmp/tts-prefs-normalized.json")).toBe("microsoft"); + }); + }); + describe("resolveTtsConfig – openai.baseUrl", () => { const baseCfg: OpenClawConfig = { agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } }, diff --git a/src/tts/tts.ts b/src/tts/tts.ts index 403efc10543..44cb57fd6e8 100644 --- a/src/tts/tts.ts +++ b/src/tts/tts.ts @@ -5,7 +5,6 @@ import { readFileSync, writeFileSync, mkdtempSync, - rmSync, renameSync, unlinkSync, } from "node:fs"; @@ -25,20 +24,20 @@ import type { import { logVerbose } from "../globals.js"; import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js"; import { stripMarkdown } from "../line/markdown-to-line.js"; -import { isVoiceCompatibleAudio } from "../media/audio.js"; import { CONFIG_DIR, resolveUserPath } from "../utils.js"; +import { + getSpeechProvider, + listSpeechProviders, + normalizeSpeechProviderId, +} from "./provider-registry.js"; import { DEFAULT_OPENAI_BASE_URL, - edgeTTS, - elevenLabsTTS, - inferEdgeExtension, isValidOpenAIModel, isValidOpenAIVoice, isValidVoiceId, OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, resolveOpenAITtsInstructions, - openaiTTS, parseTtsDirectives, scheduleCleanup, summarizeText, @@ -83,11 +82,6 @@ const DEFAULT_OUTPUT = { voiceCompatible: false, }; -const TELEPHONY_OUTPUT = { - openai: { format: "pcm" as const, sampleRate: 24000 }, - elevenlabs: { format: "pcm_22050", sampleRate: 22050 }, -}; - const TTS_AUTO_MODES = new Set(["off", "always", "inbound", "tagged"]); export type ResolvedTtsConfig = { @@ -261,12 +255,13 @@ function resolveModelOverridePolicy( export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig { const raw: TtsConfig = cfg.messages?.tts ?? {}; const providerSource = raw.provider ? "config" : "default"; - const edgeOutputFormat = raw.edge?.outputFormat?.trim(); + const rawMicrosoft = { ...raw.edge, ...raw.microsoft }; + const edgeOutputFormat = rawMicrosoft.outputFormat?.trim(); const auto = normalizeTtsAutoMode(raw.auto) ?? (raw.enabled ? "always" : "off"); return { auto, mode: raw.mode ?? "final", - provider: raw.provider ?? "edge", + provider: normalizeSpeechProviderId(raw.provider) ?? "microsoft", providerSource, summaryModel: raw.summaryModel?.trim() || undefined, modelOverrides: resolveModelOverridePolicy(raw.modelOverrides), @@ -311,17 +306,17 @@ export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig { instructions: raw.openai?.instructions?.trim() || undefined, }, edge: { - enabled: raw.edge?.enabled ?? true, - voice: raw.edge?.voice?.trim() || DEFAULT_EDGE_VOICE, - lang: raw.edge?.lang?.trim() || DEFAULT_EDGE_LANG, + enabled: rawMicrosoft.enabled ?? true, + voice: rawMicrosoft.voice?.trim() || DEFAULT_EDGE_VOICE, + lang: rawMicrosoft.lang?.trim() || DEFAULT_EDGE_LANG, outputFormat: edgeOutputFormat || DEFAULT_EDGE_OUTPUT_FORMAT, outputFormatConfigured: Boolean(edgeOutputFormat), - pitch: raw.edge?.pitch?.trim() || undefined, - rate: raw.edge?.rate?.trim() || undefined, - volume: raw.edge?.volume?.trim() || undefined, - saveSubtitles: raw.edge?.saveSubtitles ?? false, - proxy: raw.edge?.proxy?.trim() || undefined, - timeoutMs: raw.edge?.timeoutMs, + pitch: rawMicrosoft.pitch?.trim() || undefined, + rate: rawMicrosoft.rate?.trim() || undefined, + volume: rawMicrosoft.volume?.trim() || undefined, + saveSubtitles: rawMicrosoft.saveSubtitles ?? false, + proxy: rawMicrosoft.proxy?.trim() || undefined, + timeoutMs: rawMicrosoft.timeoutMs, }, prefsPath: raw.prefsPath, maxTextLength: raw.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH, @@ -448,11 +443,12 @@ export function setTtsEnabled(prefsPath: string, enabled: boolean): void { export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): TtsProvider { const prefs = readPrefs(prefsPath); - if (prefs.tts?.provider) { - return prefs.tts.provider; + const prefsProvider = normalizeSpeechProviderId(prefs.tts?.provider); + if (prefsProvider) { + return prefsProvider; } if (config.providerSource === "config") { - return config.provider; + return normalizeSpeechProviderId(config.provider) ?? config.provider; } if (resolveTtsApiKey(config, "openai")) { @@ -461,12 +457,12 @@ export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): Tt if (resolveTtsApiKey(config, "elevenlabs")) { return "elevenlabs"; } - return "edge"; + return "microsoft"; } export function setTtsProvider(prefsPath: string, provider: TtsProvider): void { updatePrefs(prefsPath, (prefs) => { - prefs.tts = { ...prefs.tts, provider }; + prefs.tts = { ...prefs.tts, provider: normalizeSpeechProviderId(provider) ?? provider }; }); } @@ -522,26 +518,42 @@ export function resolveTtsApiKey( config: ResolvedTtsConfig, provider: TtsProvider, ): string | undefined { - if (provider === "elevenlabs") { + const normalizedProvider = normalizeSpeechProviderId(provider); + if (normalizedProvider === "elevenlabs") { return config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY; } - if (provider === "openai") { + if (normalizedProvider === "openai") { return config.openai.apiKey || process.env.OPENAI_API_KEY; } return undefined; } -export const TTS_PROVIDERS = ["openai", "elevenlabs", "edge"] as const; +export const TTS_PROVIDERS = ["openai", "elevenlabs", "microsoft"] as const; -export function resolveTtsProviderOrder(primary: TtsProvider): TtsProvider[] { - return [primary, ...TTS_PROVIDERS.filter((provider) => provider !== primary)]; +export function resolveTtsProviderOrder(primary: TtsProvider, cfg?: OpenClawConfig): TtsProvider[] { + const normalizedPrimary = normalizeSpeechProviderId(primary) ?? primary; + const ordered = new Set([normalizedPrimary]); + for (const provider of TTS_PROVIDERS) { + if (provider !== normalizedPrimary) { + ordered.add(provider); + } + } + for (const provider of listSpeechProviders(cfg)) { + const normalized = normalizeSpeechProviderId(provider.id) ?? provider.id; + if (normalized !== normalizedPrimary) { + ordered.add(normalized); + } + } + return [...ordered]; } -export function isTtsProviderConfigured(config: ResolvedTtsConfig, provider: TtsProvider): boolean { - if (provider === "edge") { - return config.edge.enabled; - } - return Boolean(resolveTtsApiKey(config, provider)); +export function isTtsProviderConfigured( + config: ResolvedTtsConfig, + provider: TtsProvider, + cfg?: OpenClawConfig, +): boolean { + const resolvedProvider = getSpeechProvider(provider, cfg); + return resolvedProvider?.isConfigured({ cfg, config }) ?? false; } function formatTtsProviderError(provider: TtsProvider, err: unknown): string { @@ -581,10 +593,10 @@ function resolveTtsRequestSetup(params: { } const userProvider = getTtsProvider(config, prefsPath); - const provider = params.providerOverride ?? userProvider; + const provider = normalizeSpeechProviderId(params.providerOverride) ?? userProvider; return { config, - providers: resolveTtsProviderOrder(provider), + providers: resolveTtsProviderOrder(provider, params.cfg), }; } @@ -607,136 +619,36 @@ export async function textToSpeech(params: { const { config, providers } = setup; const channelId = resolveChannelId(params.channel); - const output = resolveOutputFormat(channelId); + const target = channelId && VOICE_BUBBLE_CHANNELS.has(channelId) ? "voice-note" : "audio-file"; const errors: string[] = []; for (const provider of providers) { const providerStart = Date.now(); try { - if (provider === "edge") { - if (!config.edge.enabled) { - errors.push("edge: disabled"); - continue; - } - - const tempRoot = resolvePreferredOpenClawTmpDir(); - mkdirSync(tempRoot, { recursive: true, mode: 0o700 }); - const tempDir = mkdtempSync(path.join(tempRoot, "tts-")); - let edgeOutputFormat = resolveEdgeOutputFormat(config); - const fallbackEdgeOutputFormat = - edgeOutputFormat !== DEFAULT_EDGE_OUTPUT_FORMAT ? DEFAULT_EDGE_OUTPUT_FORMAT : undefined; - - const attemptEdgeTts = async (outputFormat: string) => { - const extension = inferEdgeExtension(outputFormat); - const audioPath = path.join(tempDir, `voice-${Date.now()}${extension}`); - await edgeTTS({ - text: params.text, - outputPath: audioPath, - config: { - ...config.edge, - outputFormat, - }, - timeoutMs: config.timeoutMs, - }); - return { audioPath, outputFormat }; - }; - - let edgeResult: { audioPath: string; outputFormat: string }; - try { - edgeResult = await attemptEdgeTts(edgeOutputFormat); - } catch (err) { - if (fallbackEdgeOutputFormat && fallbackEdgeOutputFormat !== edgeOutputFormat) { - logVerbose( - `TTS: Edge output ${edgeOutputFormat} failed; retrying with ${fallbackEdgeOutputFormat}.`, - ); - edgeOutputFormat = fallbackEdgeOutputFormat; - try { - edgeResult = await attemptEdgeTts(edgeOutputFormat); - } catch (fallbackErr) { - try { - rmSync(tempDir, { recursive: true, force: true }); - } catch { - // ignore cleanup errors - } - throw fallbackErr; - } - } else { - try { - rmSync(tempDir, { recursive: true, force: true }); - } catch { - // ignore cleanup errors - } - throw err; - } - } - - scheduleCleanup(tempDir); - const voiceCompatible = isVoiceCompatibleAudio({ fileName: edgeResult.audioPath }); - - return { - success: true, - audioPath: edgeResult.audioPath, - latencyMs: Date.now() - providerStart, - provider, - outputFormat: edgeResult.outputFormat, - voiceCompatible, - }; - } - - const apiKey = resolveTtsApiKey(config, provider); - if (!apiKey) { - errors.push(`${provider}: no API key`); + const resolvedProvider = getSpeechProvider(provider, params.cfg); + if (!resolvedProvider) { + errors.push(`${provider}: no provider registered`); continue; } - - let audioBuffer: Buffer; - if (provider === "elevenlabs") { - const voiceIdOverride = params.overrides?.elevenlabs?.voiceId; - const modelIdOverride = params.overrides?.elevenlabs?.modelId; - const voiceSettings = { - ...config.elevenlabs.voiceSettings, - ...params.overrides?.elevenlabs?.voiceSettings, - }; - const seedOverride = params.overrides?.elevenlabs?.seed; - const normalizationOverride = params.overrides?.elevenlabs?.applyTextNormalization; - const languageOverride = params.overrides?.elevenlabs?.languageCode; - audioBuffer = await elevenLabsTTS({ - text: params.text, - apiKey, - baseUrl: config.elevenlabs.baseUrl, - voiceId: voiceIdOverride ?? config.elevenlabs.voiceId, - modelId: modelIdOverride ?? config.elevenlabs.modelId, - outputFormat: output.elevenlabs, - seed: seedOverride ?? config.elevenlabs.seed, - applyTextNormalization: normalizationOverride ?? config.elevenlabs.applyTextNormalization, - languageCode: languageOverride ?? config.elevenlabs.languageCode, - voiceSettings, - timeoutMs: config.timeoutMs, - }); - } else { - const openaiModelOverride = params.overrides?.openai?.model; - const openaiVoiceOverride = params.overrides?.openai?.voice; - audioBuffer = await openaiTTS({ - text: params.text, - apiKey, - baseUrl: config.openai.baseUrl, - model: openaiModelOverride ?? config.openai.model, - voice: openaiVoiceOverride ?? config.openai.voice, - speed: config.openai.speed, - instructions: config.openai.instructions, - responseFormat: output.openai, - timeoutMs: config.timeoutMs, - }); + if (!resolvedProvider.isConfigured({ cfg: params.cfg, config })) { + errors.push(`${provider}: not configured`); + continue; } - + const synthesis = await resolvedProvider.synthesize({ + text: params.text, + cfg: params.cfg, + config, + target, + overrides: params.overrides, + }); const latencyMs = Date.now() - providerStart; const tempRoot = resolvePreferredOpenClawTmpDir(); mkdirSync(tempRoot, { recursive: true, mode: 0o700 }); const tempDir = mkdtempSync(path.join(tempRoot, "tts-")); - const audioPath = path.join(tempDir, `voice-${Date.now()}${output.extension}`); - writeFileSync(audioPath, audioBuffer); + const audioPath = path.join(tempDir, `voice-${Date.now()}${synthesis.fileExtension}`); + writeFileSync(audioPath, synthesis.audioBuffer); scheduleCleanup(tempDir); return { @@ -744,8 +656,8 @@ export async function textToSpeech(params: { audioPath, latencyMs, provider, - outputFormat: provider === "openai" ? output.openai : output.elevenlabs, - voiceCompatible: output.voiceCompatible, + outputFormat: synthesis.outputFormat, + voiceCompatible: synthesis.voiceCompatible, }; } catch (err) { errors.push(formatTtsProviderError(provider, err)); @@ -776,63 +688,32 @@ export async function textToSpeechTelephony(params: { for (const provider of providers) { const providerStart = Date.now(); try { - if (provider === "edge") { - errors.push("edge: unsupported for telephony"); + const resolvedProvider = getSpeechProvider(provider, params.cfg); + if (!resolvedProvider) { + errors.push(`${provider}: no provider registered`); continue; } - - const apiKey = resolveTtsApiKey(config, provider); - if (!apiKey) { - errors.push(`${provider}: no API key`); + if (!resolvedProvider.isConfigured({ cfg: params.cfg, config })) { + errors.push(`${provider}: not configured`); continue; } - - if (provider === "elevenlabs") { - const output = TELEPHONY_OUTPUT.elevenlabs; - const audioBuffer = await elevenLabsTTS({ - text: params.text, - apiKey, - baseUrl: config.elevenlabs.baseUrl, - voiceId: config.elevenlabs.voiceId, - modelId: config.elevenlabs.modelId, - outputFormat: output.format, - seed: config.elevenlabs.seed, - applyTextNormalization: config.elevenlabs.applyTextNormalization, - languageCode: config.elevenlabs.languageCode, - voiceSettings: config.elevenlabs.voiceSettings, - timeoutMs: config.timeoutMs, - }); - - return { - success: true, - audioBuffer, - latencyMs: Date.now() - providerStart, - provider, - outputFormat: output.format, - sampleRate: output.sampleRate, - }; + if (!resolvedProvider.synthesizeTelephony) { + errors.push(`${provider}: unsupported for telephony`); + continue; } - - const output = TELEPHONY_OUTPUT.openai; - const audioBuffer = await openaiTTS({ + const synthesis = await resolvedProvider.synthesizeTelephony({ text: params.text, - apiKey, - baseUrl: config.openai.baseUrl, - model: config.openai.model, - voice: config.openai.voice, - speed: config.openai.speed, - instructions: config.openai.instructions, - responseFormat: output.format, - timeoutMs: config.timeoutMs, + cfg: params.cfg, + config, }); return { success: true, - audioBuffer, + audioBuffer: synthesis.audioBuffer, latencyMs: Date.now() - providerStart, provider, - outputFormat: output.format, - sampleRate: output.sampleRate, + outputFormat: synthesis.outputFormat, + sampleRate: synthesis.sampleRate, }; } catch (err) { errors.push(formatTtsProviderError(provider, err));