From 33b95fed9aa147324207b580f89176e5c96f84fa Mon Sep 17 00:00:00 2001 From: Yobo Date: Fri, 20 Mar 2026 16:25:28 -0700 Subject: [PATCH] feat(tts): add Azure Speech TTS provider - Add Azure TTS provider with SSML synthesis - Support for 400+ neural voices including Cantonese (zh-HK) - Config options: apiKey, region, baseUrl, voice, lang, outputFormat, timeoutMs - Environment variables: AZURE_SPEECH_API_KEY, AZURE_SPEECH_REGION - Provider ID: 'azure' with alias 'azure-tts' - Example voices (none is built in; `voice` must be configured): zh-HK-HiuMaanNeural, zh-HK-HiuGaaiNeural --- src/config/types.tts.ts | 10 +++ src/config/zod-schema.core.ts | 13 ++++ src/tts/provider-registry.ts | 2 + src/tts/providers/azure.ts | 139 ++++++++++++++++++++++++++++++++++ 4 files changed, 164 insertions(+) create mode 100644 src/tts/providers/azure.ts diff --git a/src/config/types.tts.ts b/src/config/types.tts.ts index 4703f43ae12..eaf0c35d242 100644 --- a/src/config/types.tts.ts +++ b/src/config/types.tts.ts @@ -93,6 +93,16 @@ export type TtsConfig = { proxy?: string; timeoutMs?: number; }; + /** Azure Speech configuration. */ + azure?: { + apiKey?: SecretInput; + region?: string; + baseUrl?: string; + voice?: string; + lang?: string; + outputFormat?: string; + timeoutMs?: number; + }; /** Optional path for local TTS user preferences JSON. */ prefsPath?: string; /** Hard cap for text sent to TTS (chars).
*/ diff --git a/src/config/zod-schema.core.ts b/src/config/zod-schema.core.ts index 22c589c8490..d12de3fff35 100644 --- a/src/config/zod-schema.core.ts +++ b/src/config/zod-schema.core.ts @@ -392,6 +392,18 @@ const TtsMicrosoftConfigSchema = z }) .strict() .optional(); +const TtsAzureConfigSchema = z + .object({ + apiKey: SecretInputSchema.optional().register(sensitive), + region: z.string().optional(), + baseUrl: z.string().optional(), + voice: z.string().optional(), + lang: z.string().optional(), + outputFormat: z.string().optional(), + timeoutMs: z.number().int().min(1000).max(120000).optional(), + }) + .strict() + .optional(); export const TtsConfigSchema = z .object({ auto: TtsAutoSchema.optional(), @@ -447,6 +459,7 @@ export const TtsConfigSchema = z .optional(), edge: TtsMicrosoftConfigSchema, microsoft: TtsMicrosoftConfigSchema, + azure: TtsAzureConfigSchema, prefsPath: z.string().optional(), maxTextLength: z.number().int().min(1).optional(), timeoutMs: z.number().int().min(1000).max(120000).optional(), diff --git a/src/tts/provider-registry.ts b/src/tts/provider-registry.ts index d1462880a99..88a99b613d7 100644 --- a/src/tts/provider-registry.ts +++ b/src/tts/provider-registry.ts @@ -3,6 +3,7 @@ import { loadOpenClawPlugins } from "../plugins/loader.js"; import { getActivePluginRegistry } from "../plugins/runtime.js"; import type { SpeechProviderPlugin } from "../plugins/types.js"; import type { SpeechProviderId } from "./provider-types.js"; +import { buildAzureSpeechProvider } from "./providers/azure.js"; import { buildElevenLabsSpeechProvider } from "./providers/elevenlabs.js"; import { buildMicrosoftSpeechProvider } from "./providers/microsoft.js"; import { buildOpenAISpeechProvider } from "./providers/openai.js"; @@ -11,6 +12,7 @@ const BUILTIN_SPEECH_PROVIDER_BUILDERS = [ buildOpenAISpeechProvider, buildElevenLabsSpeechProvider, buildMicrosoftSpeechProvider, + buildAzureSpeechProvider, ] as const satisfies readonly (() => SpeechProviderPlugin)[]; 
function trimToUndefined(value: string | undefined): string | undefined {
diff --git a/src/tts/providers/azure.ts b/src/tts/providers/azure.ts
new file mode 100644
index 00000000000..04db3b4fa82
--- /dev/null
+++ b/src/tts/providers/azure.ts
@@ -0,0 +1,209 @@
+import type { SpeechProviderPlugin } from "../../plugins/types.js";
+import type { SpeechVoiceOption } from "../provider-types.js";
+
+const DEFAULT_AZURE_REGION = "eastus";
+const DEFAULT_AZURE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
+const DEFAULT_AZURE_LANG = "en-US";
+// Azure's REST text-to-speech reference lists User-Agent as a required header.
+const AZURE_USER_AGENT = "openclaw";
+
+/** Fields read from each entry of the Azure `voices/list` response. */
+type AzureVoiceListEntry = {
+  Name?: string;
+  DisplayName?: string;
+  LocalName?: string;
+  ShortName?: string;
+  Gender?: string;
+  Locale?: string;
+  VoiceType?: string;
+  Status?: string;
+};
+
+/**
+ * Resolves the origin used for Azure Speech REST calls.
+ *
+ * An explicit `baseUrl` wins (trailing slashes stripped); otherwise the
+ * regional host is derived from `region`, falling back to `eastus`.
+ *
+ * @throws Error when `region` is not a plain host label.
+ */
+function resolveAzureBaseUrl(baseUrl: string | undefined, region: string | undefined): string {
+  const explicit = baseUrl?.trim();
+  if (explicit) {
+    return explicit.replace(/\/+$/, "");
+  }
+  const resolvedRegion = region?.trim().toLowerCase() || DEFAULT_AZURE_REGION;
+  // The region becomes part of the host that receives the API key, so refuse
+  // anything that could point the request at a different host.
+  if (!/^[a-z0-9-]+$/.test(resolvedRegion)) {
+    throw new Error(`Invalid Azure Speech region: ${resolvedRegion}`);
+  }
+  return `https://${resolvedRegion}.tts.speech.microsoft.com`;
+}
+
+/** Returns an abort signal for a positive `timeoutMs`, or `null` for no timeout. */
+function timeoutSignal(timeoutMs: number | undefined): AbortSignal | null {
+  return typeof timeoutMs === "number" && timeoutMs > 0 ? AbortSignal.timeout(timeoutMs) : null;
+}
+
+/** Escapes the five XML special characters for use in SSML text and attribute values. */
+function escapeXml(value: string): string {
+  return value
+    .replace(/&/g, "&amp;")
+    .replace(/</g, "&lt;")
+    .replace(/>/g, "&gt;")
+    .replace(/"/g, "&quot;")
+    .replace(/'/g, "&apos;");
+}
+
+/** Maps an Azure output format name to a file extension; unmatched formats fall back to `.wav`. */
+function fileExtensionForAzureFormat(outputFormat: string): string {
+  if (outputFormat.includes("mp3")) {
+    return ".mp3";
+  }
+  if (outputFormat.startsWith("ogg-")) {
+    return ".ogg";
+  }
+  if (outputFormat.startsWith("webm-")) {
+    return ".webm";
+  }
+  return ".wav";
+}
+
+/**
+ * Lists the voices available to the subscription.
+ *
+ * @param params.apiKey - Subscription key sent as `Ocp-Apim-Subscription-Key`.
+ * @param params.region - Azure region; ignored when `baseUrl` is set.
+ * @param params.baseUrl - Optional endpoint origin override.
+ * @param params.timeoutMs - Optional request timeout in milliseconds.
+ * @returns Non-deprecated voices that have a short name.
+ * @throws Error when the API responds with a non-OK status.
+ */
+export async function listAzureVoices(params: {
+  apiKey: string;
+  region?: string;
+  baseUrl?: string;
+  timeoutMs?: number;
+}): Promise<SpeechVoiceOption[]> {
+  const baseUrl = resolveAzureBaseUrl(params.baseUrl, params.region);
+  const response = await fetch(`${baseUrl}/cognitiveservices/voices/list`, {
+    headers: {
+      "Ocp-Apim-Subscription-Key": params.apiKey,
+      "User-Agent": AZURE_USER_AGENT,
+    },
+    signal: timeoutSignal(params.timeoutMs),
+  });
+
+  if (!response.ok) {
+    throw new Error(`Azure voices API error (${response.status})`);
+  }
+
+  const payload: unknown = await response.json();
+  if (!Array.isArray(payload)) {
+    return [];
+  }
+  const entries = payload as AzureVoiceListEntry[];
+  // `Status` exists only on the raw entries, so drop deprecated voices before mapping.
+  return entries
+    .filter((voice) => voice.Status !== "Deprecated")
+    .map((voice) => ({
+      id: voice.ShortName?.trim() ?? "",
+      name: voice.DisplayName?.trim() || voice.ShortName?.trim() || undefined,
+      category: voice.VoiceType?.trim() || undefined,
+      locale: voice.Locale?.trim() || undefined,
+      gender: voice.Gender?.trim() || undefined,
+    }))
+    .filter((voice) => voice.id.length > 0);
+}
+
+/**
+ * Builds the SSML document for one utterance.
+ *
+ * `xml:lang` uses `lang` when given, else the locale prefix of the voice
+ * short name (e.g. `zh-HK-HiuMaanNeural` -> `zh-HK`), else `en-US`.
+ */
+function buildAzureSSML(text: string, voice: string, lang?: string): string {
+  const localeFromVoice = /^[A-Za-z]{2,3}-[A-Za-z0-9]{2,4}(?=-)/.exec(voice)?.[0];
+  const xmlLang = lang?.trim() || localeFromVoice || DEFAULT_AZURE_LANG;
+  return (
+    `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="${escapeXml(xmlLang)}">` +
+    `<voice name="${escapeXml(voice)}">${escapeXml(text)}</voice>` +
+    "</speak>"
+  );
+}
+
+/**
+ * Creates the built-in Azure Speech provider (id `azure`, alias `azure-tts`).
+ *
+ * The API key comes from the request/config or `AZURE_SPEECH_API_KEY`; the
+ * region comes from config or `AZURE_SPEECH_REGION`.
+ */
+export function buildAzureSpeechProvider(): SpeechProviderPlugin {
+  return {
+    id: "azure",
+    label: "Azure Speech",
+    aliases: ["azure-tts"],
+    listVoices: async (req) => {
+      const azure = req.config?.azure;
+      // NOTE(review): the config apiKey is typed SecretInput; this assumes it
+      // is already a plain string by the time it reaches the provider. Confirm.
+      const apiKey = req.apiKey || azure?.apiKey || process.env.AZURE_SPEECH_API_KEY;
+      if (!apiKey) {
+        throw new Error("Azure Speech API key missing");
+      }
+      return listAzureVoices({
+        apiKey,
+        region: azure?.region || process.env.AZURE_SPEECH_REGION,
+        baseUrl: azure?.baseUrl,
+        timeoutMs: azure?.timeoutMs,
+      });
+    },
+    isConfigured: ({ config }) =>
+      Boolean(config.azure?.apiKey || process.env.AZURE_SPEECH_API_KEY),
+    synthesize: async (req) => {
+      const azure = req.config?.azure;
+      const apiKey = azure?.apiKey || process.env.AZURE_SPEECH_API_KEY;
+      if (!apiKey) {
+        throw new Error("Azure Speech API key missing");
+      }
+
+      const voice = req.overrides?.azure?.voice ?? azure?.voice;
+      if (!voice) {
+        throw new Error("Azure voice not configured");
+      }
+      const lang = req.overrides?.azure?.lang ?? azure?.lang;
+      const outputFormat =
+        req.overrides?.azure?.outputFormat ?? azure?.outputFormat ?? DEFAULT_AZURE_OUTPUT_FORMAT;
+      // The host must match the region the key was issued for, so honor the
+      // configured region instead of always calling the default host.
+      const baseUrl = resolveAzureBaseUrl(
+        azure?.baseUrl,
+        azure?.region || process.env.AZURE_SPEECH_REGION,
+      );
+
+      const response = await fetch(`${baseUrl}/cognitiveservices/v1`, {
+        method: "POST",
+        headers: {
+          "Ocp-Apim-Subscription-Key": apiKey,
+          "Content-Type": "application/ssml+xml",
+          "X-Microsoft-OutputFormat": outputFormat,
+          "User-Agent": AZURE_USER_AGENT,
+        },
+        body: buildAzureSSML(req.text, voice, lang),
+        signal: timeoutSignal(azure?.timeoutMs),
+      });
+
+      if (!response.ok) {
+        throw new Error(`Azure TTS failed: ${response.status} ${response.statusText}`);
+      }
+
+      return {
+        audioBuffer: Buffer.from(await response.arrayBuffer()),
+        outputFormat,
+        fileExtension: fileExtensionForAzureFormat(outputFormat),
+        voiceCompatible: true,
+      };
+    },
+  };
+}