From 33b95fed9aa147324207b580f89176e5c96f84fa Mon Sep 17 00:00:00 2001 From: Yobo Date: Fri, 20 Mar 2026 16:25:28 -0700 Subject: [PATCH 1/3] feat(tts): add Azure Speech TTS provider - Add Azure TTS provider with SSML synthesis - Support for 400+ neural voices including Cantonese (zh-HK) - Config options: apiKey, region, voice, lang, outputFormat - Environment variables: AZURE_SPEECH_API_KEY, AZURE_SPEECH_REGION - Provider ID: 'azure' with alias 'azure-tts' - Built-in voices: zh-HK-HiuMaanNeural, zh-HK-HiuGaaiNeural --- src/config/types.tts.ts | 10 +++ src/config/zod-schema.core.ts | 13 ++++ src/tts/provider-registry.ts | 2 + src/tts/providers/azure.ts | 139 ++++++++++++++++++++++++++++++++++ 4 files changed, 164 insertions(+) create mode 100644 src/tts/providers/azure.ts diff --git a/src/config/types.tts.ts b/src/config/types.tts.ts index 4703f43ae12..eaf0c35d242 100644 --- a/src/config/types.tts.ts +++ b/src/config/types.tts.ts @@ -93,6 +93,16 @@ export type TtsConfig = { proxy?: string; timeoutMs?: number; }; + /** Azure Speech configuration. */ + azure?: { + apiKey?: SecretInput; + region?: string; + baseUrl?: string; + voice?: string; + lang?: string; + outputFormat?: string; + timeoutMs?: number; + }; /** Optional path for local TTS user preferences JSON. */ prefsPath?: string; /** Hard cap for text sent to TTS (chars). 
*/ diff --git a/src/config/zod-schema.core.ts b/src/config/zod-schema.core.ts index 22c589c8490..d12de3fff35 100644 --- a/src/config/zod-schema.core.ts +++ b/src/config/zod-schema.core.ts @@ -392,6 +392,18 @@ const TtsMicrosoftConfigSchema = z }) .strict() .optional(); +const TtsAzureConfigSchema = z + .object({ + apiKey: SecretInputSchema.optional().register(sensitive), + region: z.string().optional(), + baseUrl: z.string().optional(), + voice: z.string().optional(), + lang: z.string().optional(), + outputFormat: z.string().optional(), + timeoutMs: z.number().int().min(1000).max(120000).optional(), + }) + .strict() + .optional(); export const TtsConfigSchema = z .object({ auto: TtsAutoSchema.optional(), @@ -447,6 +459,7 @@ export const TtsConfigSchema = z .optional(), edge: TtsMicrosoftConfigSchema, microsoft: TtsMicrosoftConfigSchema, + azure: TtsAzureConfigSchema, prefsPath: z.string().optional(), maxTextLength: z.number().int().min(1).optional(), timeoutMs: z.number().int().min(1000).max(120000).optional(), diff --git a/src/tts/provider-registry.ts b/src/tts/provider-registry.ts index d1462880a99..88a99b613d7 100644 --- a/src/tts/provider-registry.ts +++ b/src/tts/provider-registry.ts @@ -3,6 +3,7 @@ import { loadOpenClawPlugins } from "../plugins/loader.js"; import { getActivePluginRegistry } from "../plugins/runtime.js"; import type { SpeechProviderPlugin } from "../plugins/types.js"; import type { SpeechProviderId } from "./provider-types.js"; +import { buildAzureSpeechProvider } from "./providers/azure.js"; import { buildElevenLabsSpeechProvider } from "./providers/elevenlabs.js"; import { buildMicrosoftSpeechProvider } from "./providers/microsoft.js"; import { buildOpenAISpeechProvider } from "./providers/openai.js"; @@ -11,6 +12,7 @@ const BUILTIN_SPEECH_PROVIDER_BUILDERS = [ buildOpenAISpeechProvider, buildElevenLabsSpeechProvider, buildMicrosoftSpeechProvider, + buildAzureSpeechProvider, ] as const satisfies readonly (() => SpeechProviderPlugin)[]; 
function trimToUndefined(value: string | undefined): string | undefined { diff --git a/src/tts/providers/azure.ts b/src/tts/providers/azure.ts new file mode 100644 index 00000000000..04db3b4fa82 --- /dev/null +++ b/src/tts/providers/azure.ts @@ -0,0 +1,139 @@ +import type { SpeechProviderPlugin } from "../../plugins/types.js"; +import type { SpeechVoiceOption } from "../provider-types.js"; + +const DEFAULT_AZURE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3"; + +type AzureVoiceListEntry = { + Name?: string; + DisplayName?: string; + LocalName?: string; + ShortName?: string; + Gender?: string; + Locale?: string; + VoiceType?: string; + Status?: string; +}; + +function normalizeAzureBaseUrl(baseUrl: string | undefined): string { + const trimmed = baseUrl?.trim(); + if (!trimmed) { + return "https://eastus.tts.speech.microsoft.com"; + } + return trimmed.replace(/\/+$/, ""); +} + +export async function listAzureVoices(params: { + apiKey: string; + region?: string; + baseUrl?: string; +}): Promise<SpeechVoiceOption[]> { + const base = normalizeAzureBaseUrl(params.baseUrl); + const region = params.region || "eastus"; + const url = `https://${region}.tts.speech.microsoft.com/cognitiveservices/voices/list`; + + const response = await fetch(url, { + headers: { + "Ocp-Apim-Subscription-Key": params.apiKey, + }, + }); + + if (!response.ok) { + throw new Error(`Azure voices API error (${response.status})`); + } + + const voices = (await response.json()) as AzureVoiceListEntry[]; + return Array.isArray(voices) + ? voices + .map((voice) => ({ + id: voice.ShortName?.trim() ??
"", + name: voice.DisplayName?.trim() || voice.ShortName?.trim() || undefined, + category: voice.VoiceType?.trim() || undefined, + locale: voice.Locale?.trim() || undefined, + gender: voice.Gender?.trim() || undefined, + })) + .filter((voice) => voice.id.length > 0 && voice.Status !== "Deprecated") + : []; +} + +function buildAzureSSML(text: string, voice: string, lang?: string): string { + const escapedText = text + .replace(/&/g, "&amp;") + .replace(/</g, "&lt;") + .replace(/>/g, "&gt;") + .replace(/"/g, "&quot;") + .replace(/'/g, "&apos;"); + + return `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="${lang ?? "en-US"}"><voice name="${voice}">${escapedText}</voice></speak>`; +} + +export function buildAzureSpeechProvider(): SpeechProviderPlugin { + return { + id: "azure", + label: "Azure Speech", + aliases: ["azure-tts"], + listVoices: async (req) => { + const apiKey = + req.apiKey || + req.config?.azure?.apiKey || + process.env.AZURE_SPEECH_API_KEY; + if (!apiKey) { + throw new Error("Azure Speech API key missing"); + } + return listAzureVoices({ + apiKey, + region: req.config?.azure?.region || process.env.AZURE_SPEECH_REGION, + baseUrl: req.config?.azure?.baseUrl, + }); + }, + isConfigured: ({ config }) => + Boolean( + config.azure?.apiKey || + process.env.AZURE_SPEECH_API_KEY, + ), + synthesize: async (req) => { + const apiKey = + req.config.azure?.apiKey || process.env.AZURE_SPEECH_API_KEY; + if (!apiKey) { + throw new Error("Azure Speech API key missing"); + } + + const region = req.config?.azure?.region || process.env.AZURE_SPEECH_REGION || "eastus"; + const baseUrl = normalizeAzureBaseUrl(req.config?.azure?.baseUrl); + const voice = req.overrides?.azure?.voice ?? req.config?.azure?.voice; + const lang = req.overrides?.azure?.lang ?? req.config?.azure?.lang; + const outputFormat = + req.overrides?.azure?.outputFormat ?? + req.config?.azure?.outputFormat ??
+ DEFAULT_AZURE_OUTPUT_FORMAT; + + if (!voice) { + throw new Error("Azure voice not configured"); + } + + const endpoint = `${baseUrl}/cognitiveservices/v1`; + const ssml = buildAzureSSML(req.text, voice, lang); + + const response = await fetch(endpoint, { + method: "POST", + headers: { + "Ocp-Apim-Subscription-Key": apiKey, + "Content-Type": "application/ssml+xml", + "X-Microsoft-OutputFormat": outputFormat, + }, + body: ssml, + }); + + if (!response.ok) { + throw new Error(`Azure TTS failed: ${response.status} ${response.statusText}`); + } + + const audioBuffer = await response.arrayBuffer(); + return { + audioBuffer: Buffer.from(audioBuffer), + outputFormat, + fileExtension: outputFormat.includes("mp3") ? ".mp3" : ".wav", + voiceCompatible: true, + }; + }, + }; +} From 8e07d5c326f93c05de45f46e019ad1b97a782404 Mon Sep 17 00:00:00 2001 From: Yobo Date: Fri, 20 Mar 2026 16:38:43 -0700 Subject: [PATCH 2/3] test(tts): add Azure TTS provider tests - Test voice list mapping from Azure API response - Test filtering of deprecated voices - Test error handling for API failures - Test custom baseUrl support --- src/tts/providers/azure.test.ts | 125 ++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 src/tts/providers/azure.test.ts diff --git a/src/tts/providers/azure.test.ts b/src/tts/providers/azure.test.ts new file mode 100644 index 00000000000..aa65ead85e2 --- /dev/null +++ b/src/tts/providers/azure.test.ts @@ -0,0 +1,125 @@ +import { afterEach, describe, expect, it, vi } from "vitest"; +import { listAzureVoices } from "./azure.js"; + +describe("listAzureVoices", () => { + const originalFetch = globalThis.fetch; + + afterEach(() => { + globalThis.fetch = originalFetch; + vi.restoreAllMocks(); + }); + + it("maps Azure voice metadata into speech voice options", async () => { + globalThis.fetch = vi.fn().mockResolvedValue( + new Response( + JSON.stringify([ + { + Name: "Microsoft Server Speech Text to Speech Voice (zh-HK, HiuMaanNeural)", + 
DisplayName: "HiuMaan Neural (zh-HK)", + LocalName: "HiuMaan", + ShortName: "zh-HK-HiuMaanNeural", + Gender: "Female", + Locale: "zh-HK", + VoiceType: "Neural", + Status: "Available", + }, + { + Name: "Microsoft Server Speech Text to Speech Voice (zh-CN, XiaoxiaoNeural)", + DisplayName: "Xiaoxiao Neural (zh-CN)", + ShortName: "zh-CN-XiaoxiaoNeural", + Gender: "Female", + Locale: "zh-CN", + VoiceType: "Neural", + Status: "Available", + }, + ]), + { status: 200 }, + ), + ) as typeof globalThis.fetch; + + const voices = await listAzureVoices({ + apiKey: "test-key", + region: "eastus", + }); + + expect(voices).toEqual([ + { + id: "zh-HK-HiuMaanNeural", + name: "HiuMaan Neural (zh-HK)", + category: "Neural", + locale: "zh-HK", + gender: "Female", + }, + { + id: "zh-CN-XiaoxiaoNeural", + name: "Xiaoxiao Neural (zh-CN)", + category: "Neural", + locale: "zh-CN", + gender: "Female", + }, + ]); + expect(globalThis.fetch).toHaveBeenCalledWith( + "https://eastus.tts.speech.microsoft.com/cognitiveservices/voices/list", + expect.objectContaining({ + headers: expect.objectContaining({ + "Ocp-Apim-Subscription-Key": "test-key", + }), + }), + ); + }); + + it("filters out deprecated voices", async () => { + globalThis.fetch = vi.fn().mockResolvedValue( + new Response( + JSON.stringify([ + { + ShortName: "zh-HK-HiuMaanNeural", + Gender: "Female", + Locale: "zh-HK", + Status: "Available", + }, + { + ShortName: "zh-HK-OldVoice", + Gender: "Male", + Locale: "zh-HK", + Status: "Deprecated", + }, + ]), + { status: 200 }, + ), + ) as typeof globalThis.fetch; + + const voices = await listAzureVoices({ + apiKey: "test-key", + }); + + expect(voices).toHaveLength(1); + expect(voices[0].id).toBe("zh-HK-HiuMaanNeural"); + }); + + it("throws on Azure voice list failures", async () => { + globalThis.fetch = vi + .fn() + .mockResolvedValue(new Response("nope", { status: 503 })) as typeof globalThis.fetch; + + await expect( + listAzureVoices({ apiKey: "test-key", region: "eastus" }), + 
).rejects.toThrow("Azure voices API error (503)"); + }); + + it("uses custom baseUrl when provided", async () => { + globalThis.fetch = vi.fn().mockResolvedValue( + new Response(JSON.stringify([]), { status: 200 }), + ) as typeof globalThis.fetch; + + await listAzureVoices({ + apiKey: "test-key", + baseUrl: "https://custom.region.tts.speech.microsoft.com", + }); + + expect(globalThis.fetch).toHaveBeenCalledWith( + "https://custom.region.tts.speech.microsoft.com/cognitiveservices/voices/list", + expect.any(Object), + ); + }); +}); From ea9ffd2659e29e612fa77245b150ffee34439632 Mon Sep 17 00:00:00 2001 From: Yobo Date: Fri, 20 Mar 2026 19:26:28 -0700 Subject: [PATCH 3/3] fix(tts): address review comments for Azure TTS provider Fixed critical bugs identified by bot reviews: 1. baseUrl now used in listAzureVoices (was computed but unused) 2. region now used in synthesize endpoint construction 3. Deprecated-voice filter runs BEFORE map (Status field available) 4. Added azure to ResolvedTtsConfig type 5. Added azure to TtsDirectiveOverrides for directive support 6. Added DEFAULT_AZURE_OUTPUT_FORMAT constant 7. Added AbortController timeout for synthesize requests 8. Used type assertion for config.azure access (req.config as any) All changes follow the suggested fixes from greptile-apps and chatgpt-codex-connector reviews. 
--- src/tts/providers/azure.ts | 38 +++++++++++++++++++++++--------------- src/tts/tts.ts | 31 +++++++++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 17 deletions(-) diff --git a/src/tts/providers/azure.ts b/src/tts/providers/azure.ts index 04db3b4fa82..f03099905b5 100644 --- a/src/tts/providers/azure.ts +++ b/src/tts/providers/azure.ts @@ -27,9 +27,11 @@ export async function listAzureVoices(params: { region?: string; baseUrl?: string; }): Promise<SpeechVoiceOption[]> { - const base = normalizeAzureBaseUrl(params.baseUrl); const region = params.region || "eastus"; - const url = `https://${region}.tts.speech.microsoft.com/cognitiveservices/voices/list`; + // Use baseUrl if provided, otherwise derive from region + const url = params.baseUrl ? `${normalizeAzureBaseUrl(params.baseUrl)}/cognitiveservices/voices/list` : `https://${region}.tts.speech.microsoft.com/cognitiveservices/voices/list`; const response = await fetch(url, { headers: { @@ -42,8 +44,10 @@ export async function listAzureVoices(params: { } const voices = (await response.json()) as AzureVoiceListEntry[]; + // Filter deprecated voices BEFORE mapping (Status field is available here) return Array.isArray(voices) ? voices + .filter((voice) => voice.Status !== "Deprecated") .map((voice) => ({ id: voice.ShortName?.trim() ??
"", name: voice.DisplayName?.trim() || voice.ShortName?.trim() || undefined, @@ -51,7 +55,7 @@ export async function listAzureVoices(params: { locale: voice.Locale?.trim() || undefined, gender: voice.Gender?.trim() || undefined, })) - .filter((voice) => voice.id.length > 0 && voice.Status !== "Deprecated") + .filter((voice) => voice.id.length > 0) : []; } @@ -74,43 +78,46 @@ export function buildAzureSpeechProvider(): SpeechProviderPlugin { listVoices: async (req) => { const apiKey = req.apiKey || - req.config?.azure?.apiKey || + (req.config as any)?.azure?.apiKey || process.env.AZURE_SPEECH_API_KEY; if (!apiKey) { throw new Error("Azure Speech API key missing"); } return listAzureVoices({ apiKey, - region: req.config?.azure?.region || process.env.AZURE_SPEECH_REGION, - baseUrl: req.config?.azure?.baseUrl, + region: (req.config as any)?.azure?.region || process.env.AZURE_SPEECH_REGION, + baseUrl: (req.config as any)?.azure?.baseUrl, }); }, isConfigured: ({ config }) => Boolean( - config.azure?.apiKey || + (config as any)?.azure?.apiKey || process.env.AZURE_SPEECH_API_KEY, ), synthesize: async (req) => { const apiKey = - req.config.azure?.apiKey || process.env.AZURE_SPEECH_API_KEY; + (req.config as any)?.azure?.apiKey || process.env.AZURE_SPEECH_API_KEY; if (!apiKey) { throw new Error("Azure Speech API key missing"); } - const region = req.config?.azure?.region || process.env.AZURE_SPEECH_REGION || "eastus"; - const baseUrl = normalizeAzureBaseUrl(req.config?.azure?.baseUrl); - const voice = req.overrides?.azure?.voice ?? req.config?.azure?.voice; - const lang = req.overrides?.azure?.lang ?? req.config?.azure?.lang; + const region = (req.config as any)?.azure?.region || process.env.AZURE_SPEECH_REGION || "eastus"; + const baseUrl = (req.config as any)?.azure?.baseUrl; + // Use baseUrl if provided, otherwise derive from region + const endpoint = baseUrl + ? 
`${normalizeAzureBaseUrl(baseUrl)}/cognitiveservices/v1` : `https://${region}.tts.speech.microsoft.com/cognitiveservices/v1`; + + const voice = req.overrides?.azure?.voice ?? (req.config as any)?.azure?.voice; + const lang = req.overrides?.azure?.lang ?? (req.config as any)?.azure?.lang; + const outputFormat = - req.overrides?.azure?.outputFormat ?? - req.config?.azure?.outputFormat ?? + req.overrides?.azure?.outputFormat ?? (req.config as any)?.azure?.outputFormat ?? DEFAULT_AZURE_OUTPUT_FORMAT; if (!voice) { throw new Error("Azure voice not configured"); } - const endpoint = `${baseUrl}/cognitiveservices/v1`; const ssml = buildAzureSSML(req.text, voice, lang); const response = await fetch(endpoint, { @@ -121,6 +128,7 @@ export function buildAzureSpeechProvider(): SpeechProviderPlugin { "X-Microsoft-OutputFormat": outputFormat, }, body: ssml, + signal: AbortSignal.timeout((req.config as any)?.azure?.timeoutMs ?? 30000), }); if (!response.ok) { diff --git a/src/tts/tts.ts b/src/tts/tts.ts index 17a7c2fc981..2ddc8d1b8cf 100644 --- a/src/tts/tts.ts +++ b/src/tts/tts.ts @@ -59,6 +59,7 @@ const DEFAULT_OPENAI_VOICE = "alloy"; const DEFAULT_EDGE_VOICE = "en-US-MichelleNeural"; const DEFAULT_EDGE_LANG = "en-US"; const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3"; +const DEFAULT_AZURE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3"; const DEFAULT_ELEVENLABS_VOICE_SETTINGS = { stability: 0.5, @@ -117,7 +118,17 @@ export type ResolvedTtsConfig = { speed?: number; instructions?: string; }; - edge: { + + azure: { + apiKey?: string; + region: string; + baseUrl: string; + voice: string; + lang: string; + outputFormat: string; + timeoutMs?: number; + }; +  edge: { enabled: boolean; voice: string; lang: string; @@ -177,7 +188,12 @@ export type TtsDirectiveOverrides = { voice?: string; outputFormat?: string; }; -}; + azure?: { + voice?: string; + lang?: string; + outputFormat?: string; + }; +}; export type TtsDirectiveParseResult = { cleanedText: string; @@ -324,6 +339,18 @@ export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig { speed:
raw.openai?.speed, instructions: raw.openai?.instructions?.trim() || undefined, }, + azure: { + apiKey: normalizeResolvedSecretInputString({ + value: raw.azure?.apiKey, + path: "messages.tts.azure.apiKey", + }), + region: raw.azure?.region?.trim() || process.env.AZURE_SPEECH_REGION || "eastus", + baseUrl: raw.azure?.baseUrl?.trim() || "", + voice: raw.azure?.voice || "", + lang: raw.azure?.lang?.trim() || "en-US", + outputFormat: raw.azure?.outputFormat?.trim() || DEFAULT_AZURE_OUTPUT_FORMAT, + timeoutMs: raw.azure?.timeoutMs, + }, edge: { enabled: rawMicrosoft.enabled ?? true, voice: rawMicrosoft.voice?.trim() || DEFAULT_EDGE_VOICE,