diff --git a/src/tts/providers/azure.ts b/src/tts/providers/azure.ts index 04db3b4fa82..f03099905b5 100644 --- a/src/tts/providers/azure.ts +++ b/src/tts/providers/azure.ts @@ -27,9 +27,11 @@ export async function listAzureVoices(params: { region?: string; baseUrl?: string; }): Promise { - const base = normalizeAzureBaseUrl(params.baseUrl); const region = params.region || "eastus"; - const url = `https://${region}.tts.speech.microsoft.com/cognitiveservices/voices/list`; + // Use baseUrl if provided, otherwise derive from region + const url = params.baseUrl + ? `${normalizeAzureBaseUrl(params.baseUrl)}/cognitiveservices/voices/list` + : `https://${region}.tts.speech.microsoft.com/cognitiveservices/voices/list`; const response = await fetch(url, { headers: { @@ -42,8 +44,10 @@ export async function listAzureVoices(params: { } const voices = (await response.json()) as AzureVoiceListEntry[]; + // Filter deprecated voices BEFORE mapping (Status field is available here) return Array.isArray(voices) ? voices + .filter((voice) => voice.Status !== "Deprecated") .map((voice) => ({ id: voice.ShortName?.trim() ?? 
"", name: voice.DisplayName?.trim() || voice.ShortName?.trim() || undefined, @@ -51,7 +55,7 @@ export async function listAzureVoices(params: { locale: voice.Locale?.trim() || undefined, gender: voice.Gender?.trim() || undefined, })) - .filter((voice) => voice.id.length > 0 && voice.Status !== "Deprecated") + .filter((voice) => voice.id.length > 0) : []; } @@ -74,43 +78,46 @@ export function buildAzureSpeechProvider(): SpeechProviderPlugin { listVoices: async (req) => { const apiKey = req.apiKey || - req.config?.azure?.apiKey || + (req.config as any)?.azure?.apiKey || process.env.AZURE_SPEECH_API_KEY; if (!apiKey) { throw new Error("Azure Speech API key missing"); } return listAzureVoices({ apiKey, - region: req.config?.azure?.region || process.env.AZURE_SPEECH_REGION, - baseUrl: req.config?.azure?.baseUrl, + region: (req.config as any)?.azure?.region || process.env.AZURE_SPEECH_REGION, + baseUrl: (req.config as any)?.azure?.baseUrl, }); }, isConfigured: ({ config }) => Boolean( - config.azure?.apiKey || + (config as any)?.azure?.apiKey || process.env.AZURE_SPEECH_API_KEY, ), synthesize: async (req) => { const apiKey = - req.config.azure?.apiKey || process.env.AZURE_SPEECH_API_KEY; + (req.config as any)?.azure?.apiKey || process.env.AZURE_SPEECH_API_KEY; if (!apiKey) { throw new Error("Azure Speech API key missing"); } - const region = req.config?.azure?.region || process.env.AZURE_SPEECH_REGION || "eastus"; - const baseUrl = normalizeAzureBaseUrl(req.config?.azure?.baseUrl); - const voice = req.overrides?.azure?.voice ?? req.config?.azure?.voice; - const lang = req.overrides?.azure?.lang ?? req.config?.azure?.lang; + const region = (req.config as any)?.azure?.region || process.env.AZURE_SPEECH_REGION || "eastus"; + const baseUrl = (req.config as any)?.azure?.baseUrl; + // Use baseUrl if provided, otherwise derive from region + const endpoint = baseUrl + ? 
`${normalizeAzureBaseUrl(baseUrl)}/cognitiveservices/v1`
+        : `https://${region}.tts.speech.microsoft.com/cognitiveservices/v1`;
+
+      const voice = (req.config as any)?.azure?.voice;
+      const lang = (req.config as any)?.azure?.lang;
       const outputFormat =
-        req.overrides?.azure?.outputFormat ??
-        req.config?.azure?.outputFormat ??
+        (req.config as any)?.azure?.outputFormat ??
         DEFAULT_AZURE_OUTPUT_FORMAT;
       if (!voice) {
         throw new Error("Azure voice not configured");
       }
-      const endpoint = `${baseUrl}/cognitiveservices/v1`;
       const ssml = buildAzureSSML(req.text, voice, lang);
 
       const response = await fetch(endpoint, {
@@ -121,6 +128,7 @@ export function buildAzureSpeechProvider(): SpeechProviderPlugin {
           "X-Microsoft-OutputFormat": outputFormat,
         },
         body: ssml,
+        signal: AbortSignal.timeout((req.config as any)?.azure?.timeoutMs ?? 30000),
       });
 
       if (!response.ok) {
diff --git a/src/tts/tts.ts b/src/tts/tts.ts
index 17a7c2fc981..2ddc8d1b8cf 100644
--- a/src/tts/tts.ts
+++ b/src/tts/tts.ts
@@ -59,6 +59,7 @@ const DEFAULT_OPENAI_VOICE = "alloy";
 const DEFAULT_EDGE_VOICE = "en-US-MichelleNeural";
 const DEFAULT_EDGE_LANG = "en-US";
 const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
+const DEFAULT_AZURE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
 
 const DEFAULT_ELEVENLABS_VOICE_SETTINGS = {
   stability: 0.5,
@@ -117,7 +118,17 @@ export type ResolvedTtsConfig = {
     speed?: number;
     instructions?: string;
   };
-  edge: {
+
+  azure: {
+    apiKey?: string;
+    region: string;
+    baseUrl: string;
+    voice: string;
+    lang: string;
+    outputFormat: string;
+    timeoutMs?: number;
+  };
+  edge: {
     enabled: boolean;
     voice: string;
     lang: string;
@@ -177,7 +188,12 @@ export type TtsDirectiveOverrides = {
     voice?: string;
     outputFormat?: string;
   };
-};
+  azure?: {
+    voice?: string;
+    lang?: string;
+    outputFormat?: string;
+  };
+};
 
 export type TtsDirectiveParseResult = {
@@ -324,6 +340,18 @@ export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig {
       speed:
raw.openai?.speed, instructions: raw.openai?.instructions?.trim() || undefined, }, + azure: { + apiKey: normalizeResolvedSecretInputString({ + value: raw.azure?.apiKey, + path: "messages.tts.azure.apiKey", + }), + region: raw.azure?.region?.trim() || process.env.AZURE_SPEECH_REGION || "eastus", + baseUrl: raw.azure?.baseUrl?.trim() || "", + voice: raw.azure?.voice || "", + lang: raw.azure?.lang?.trim() || "en-US", + outputFormat: raw.azure?.outputFormat?.trim() || DEFAULT_AZURE_OUTPUT_FORMAT, + timeoutMs: raw.azure?.timeoutMs, + }, edge: { enabled: rawMicrosoft.enabled ?? true, voice: rawMicrosoft.voice?.trim() || DEFAULT_EDGE_VOICE,