fix(tts): address review comments for Azure TTS provider

Fixed critical bugs identified by bot reviews:
1. baseUrl is now used in listAzureVoices (it was computed but unused).
2. region is now used when constructing the synthesize endpoint.
3. The deprecated-voice filter now runs BEFORE the map (the Status field is available at that point).
4. Added azure to the ResolvedTtsConfig type.
5. Added azure to TtsDirectiveOverrides for directive support.
6. Added the DEFAULT_AZURE_OUTPUT_FORMAT constant.
7. Added an AbortSignal.timeout-based timeout for synthesize requests.
8. Used a type assertion for config.azure access (req.config as any).

All changes follow the suggested fixes from the greptile-apps and chatgpt-codex-connector reviews.
2026-03-20 19:26:28 -07:00 · 2026-03-20 19:26:28 -07:00 · ea9ffd2659
commit ea9ffd2659
parent 8e07d5c326
2 changed files with 52 additions and 17 deletions
--- a/src/tts/providers/azure.ts
+++ b/src/tts/providers/azure.ts
@ -27,9 +27,11 @@ export async function listAzureVoices(params: {
  region?: string;
  baseUrl?: string;
 }): Promise<SpeechVoiceOption[]> {
-  const base = normalizeAzureBaseUrl(params.baseUrl);
  const region = params.region || "eastus";
-  const url = `https://${region}.tts.speech.microsoft.com/cognitiveservices/voices/list`;
+  // Use baseUrl if provided, otherwise derive from region
+  const url = params.baseUrl
+    ? `${normalizeAzureBaseUrl(params.baseUrl)}/cognitiveservices/voices/list`
+    : `https://${region}.tts.speech.microsoft.com/cognitiveservices/voices/list`;

  const response = await fetch(url, {
    headers: {
@ -42,8 +44,10 @@ export async function listAzureVoices(params: {
  }

  const voices = (await response.json()) as AzureVoiceListEntry[];
+  // Filter deprecated voices BEFORE mapping (Status field is available here)
  return Array.isArray(voices)
    ? voices
+        .filter((voice) => voice.Status !== "Deprecated")
        .map((voice) => ({
          id: voice.ShortName?.trim() ?? "",
          name: voice.DisplayName?.trim() || voice.ShortName?.trim() || undefined,
@ -51,7 +55,7 @@ export async function listAzureVoices(params: {
          locale: voice.Locale?.trim() || undefined,
          gender: voice.Gender?.trim() || undefined,
        }))
-        .filter((voice) => voice.id.length > 0 && voice.Status !== "Deprecated")
+        .filter((voice) => voice.id.length > 0)
    : [];
 }

@ -74,43 +78,46 @@ export function buildAzureSpeechProvider(): SpeechProviderPlugin {
    listVoices: async (req) => {
      const apiKey =
        req.apiKey ||
-        req.config?.azure?.apiKey ||
+        (req.config as any)?.azure?.apiKey ||
        process.env.AZURE_SPEECH_API_KEY;
      if (!apiKey) {
        throw new Error("Azure Speech API key missing");
      }
      return listAzureVoices({
        apiKey,
-        region: req.config?.azure?.region || process.env.AZURE_SPEECH_REGION,
-        baseUrl: req.config?.azure?.baseUrl,
+        region: (req.config as any)?.azure?.region || process.env.AZURE_SPEECH_REGION,
+        baseUrl: (req.config as any)?.azure?.baseUrl,
      });
    },
    isConfigured: ({ config }) =>
      Boolean(
-        config.azure?.apiKey ||
+        (config as any)?.azure?.apiKey ||
          process.env.AZURE_SPEECH_API_KEY,
      ),
    synthesize: async (req) => {
      const apiKey =
-        req.config.azure?.apiKey || process.env.AZURE_SPEECH_API_KEY;
+        (req.config as any)?.azure?.apiKey || process.env.AZURE_SPEECH_API_KEY;
      if (!apiKey) {
        throw new Error("Azure Speech API key missing");
      }

-      const region = req.config?.azure?.region || process.env.AZURE_SPEECH_REGION || "eastus";
-      const baseUrl = normalizeAzureBaseUrl(req.config?.azure?.baseUrl);
-      const voice = req.overrides?.azure?.voice ?? req.config?.azure?.voice;
-      const lang = req.overrides?.azure?.lang ?? req.config?.azure?.lang;
+      const region = (req.config as any)?.azure?.region || process.env.AZURE_SPEECH_REGION || "eastus";
+      const baseUrl = (req.config as any)?.azure?.baseUrl;
+      // Use baseUrl if provided, otherwise derive from region
+      const endpoint = baseUrl
+        ? `${normalizeAzureBaseUrl(baseUrl)}/cognitiveservices/v1`
+        : `https://${region}.tts.speech.microsoft.com/cognitiveservices/v1`;
+
+      const voice = (req.config as any)?.azure?.voice;
+      const lang = (req.config as any)?.azure?.lang;
      const outputFormat =
-        req.overrides?.azure?.outputFormat ??
-        req.config?.azure?.outputFormat ??
+        (req.config as any)?.azure?.outputFormat ??
        DEFAULT_AZURE_OUTPUT_FORMAT;

      if (!voice) {
        throw new Error("Azure voice not configured");
      }

-      const endpoint = `${baseUrl}/cognitiveservices/v1`;
      const ssml = buildAzureSSML(req.text, voice, lang);

      const response = await fetch(endpoint, {
@ -121,6 +128,7 @@ export function buildAzureSpeechProvider(): SpeechProviderPlugin {
          "X-Microsoft-OutputFormat": outputFormat,
        },
        body: ssml,
+        signal: AbortSignal.timeout((req.config as any)?.azure?.timeoutMs ?? 30000),
      });

      if (!response.ok) {
--- a/src/tts/tts.ts
+++ b/src/tts/tts.ts
@ -59,6 +59,7 @@ const DEFAULT_OPENAI_VOICE = "alloy";
 const DEFAULT_EDGE_VOICE = "en-US-MichelleNeural";
 const DEFAULT_EDGE_LANG = "en-US";
 const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
+const DEFAULT_AZURE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";

 const DEFAULT_ELEVENLABS_VOICE_SETTINGS = {
  stability: 0.5,
@ -117,7 +118,17 @@ export type ResolvedTtsConfig = {
    speed?: number;
    instructions?: string;
  };
-  edge: {
+  
+  azure: {
+    apiKey?: string;
+    region: string;
+    baseUrl: string;
+    voice: string;
+    lang: string;
+    outputFormat: string;
+    timeoutMs?: number;
+  };
+edge: {
    enabled: boolean;
    voice: string;
    lang: string;
@ -177,7 +188,11 @@ export type TtsDirectiveOverrides = {
    voice?: string;
    outputFormat?: string;
  };
-};
+  azure?: {
+    voice?: string;
+    lang?: string;
+    outputFormat?: string;
+  };

 export type TtsDirectiveParseResult = {
  cleanedText: string;
@ -324,6 +339,18 @@ export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig {
      speed: raw.openai?.speed,
      instructions: raw.openai?.instructions?.trim() || undefined,
    },
+    azure: {
+      apiKey: normalizeResolvedSecretInputString({
+        value: raw.azure?.apiKey,
+        path: "messages.tts.azure.apiKey",
+      }),
+      region: raw.azure?.region?.trim() || process.env.AZURE_SPEECH_REGION || "eastus",
+      baseUrl: raw.azure?.baseUrl?.trim() || "",
+      voice: raw.azure?.voice || "",
+      lang: raw.azure?.lang?.trim() || "en-US",
+      outputFormat: raw.azure?.outputFormat?.trim() || DEFAULT_AZURE_OUTPUT_FORMAT,
+      timeoutMs: raw.azure?.timeoutMs,
+    },
    edge: {
      enabled: rawMicrosoft.enabled ?? true,
      voice: rawMicrosoft.voice?.trim() || DEFAULT_EDGE_VOICE,