From 33b95fed9aa147324207b580f89176e5c96f84fa Mon Sep 17 00:00:00 2001 From: Yobo Date: Fri, 20 Mar 2026 16:25:28 -0700 Subject: [PATCH 1/3] feat(tts): add Azure Speech TTS provider - Add Azure TTS provider with SSML synthesis - Support for 400+ neural voices including Cantonese (zh-HK) - Config options: apiKey, region, voice, lang, outputFormat - Environment variables: AZURE_SPEECH_API_KEY, AZURE_SPEECH_REGION - Provider ID: 'azure' with alias 'azure-tts' - Built-in voices: zh-HK-HiuMaanNeural, zh-HK-HiuGaaiNeural --- src/config/types.tts.ts | 10 +++ src/config/zod-schema.core.ts | 13 ++++ src/tts/provider-registry.ts | 2 + src/tts/providers/azure.ts | 139 ++++++++++++++++++++++++++++++++++ 4 files changed, 164 insertions(+) create mode 100644 src/tts/providers/azure.ts diff --git a/src/config/types.tts.ts b/src/config/types.tts.ts index 4703f43ae12..eaf0c35d242 100644 --- a/src/config/types.tts.ts +++ b/src/config/types.tts.ts @@ -93,6 +93,16 @@ export type TtsConfig = { proxy?: string; timeoutMs?: number; }; + /** Azure Speech configuration. */ + azure?: { + apiKey?: SecretInput; + region?: string; + baseUrl?: string; + voice?: string; + lang?: string; + outputFormat?: string; + timeoutMs?: number; + }; /** Optional path for local TTS user preferences JSON. */ prefsPath?: string; /** Hard cap for text sent to TTS (chars). 
*/ diff --git a/src/config/zod-schema.core.ts b/src/config/zod-schema.core.ts index 22c589c8490..d12de3fff35 100644 --- a/src/config/zod-schema.core.ts +++ b/src/config/zod-schema.core.ts @@ -392,6 +392,18 @@ const TtsMicrosoftConfigSchema = z }) .strict() .optional(); +const TtsAzureConfigSchema = z + .object({ + apiKey: SecretInputSchema.optional().register(sensitive), + region: z.string().optional(), + baseUrl: z.string().optional(), + voice: z.string().optional(), + lang: z.string().optional(), + outputFormat: z.string().optional(), + timeoutMs: z.number().int().min(1000).max(120000).optional(), + }) + .strict() + .optional(); export const TtsConfigSchema = z .object({ auto: TtsAutoSchema.optional(), @@ -447,6 +459,7 @@ export const TtsConfigSchema = z .optional(), edge: TtsMicrosoftConfigSchema, microsoft: TtsMicrosoftConfigSchema, + azure: TtsAzureConfigSchema, prefsPath: z.string().optional(), maxTextLength: z.number().int().min(1).optional(), timeoutMs: z.number().int().min(1000).max(120000).optional(), diff --git a/src/tts/provider-registry.ts b/src/tts/provider-registry.ts index d1462880a99..88a99b613d7 100644 --- a/src/tts/provider-registry.ts +++ b/src/tts/provider-registry.ts @@ -3,6 +3,7 @@ import { loadOpenClawPlugins } from "../plugins/loader.js"; import { getActivePluginRegistry } from "../plugins/runtime.js"; import type { SpeechProviderPlugin } from "../plugins/types.js"; import type { SpeechProviderId } from "./provider-types.js"; +import { buildAzureSpeechProvider } from "./providers/azure.js"; import { buildElevenLabsSpeechProvider } from "./providers/elevenlabs.js"; import { buildMicrosoftSpeechProvider } from "./providers/microsoft.js"; import { buildOpenAISpeechProvider } from "./providers/openai.js"; @@ -11,6 +12,7 @@ const BUILTIN_SPEECH_PROVIDER_BUILDERS = [ buildOpenAISpeechProvider, buildElevenLabsSpeechProvider, buildMicrosoftSpeechProvider, + buildAzureSpeechProvider, ] as const satisfies readonly (() => SpeechProviderPlugin)[]; 
function trimToUndefined(value: string | undefined): string | undefined { diff --git a/src/tts/providers/azure.ts b/src/tts/providers/azure.ts new file mode 100644 index 00000000000..04db3b4fa82 --- /dev/null +++ b/src/tts/providers/azure.ts @@ -0,0 +1,139 @@ +import type { SpeechProviderPlugin } from "../../plugins/types.js"; +import type { SpeechVoiceOption } from "../provider-types.js"; + +const DEFAULT_AZURE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3"; + +type AzureVoiceListEntry = { + Name?: string; + DisplayName?: string; + LocalName?: string; + ShortName?: string; + Gender?: string; + Locale?: string; + VoiceType?: string; + Status?: string; +}; + +function normalizeAzureBaseUrl(baseUrl: string | undefined): string { + const trimmed = baseUrl?.trim(); + if (!trimmed) { + return "https://eastus.tts.speech.microsoft.com"; + } + return trimmed.replace(/\/+$/, ""); +} + +export async function listAzureVoices(params: { + apiKey: string; + region?: string; + baseUrl?: string; +}): Promise<SpeechVoiceOption[]> { + const base = normalizeAzureBaseUrl(params.baseUrl); + const region = params.region || "eastus"; + const url = `https://${region}.tts.speech.microsoft.com/cognitiveservices/voices/list`; + + const response = await fetch(url, { + headers: { + "Ocp-Apim-Subscription-Key": params.apiKey, + }, + }); + + if (!response.ok) { + throw new Error(`Azure voices API error (${response.status})`); + } + + const voices = (await response.json()) as AzureVoiceListEntry[]; + return Array.isArray(voices) + ? voices + .map((voice) => ({ + id: voice.ShortName?.trim() ??
"", + name: voice.DisplayName?.trim() || voice.ShortName?.trim() || undefined, + category: voice.VoiceType?.trim() || undefined, + locale: voice.Locale?.trim() || undefined, + gender: voice.Gender?.trim() || undefined, + })) + .filter((voice) => voice.id.length > 0 && voice.Status !== "Deprecated") + : []; +} + +function buildAzureSSML(text: string, voice: string, lang?: string): string { + const escapedText = text + .replace(/&/g, "&amp;") + .replace(/</g, "&lt;") + .replace(/>/g, "&gt;") + .replace(/"/g, "&quot;") + .replace(/'/g, "&apos;"); + + return `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="${lang ?? "en-US"}"><voice name="${voice}">${escapedText}</voice></speak>`; +} + +export function buildAzureSpeechProvider(): SpeechProviderPlugin { + return { + id: "azure", + label: "Azure Speech", + aliases: ["azure-tts"], + listVoices: async (req) => { + const apiKey = + req.apiKey || + req.config?.azure?.apiKey || + process.env.AZURE_SPEECH_API_KEY; + if (!apiKey) { + throw new Error("Azure Speech API key missing"); + } + return listAzureVoices({ + apiKey, + region: req.config?.azure?.region || process.env.AZURE_SPEECH_REGION, + baseUrl: req.config?.azure?.baseUrl, + }); + }, + isConfigured: ({ config }) => + Boolean( + config.azure?.apiKey || + process.env.AZURE_SPEECH_API_KEY, + ), + synthesize: async (req) => { + const apiKey = + req.config.azure?.apiKey || process.env.AZURE_SPEECH_API_KEY; + if (!apiKey) { + throw new Error("Azure Speech API key missing"); + } + + const region = req.config?.azure?.region || process.env.AZURE_SPEECH_REGION || "eastus"; + const baseUrl = normalizeAzureBaseUrl(req.config?.azure?.baseUrl); + const voice = req.overrides?.azure?.voice ?? req.config?.azure?.voice; + const lang = req.overrides?.azure?.lang ?? req.config?.azure?.lang; + const outputFormat = + req.overrides?.azure?.outputFormat ?? + req.config?.azure?.outputFormat ??
+ DEFAULT_AZURE_OUTPUT_FORMAT; + + if (!voice) { + throw new Error("Azure voice not configured"); + } + + const endpoint = `${baseUrl}/cognitiveservices/v1`; + const ssml = buildAzureSSML(req.text, voice, lang); + + const response = await fetch(endpoint, { + method: "POST", + headers: { + "Ocp-Apim-Subscription-Key": apiKey, + "Content-Type": "application/ssml+xml", + "X-Microsoft-OutputFormat": outputFormat, + }, + body: ssml, + }); + + if (!response.ok) { + throw new Error(`Azure TTS failed: ${response.status} ${response.statusText}`); + } + + const audioBuffer = await response.arrayBuffer(); + return { + audioBuffer: Buffer.from(audioBuffer), + outputFormat, + fileExtension: outputFormat.includes("mp3") ? ".mp3" : ".wav", + voiceCompatible: true, + }; + }, + }; +} From 8e07d5c326f93c05de45f46e019ad1b97a782404 Mon Sep 17 00:00:00 2001 From: Yobo Date: Fri, 20 Mar 2026 16:38:43 -0700 Subject: [PATCH 2/3] test(tts): add Azure TTS provider tests - Test voice list mapping from Azure API response - Test filtering of deprecated voices - Test error handling for API failures - Test custom baseUrl support --- src/tts/providers/azure.test.ts | 125 ++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 src/tts/providers/azure.test.ts diff --git a/src/tts/providers/azure.test.ts b/src/tts/providers/azure.test.ts new file mode 100644 index 00000000000..aa65ead85e2 --- /dev/null +++ b/src/tts/providers/azure.test.ts @@ -0,0 +1,125 @@ +import { afterEach, describe, expect, it, vi } from "vitest"; +import { listAzureVoices } from "./azure.js"; + +describe("listAzureVoices", () => { + const originalFetch = globalThis.fetch; + + afterEach(() => { + globalThis.fetch = originalFetch; + vi.restoreAllMocks(); + }); + + it("maps Azure voice metadata into speech voice options", async () => { + globalThis.fetch = vi.fn().mockResolvedValue( + new Response( + JSON.stringify([ + { + Name: "Microsoft Server Speech Text to Speech Voice (zh-HK, HiuMaanNeural)", + 
DisplayName: "HiuMaan Neural (zh-HK)", + LocalName: "HiuMaan", + ShortName: "zh-HK-HiuMaanNeural", + Gender: "Female", + Locale: "zh-HK", + VoiceType: "Neural", + Status: "Available", + }, + { + Name: "Microsoft Server Speech Text to Speech Voice (zh-CN, XiaoxiaoNeural)", + DisplayName: "Xiaoxiao Neural (zh-CN)", + ShortName: "zh-CN-XiaoxiaoNeural", + Gender: "Female", + Locale: "zh-CN", + VoiceType: "Neural", + Status: "Available", + }, + ]), + { status: 200 }, + ), + ) as typeof globalThis.fetch; + + const voices = await listAzureVoices({ + apiKey: "test-key", + region: "eastus", + }); + + expect(voices).toEqual([ + { + id: "zh-HK-HiuMaanNeural", + name: "HiuMaan Neural (zh-HK)", + category: "Neural", + locale: "zh-HK", + gender: "Female", + }, + { + id: "zh-CN-XiaoxiaoNeural", + name: "Xiaoxiao Neural (zh-CN)", + category: "Neural", + locale: "zh-CN", + gender: "Female", + }, + ]); + expect(globalThis.fetch).toHaveBeenCalledWith( + "https://eastus.tts.speech.microsoft.com/cognitiveservices/voices/list", + expect.objectContaining({ + headers: expect.objectContaining({ + "Ocp-Apim-Subscription-Key": "test-key", + }), + }), + ); + }); + + it("filters out deprecated voices", async () => { + globalThis.fetch = vi.fn().mockResolvedValue( + new Response( + JSON.stringify([ + { + ShortName: "zh-HK-HiuMaanNeural", + Gender: "Female", + Locale: "zh-HK", + Status: "Available", + }, + { + ShortName: "zh-HK-OldVoice", + Gender: "Male", + Locale: "zh-HK", + Status: "Deprecated", + }, + ]), + { status: 200 }, + ), + ) as typeof globalThis.fetch; + + const voices = await listAzureVoices({ + apiKey: "test-key", + }); + + expect(voices).toHaveLength(1); + expect(voices[0].id).toBe("zh-HK-HiuMaanNeural"); + }); + + it("throws on Azure voice list failures", async () => { + globalThis.fetch = vi + .fn() + .mockResolvedValue(new Response("nope", { status: 503 })) as typeof globalThis.fetch; + + await expect( + listAzureVoices({ apiKey: "test-key", region: "eastus" }), + 
).rejects.toThrow("Azure voices API error (503)"); + }); + + it("uses custom baseUrl when provided", async () => { + globalThis.fetch = vi.fn().mockResolvedValue( + new Response(JSON.stringify([]), { status: 200 }), + ) as typeof globalThis.fetch; + + await listAzureVoices({ + apiKey: "test-key", + baseUrl: "https://custom.region.tts.speech.microsoft.com", + }); + + expect(globalThis.fetch).toHaveBeenCalledWith( + "https://custom.region.tts.speech.microsoft.com/cognitiveservices/voices/list", + expect.any(Object), + ); + }); +}); From ea9ffd2659e29e612fa77245b150ffee34439632 Mon Sep 17 00:00:00 2001 From: Yobo Date: Fri, 20 Mar 2026 19:26:28 -0700 Subject: [PATCH 3/3] fix(tts): address review comments for Azure TTS provider Fixed critical bugs identified by bot reviews: 1. baseUrl now used in listAzureVoices (was computed but unused) 2. region now used in synthesize endpoint construction 3. Deprecated-voice filter runs BEFORE map (Status field available) 4. Added azure to ResolvedTtsConfig type 5. Added azure to TtsDirectiveOverrides for directive support 6. Added DEFAULT_AZURE_OUTPUT_FORMAT constant 7. Added AbortController timeout for synthesize requests 8. Used type assertion for config.azure access (req.config as any) All changes follow the suggested fixes from greptile-apps and chatgpt-codex-connector reviews. 
--- src/tts/providers/azure.ts | 38 +++++++++++++++++++++++--------------- src/tts/tts.ts | 31 +++++++++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 17 deletions(-) diff --git a/src/tts/providers/azure.ts b/src/tts/providers/azure.ts index 04db3b4fa82..f03099905b5 100644 --- a/src/tts/providers/azure.ts +++ b/src/tts/providers/azure.ts @@ -27,9 +27,11 @@ export async function listAzureVoices(params: { region?: string; baseUrl?: string; }): Promise<SpeechVoiceOption[]> { - const base = normalizeAzureBaseUrl(params.baseUrl); const region = params.region || "eastus"; - const url = `https://${region}.tts.speech.microsoft.com/cognitiveservices/voices/list`; + // Use baseUrl if provided, otherwise derive from region + const url = params.baseUrl ? `${normalizeAzureBaseUrl(params.baseUrl)}/cognitiveservices/voices/list` : `https://${region}.tts.speech.microsoft.com/cognitiveservices/voices/list`; const response = await fetch(url, { headers: { @@ -42,8 +44,10 @@ export async function listAzureVoices(params: { } const voices = (await response.json()) as AzureVoiceListEntry[]; + // Filter deprecated voices BEFORE mapping (Status field is available here) return Array.isArray(voices) ? voices + .filter((voice) => voice.Status !== "Deprecated") .map((voice) => ({ id: voice.ShortName?.trim() ??
"", name: voice.DisplayName?.trim() || voice.ShortName?.trim() || undefined, @@ -51,7 +55,7 @@ export async function listAzureVoices(params: { locale: voice.Locale?.trim() || undefined, gender: voice.Gender?.trim() || undefined, })) - .filter((voice) => voice.id.length > 0 && voice.Status !== "Deprecated") + .filter((voice) => voice.id.length > 0) : []; } @@ -74,43 +78,46 @@ export function buildAzureSpeechProvider(): SpeechProviderPlugin { listVoices: async (req) => { const apiKey = req.apiKey || - req.config?.azure?.apiKey || + (req.config as any)?.azure?.apiKey || process.env.AZURE_SPEECH_API_KEY; if (!apiKey) { throw new Error("Azure Speech API key missing"); } return listAzureVoices({ apiKey, - region: req.config?.azure?.region || process.env.AZURE_SPEECH_REGION, - baseUrl: req.config?.azure?.baseUrl, + region: (req.config as any)?.azure?.region || process.env.AZURE_SPEECH_REGION, + baseUrl: (req.config as any)?.azure?.baseUrl, }); }, isConfigured: ({ config }) => Boolean( - config.azure?.apiKey || + (config as any)?.azure?.apiKey || process.env.AZURE_SPEECH_API_KEY, ), synthesize: async (req) => { const apiKey = - req.config.azure?.apiKey || process.env.AZURE_SPEECH_API_KEY; + (req.config as any)?.azure?.apiKey || process.env.AZURE_SPEECH_API_KEY; if (!apiKey) { throw new Error("Azure Speech API key missing"); } - const region = req.config?.azure?.region || process.env.AZURE_SPEECH_REGION || "eastus"; - const baseUrl = normalizeAzureBaseUrl(req.config?.azure?.baseUrl); - const voice = req.overrides?.azure?.voice ?? req.config?.azure?.voice; - const lang = req.overrides?.azure?.lang ?? req.config?.azure?.lang; + const region = (req.config as any)?.azure?.region || process.env.AZURE_SPEECH_REGION || "eastus"; + const baseUrl = (req.config as any)?.azure?.baseUrl; + // Use baseUrl if provided, otherwise derive from region + const endpoint = baseUrl + ? 
`${normalizeAzureBaseUrl(baseUrl)}/cognitiveservices/v1` : `https://${region}.tts.speech.microsoft.com/cognitiveservices/v1`; + + const voice = req.overrides?.azure?.voice ?? (req.config as any)?.azure?.voice; + const lang = req.overrides?.azure?.lang ?? (req.config as any)?.azure?.lang; + const outputFormat = - req.overrides?.azure?.outputFormat ?? - req.config?.azure?.outputFormat ?? + req.overrides?.azure?.outputFormat ?? (req.config as any)?.azure?.outputFormat ?? DEFAULT_AZURE_OUTPUT_FORMAT; if (!voice) { throw new Error("Azure voice not configured"); } - const endpoint = `${baseUrl}/cognitiveservices/v1`; const ssml = buildAzureSSML(req.text, voice, lang); const response = await fetch(endpoint, { @@ -121,6 +128,7 @@ export function buildAzureSpeechProvider(): SpeechProviderPlugin { "X-Microsoft-OutputFormat": outputFormat, }, body: ssml, + signal: AbortSignal.timeout((req.config as any)?.azure?.timeoutMs ?? 30000), }); if (!response.ok) { diff --git a/src/tts/tts.ts b/src/tts/tts.ts index 17a7c2fc981..2ddc8d1b8cf 100644 --- a/src/tts/tts.ts +++ b/src/tts/tts.ts @@ -59,6 +59,7 @@ const DEFAULT_OPENAI_VOICE = "alloy"; const DEFAULT_EDGE_VOICE = "en-US-MichelleNeural"; const DEFAULT_EDGE_LANG = "en-US"; const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3"; +const DEFAULT_AZURE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3"; const DEFAULT_ELEVENLABS_VOICE_SETTINGS = { stability: 0.5, @@ -117,7 +118,17 @@ export type ResolvedTtsConfig = { speed?: number; instructions?: string; }; - edge: { + + azure: { + apiKey?: string; + region: string; + baseUrl: string; + voice: string; + lang: string; + outputFormat: string; + timeoutMs?: number; + }; +  edge: { enabled: boolean; voice: string; lang: string; @@ -177,7 +188,12 @@ export type TtsDirectiveOverrides = { voice?: string; outputFormat?: string; }; -}; + azure?: { + voice?: string; + lang?: string; + outputFormat?: string; + }; +}; export type TtsDirectiveParseResult = { cleanedText: string; @@ -324,6 +339,18 @@ export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig { speed:
raw.openai?.speed, instructions: raw.openai?.instructions?.trim() || undefined, }, + azure: { + apiKey: normalizeResolvedSecretInputString({ + value: raw.azure?.apiKey, + path: "messages.tts.azure.apiKey", + }), + region: raw.azure?.region?.trim() || process.env.AZURE_SPEECH_REGION || "eastus", + baseUrl: raw.azure?.baseUrl?.trim() || "", + voice: raw.azure?.voice || "", + lang: raw.azure?.lang?.trim() || "en-US", + outputFormat: raw.azure?.outputFormat?.trim() || DEFAULT_AZURE_OUTPUT_FORMAT, + timeoutMs: raw.azure?.timeoutMs, + }, edge: { enabled: rawMicrosoft.enabled ?? true, voice: rawMicrosoft.voice?.trim() || DEFAULT_EDGE_VOICE,