From 33b95fed9aa147324207b580f89176e5c96f84fa Mon Sep 17 00:00:00 2001
From: Yobo <yobo@Mac.lan>
Date: Fri, 20 Mar 2026 16:25:28 -0700
Subject: [PATCH] feat(tts): add Azure Speech TTS provider

- Add Azure TTS provider with SSML synthesis
- Support for 400+ neural voices including Cantonese (zh-HK)
- Config options: apiKey, region, baseUrl, voice, lang, outputFormat, timeoutMs
- Environment variables: AZURE_SPEECH_API_KEY, AZURE_SPEECH_REGION
- Provider ID: 'azure' with alias 'azure-tts'
- Example Cantonese voices (no default voice is built in; `voice` must be configured): zh-HK-HiuMaanNeural, zh-HK-HiuGaaiNeural
---
 src/config/types.tts.ts       |  10 +++
 src/config/zod-schema.core.ts |  13 ++++
 src/tts/provider-registry.ts  |   2 +
 src/tts/providers/azure.ts    | 139 ++++++++++++++++++++++++++++++++++
 4 files changed, 164 insertions(+)
 create mode 100644 src/tts/providers/azure.ts

diff --git a/src/config/types.tts.ts b/src/config/types.tts.ts
index 4703f43ae12..eaf0c35d242 100644
--- a/src/config/types.tts.ts
+++ b/src/config/types.tts.ts
@@ -93,6 +93,16 @@ export type TtsConfig = {
     proxy?: string;
     timeoutMs?: number;
   };
+  /** Azure Speech configuration. */
+  azure?: {
+    apiKey?: SecretInput;
+    region?: string;
+    baseUrl?: string;
+    voice?: string;
+    lang?: string;
+    outputFormat?: string;
+    timeoutMs?: number;
+  };
   /** Optional path for local TTS user preferences JSON. */
   prefsPath?: string;
   /** Hard cap for text sent to TTS (chars). */
diff --git a/src/config/zod-schema.core.ts b/src/config/zod-schema.core.ts
index 22c589c8490..d12de3fff35 100644
--- a/src/config/zod-schema.core.ts
+++ b/src/config/zod-schema.core.ts
@@ -392,6 +392,18 @@ const TtsMicrosoftConfigSchema = z
   })
   .strict()
   .optional();
+const TtsAzureConfigSchema = z // Optional `azure` section of the TTS config; every field is optional.
+  .object({
+    apiKey: SecretInputSchema.optional().register(sensitive),
+    region: z.string().optional(),
+    baseUrl: z.string().optional(),
+    voice: z.string().optional(),
+    lang: z.string().optional(),
+    outputFormat: z.string().optional(),
+    timeoutMs: z.number().int().min(1000).max(120000).optional(), // integer in [1000, 120000]
+  })
+  .strict()
+  .optional();
 export const TtsConfigSchema = z
   .object({
     auto: TtsAutoSchema.optional(),
@@ -447,6 +459,7 @@ export const TtsConfigSchema = z
       .optional(),
     edge: TtsMicrosoftConfigSchema,
     microsoft: TtsMicrosoftConfigSchema,
+    azure: TtsAzureConfigSchema,
     prefsPath: z.string().optional(),
     maxTextLength: z.number().int().min(1).optional(),
     timeoutMs: z.number().int().min(1000).max(120000).optional(),
diff --git a/src/tts/provider-registry.ts b/src/tts/provider-registry.ts
index d1462880a99..88a99b613d7 100644
--- a/src/tts/provider-registry.ts
+++ b/src/tts/provider-registry.ts
@@ -3,6 +3,7 @@ import { loadOpenClawPlugins } from "../plugins/loader.js";
 import { getActivePluginRegistry } from "../plugins/runtime.js";
 import type { SpeechProviderPlugin } from "../plugins/types.js";
 import type { SpeechProviderId } from "./provider-types.js";
+import { buildAzureSpeechProvider } from "./providers/azure.js";
 import { buildElevenLabsSpeechProvider } from "./providers/elevenlabs.js";
 import { buildMicrosoftSpeechProvider } from "./providers/microsoft.js";
 import { buildOpenAISpeechProvider } from "./providers/openai.js";
@@ -11,6 +12,7 @@ const BUILTIN_SPEECH_PROVIDER_BUILDERS = [
   buildOpenAISpeechProvider,
   buildElevenLabsSpeechProvider,
   buildMicrosoftSpeechProvider,
+  buildAzureSpeechProvider,
 ] as const satisfies readonly (() => SpeechProviderPlugin)[];
 
 function trimToUndefined(value: string | undefined): string | undefined {
diff --git a/src/tts/providers/azure.ts b/src/tts/providers/azure.ts
new file mode 100644
index 00000000000..04db3b4fa82
--- /dev/null
+++ b/src/tts/providers/azure.ts
@@ -0,0 +1,139 @@
+import type { SpeechProviderPlugin } from "../../plugins/types.js";
+import type { SpeechVoiceOption } from "../provider-types.js";
+/** Output format requested when neither the overrides nor the config specify one. */
+const DEFAULT_AZURE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
+/** Shape assumed for one element of the voices-list JSON array; the response is cast, not validated. */
+type AzureVoiceListEntry = {
+  Name?: string;
+  DisplayName?: string;
+  LocalName?: string;
+  ShortName?: string;
+  Gender?: string;
+  Locale?: string;
+  VoiceType?: string;
+  Status?: string;
+};
+
+/** Trims `baseUrl` and strips trailing slashes; blank or missing input yields the eastus host. */
+function normalizeAzureBaseUrl(baseUrl: string | undefined): string {
+  const candidate = (baseUrl ?? "").trim();
+  return candidate.length > 0
+    ? candidate.replace(/\/+$/, "")
+    : "https://eastus.tts.speech.microsoft.com";
+}
+
+/**
+ * Lists non-deprecated Azure voices. A non-empty `baseUrl` wins over `region`
+ * (default "eastus"). Throws when the voices endpoint returns a non-OK status.
+ */
+export async function listAzureVoices(params: {
+  apiKey: string;
+  region?: string;
+  baseUrl?: string;
+}): Promise<SpeechVoiceOption[]> {
+  const region = params.region?.trim() || "eastus";
+  const origin = params.baseUrl?.trim()
+    ? normalizeAzureBaseUrl(params.baseUrl)
+    : `https://${region}.tts.speech.microsoft.com`;
+  const response = await fetch(`${origin}/cognitiveservices/voices/list`, {
+    headers: { "Ocp-Apim-Subscription-Key": params.apiKey },
+  });
+  if (!response.ok) {
+    throw new Error(`Azure voices API error (${response.status})`);
+  }
+  const payload: unknown = await response.json();
+  const voices = Array.isArray(payload) ? (payload as AzureVoiceListEntry[]) : [];
+  return voices
+    // Status exists only on the raw entry, so filter before mapping it away.
+    .filter((voice) => Boolean(voice.ShortName?.trim()) && voice.Status !== "Deprecated")
+    .map((voice) => ({
+      id: voice.ShortName?.trim() ?? "",
+      name: voice.DisplayName?.trim() || voice.ShortName?.trim() || undefined,
+      category: voice.VoiceType?.trim() || undefined,
+      locale: voice.Locale?.trim() || undefined,
+      gender: voice.Gender?.trim() || undefined,
+    }));
+}
+
+/** Builds single-voice SSML; the text, the voice name and the lang are all XML-escaped. */
+function buildAzureSSML(text: string, voice: string, lang?: string): string {
+  // voice/lang land inside single-quoted attributes, so they need escaping too.
+  const entities: Record<string, string> = {
+    "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&apos;",
+  };
+  const esc = (raw: string): string => raw.replace(/[&<>"']/g, (ch) => entities[ch] ?? ch);
+  const locale = lang || "en-US";
+  return `<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='${esc(locale)}'><voice name='${esc(voice)}'>${esc(text)}</voice></speak>`;
+}
+
+/**
+ * Creates the built-in Azure Speech provider (id "azure", alias "azure-tts").
+ *
+ * The API key comes from `config.azure.apiKey` or AZURE_SPEECH_API_KEY; the region from
+ * `config.azure.region`, then AZURE_SPEECH_REGION, then "eastus". For synthesis a
+ * non-empty `config.azure.baseUrl` takes precedence over the regional endpoint.
+ */
+export function buildAzureSpeechProvider(): SpeechProviderPlugin {
+  return {
+    id: "azure",
+    label: "Azure Speech",
+    aliases: ["azure-tts"],
+    listVoices: async (req) => {
+      const apiKey =
+        req.apiKey || req.config?.azure?.apiKey || process.env.AZURE_SPEECH_API_KEY;
+      if (!apiKey) {
+        throw new Error("Azure Speech API key missing");
+      }
+      return listAzureVoices({
+        apiKey,
+        region: req.config?.azure?.region || process.env.AZURE_SPEECH_REGION,
+        baseUrl: req.config?.azure?.baseUrl,
+      });
+    },
+    isConfigured: ({ config }) =>
+      Boolean(config.azure?.apiKey || process.env.AZURE_SPEECH_API_KEY),
+    synthesize: async (req) => {
+      const azure = req.config?.azure;
+      const apiKey = azure?.apiKey || process.env.AZURE_SPEECH_API_KEY;
+      if (!apiKey) {
+        throw new Error("Azure Speech API key missing");
+      }
+      const voice = req.overrides?.azure?.voice ?? azure?.voice;
+      if (!voice) {
+        throw new Error("Azure voice not configured");
+      }
+      const lang = req.overrides?.azure?.lang ?? azure?.lang;
+      const outputFormat =
+        req.overrides?.azure?.outputFormat ?? azure?.outputFormat ?? DEFAULT_AZURE_OUTPUT_FORMAT;
+
+      // Without an explicit baseUrl, target the configured region's endpoint instead of
+      // a fixed default host; Speech resource keys are tied to a region.
+      const region = azure?.region || process.env.AZURE_SPEECH_REGION || "eastus";
+      const baseUrl = azure?.baseUrl?.trim()
+        ? normalizeAzureBaseUrl(azure?.baseUrl)
+        : `https://${region}.tts.speech.microsoft.com`;
+      const timeoutMs = azure?.timeoutMs;
+
+      const response = await fetch(`${baseUrl}/cognitiveservices/v1`, {
+        method: "POST",
+        headers: {
+          "Ocp-Apim-Subscription-Key": apiKey,
+          "Content-Type": "application/ssml+xml",
+          "X-Microsoft-OutputFormat": outputFormat,
+        },
+        body: buildAzureSSML(req.text, voice, lang),
+        // Apply config.azure.timeoutMs when set; otherwise no deadline is imposed.
+        signal: timeoutMs ? AbortSignal.timeout(timeoutMs) : undefined,
+      });
+      if (!response.ok) {
+        throw new Error(`Azure TTS failed: ${response.status} ${response.statusText}`);
+      }
+      return {
+        audioBuffer: Buffer.from(await response.arrayBuffer()),
+        outputFormat,
+        fileExtension: outputFormat.includes("mp3") ? ".mp3" : ".wav",
+        voiceCompatible: true,
+      };
+    },
+  };
+}