Merge ea9ffd2659e29e612fa77245b150ffee34439632 into 5e417b44e1540f528d2ae63e3e20229a902d1db2

2026-03-20 19:26:33 -07:00 · 2026-03-20 19:26:33 -07:00 · 8e924c76fc
commit 8e924c76fc
parent 5e417b44e1 ea9ffd2659
6 changed files with 326 additions and 2 deletions
--- a/src/config/types.tts.ts
+++ b/src/config/types.tts.ts
@ -93,6 +93,16 @@ export type TtsConfig = {
    proxy?: string;
    timeoutMs?: number;
  };
+  /** Azure Speech configuration. */
+  azure?: {
+    apiKey?: SecretInput;
+    region?: string;
+    baseUrl?: string;
+    voice?: string;
+    lang?: string;
+    outputFormat?: string;
+    timeoutMs?: number;
+  };
  /** Optional path for local TTS user preferences JSON. */
  prefsPath?: string;
  /** Hard cap for text sent to TTS (chars). */
--- a/src/config/zod-schema.core.ts
+++ b/src/config/zod-schema.core.ts
@ -392,6 +392,18 @@ const TtsMicrosoftConfigSchema = z
  })
  .strict()
  .optional();
+/**
+ * Validation schema for the optional `azure` section of the TTS config.
+ * Every field is optional, and `.strict()` rejects keys not listed here.
+ */
+const TtsAzureConfigSchema = z
+  .object({
+    // Secret input; registered with the `sensitive` registry.
+    apiKey: SecretInputSchema.optional().register(sensitive),
+    region: z.string().optional(),
+    baseUrl: z.string().optional(),
+    voice: z.string().optional(),
+    lang: z.string().optional(),
+    outputFormat: z.string().optional(),
+    // Integer in the range 1000..120000 (same bounds as the top-level `timeoutMs`).
+    timeoutMs: z.number().int().min(1000).max(120000).optional(),
+  })
+  .strict()
+  .optional();
 export const TtsConfigSchema = z
  .object({
    auto: TtsAutoSchema.optional(),
@ -447,6 +459,7 @@ export const TtsConfigSchema = z
      .optional(),
    edge: TtsMicrosoftConfigSchema,
    microsoft: TtsMicrosoftConfigSchema,
+    azure: TtsAzureConfigSchema,
    prefsPath: z.string().optional(),
    maxTextLength: z.number().int().min(1).optional(),
    timeoutMs: z.number().int().min(1000).max(120000).optional(),
--- a/src/tts/provider-registry.ts
+++ b/src/tts/provider-registry.ts
@ -3,6 +3,7 @@ import { loadOpenClawPlugins } from "../plugins/loader.js";
 import { getActivePluginRegistry } from "../plugins/runtime.js";
 import type { SpeechProviderPlugin } from "../plugins/types.js";
 import type { SpeechProviderId } from "./provider-types.js";
+import { buildAzureSpeechProvider } from "./providers/azure.js";
 import { buildElevenLabsSpeechProvider } from "./providers/elevenlabs.js";
 import { buildMicrosoftSpeechProvider } from "./providers/microsoft.js";
 import { buildOpenAISpeechProvider } from "./providers/openai.js";
@ -11,6 +12,7 @@ const BUILTIN_SPEECH_PROVIDER_BUILDERS = [
  buildOpenAISpeechProvider,
  buildElevenLabsSpeechProvider,
  buildMicrosoftSpeechProvider,
+  buildAzureSpeechProvider,
 ] as const satisfies readonly (() => SpeechProviderPlugin)[];

 function trimToUndefined(value: string | undefined): string | undefined {
--- a/src/tts/providers/azure.test.ts
+++ b/src/tts/providers/azure.test.ts
@ -0,0 +1,125 @@
+import { afterEach, describe, expect, it, vi } from "vitest";
+import { listAzureVoices } from "./azure.js";
+
+describe("listAzureVoices", () => {
+  const realFetch = globalThis.fetch;
+
+  // Replace the global fetch with a mock that always resolves to `response`.
+  const stubFetch = (response: Response): void => {
+    globalThis.fetch = vi.fn().mockResolvedValue(response) as typeof globalThis.fetch;
+  };
+
+  // Build an HTTP 200 response whose body is `payload` serialized as JSON.
+  const okJson = (payload: unknown): Response =>
+    new Response(JSON.stringify(payload), { status: 200 });
+
+  afterEach(() => {
+    globalThis.fetch = realFetch;
+    vi.restoreAllMocks();
+  });
+
+  it("maps Azure voice metadata into speech voice options", async () => {
+    const hiuMaan = {
+      Name: "Microsoft Server Speech Text to Speech Voice (zh-HK, HiuMaanNeural)",
+      DisplayName: "HiuMaan Neural (zh-HK)",
+      LocalName: "HiuMaan",
+      ShortName: "zh-HK-HiuMaanNeural",
+      Gender: "Female",
+      Locale: "zh-HK",
+      VoiceType: "Neural",
+      Status: "Available",
+    };
+    const xiaoxiao = {
+      Name: "Microsoft Server Speech Text to Speech Voice (zh-CN, XiaoxiaoNeural)",
+      DisplayName: "Xiaoxiao Neural (zh-CN)",
+      ShortName: "zh-CN-XiaoxiaoNeural",
+      Gender: "Female",
+      Locale: "zh-CN",
+      VoiceType: "Neural",
+      Status: "Available",
+    };
+    stubFetch(okJson([hiuMaan, xiaoxiao]));
+
+    const voices = await listAzureVoices({ apiKey: "test-key", region: "eastus" });
+
+    expect(voices).toEqual([
+      {
+        id: "zh-HK-HiuMaanNeural",
+        name: "HiuMaan Neural (zh-HK)",
+        category: "Neural",
+        locale: "zh-HK",
+        gender: "Female",
+      },
+      {
+        id: "zh-CN-XiaoxiaoNeural",
+        name: "Xiaoxiao Neural (zh-CN)",
+        category: "Neural",
+        locale: "zh-CN",
+        gender: "Female",
+      },
+    ]);
+
+    // The subscription key must travel in the Azure auth header.
+    const expectedHeaders = expect.objectContaining({
+      "Ocp-Apim-Subscription-Key": "test-key",
+    });
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      "https://eastus.tts.speech.microsoft.com/cognitiveservices/voices/list",
+      expect.objectContaining({ headers: expectedHeaders }),
+    );
+  });
+
+  it("filters out deprecated voices", async () => {
+    const available = {
+      ShortName: "zh-HK-HiuMaanNeural",
+      Gender: "Female",
+      Locale: "zh-HK",
+      Status: "Available",
+    };
+    const deprecated = {
+      ShortName: "zh-HK-OldVoice",
+      Gender: "Male",
+      Locale: "zh-HK",
+      Status: "Deprecated",
+    };
+    stubFetch(okJson([available, deprecated]));
+
+    const voices = await listAzureVoices({ apiKey: "test-key" });
+
+    expect(voices).toHaveLength(1);
+    expect(voices[0].id).toBe("zh-HK-HiuMaanNeural");
+  });
+
+  it("throws on Azure voice list failures", async () => {
+    stubFetch(new Response("nope", { status: 503 }));
+
+    const pending = listAzureVoices({ apiKey: "test-key", region: "eastus" });
+
+    await expect(pending).rejects.toThrow("Azure voices API error (503)");
+  });
+
+  it("uses custom baseUrl when provided", async () => {
+    stubFetch(okJson([]));
+
+    await listAzureVoices({
+      apiKey: "test-key",
+      baseUrl: "https://custom.region.tts.speech.microsoft.com",
+    });
+
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      "https://custom.region.tts.speech.microsoft.com/cognitiveservices/voices/list",
+      expect.any(Object),
+    );
+  });
+});
--- a/src/tts/providers/azure.ts
+++ b/src/tts/providers/azure.ts
@ -0,0 +1,147 @@
+import type { SpeechProviderPlugin } from "../../plugins/types.js";
+import type { SpeechVoiceOption } from "../provider-types.js";
+
+// Fallback value for the `X-Microsoft-OutputFormat` request header when the
+// config does not set `azure.outputFormat`.
+const DEFAULT_AZURE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
+
+/**
+ * One element of the JSON array returned by the Azure voices-list endpoint.
+ * All fields are treated as optional; `listAzureVoices` reads `ShortName`,
+ * `DisplayName`, `VoiceType`, `Locale`, `Gender` and `Status`.
+ */
+type AzureVoiceListEntry = {
+  Name?: string;
+  DisplayName?: string;
+  LocalName?: string;
+  ShortName?: string;
+  Gender?: string;
+  Locale?: string;
+  VoiceType?: string;
+  // Entries whose status is "Deprecated" are dropped from the voice list.
+  Status?: string;
+};
+
+/**
+ * Normalize a user-supplied Azure Speech base URL.
+ *
+ * @param baseUrl - Raw base URL; may be undefined, empty or padded with whitespace.
+ * @returns The trimmed URL with trailing slashes removed, or the eastus
+ *   endpoint when nothing usable was supplied.
+ */
+function normalizeAzureBaseUrl(baseUrl: string | undefined): string {
+  const candidate = (baseUrl ?? "").trim();
+  return candidate.length > 0
+    ? candidate.replace(/\/+$/, "")
+    : "https://eastus.tts.speech.microsoft.com";
+}
+
+/**
+ * Fetch the voices available to an Azure Speech resource.
+ *
+ * @param params.apiKey - Subscription key sent as `Ocp-Apim-Subscription-Key`.
+ * @param params.region - Azure region used to derive the host; blank falls back to "eastus".
+ * @param params.baseUrl - Optional endpoint override; takes precedence over `region`
+ *   unless it is empty or whitespace-only.
+ * @param params.timeoutMs - Abort the request after this many milliseconds (default 30000).
+ * @returns Non-deprecated voices that carry a non-empty `ShortName`; an empty
+ *   array when the response body is not a JSON array.
+ * @throws Error when the endpoint answers with a non-2xx status.
+ */
+export async function listAzureVoices(params: {
+  apiKey: string;
+  region?: string;
+  baseUrl?: string;
+  timeoutMs?: number;
+}): Promise<SpeechVoiceOption[]> {
+  // Trim before testing: a whitespace-only baseUrl must not shadow the region.
+  const customBase = params.baseUrl?.trim();
+  const region = params.region?.trim() || "eastus";
+  const origin = customBase
+    ? normalizeAzureBaseUrl(customBase)
+    : `https://${region}.tts.speech.microsoft.com`;
+
+  const response = await fetch(`${origin}/cognitiveservices/voices/list`, {
+    headers: {
+      "Ocp-Apim-Subscription-Key": params.apiKey,
+    },
+    // Without a deadline a stalled connection would hang the caller forever.
+    signal: AbortSignal.timeout(params.timeoutMs ?? 30000),
+  });
+
+  if (!response.ok) {
+    throw new Error(`Azure voices API error (${response.status})`);
+  }
+
+  const payload: unknown = await response.json();
+  if (!Array.isArray(payload)) {
+    return [];
+  }
+
+  // The payload is external JSON, so tolerate non-string fields.
+  const text = (value: unknown): string => (typeof value === "string" ? value.trim() : "");
+
+  const voices: SpeechVoiceOption[] = [];
+  for (const entry of payload as (AzureVoiceListEntry | null)[]) {
+    // Skip malformed entries and voices Azure has marked as deprecated.
+    if (!entry || typeof entry !== "object" || entry.Status === "Deprecated") {
+      continue;
+    }
+    const id = text(entry.ShortName);
+    if (id.length === 0) {
+      continue;
+    }
+    voices.push({
+      id,
+      name: text(entry.DisplayName) || id,
+      category: text(entry.VoiceType) || undefined,
+      locale: text(entry.Locale) || undefined,
+      gender: text(entry.Gender) || undefined,
+    });
+  }
+  return voices;
+}
+
+/**
+ * Wrap plain text in the minimal SSML document Azure Speech expects.
+ *
+ * @param text - Text to speak; XML-escaped, with characters that XML 1.0
+ *   forbids (most C0 controls) removed.
+ * @param voice - Azure voice short name; escaped before it is placed in the
+ *   `name` attribute.
+ * @param lang - Value for `xml:lang`; defaults to "en-US" when empty or omitted.
+ * @returns The SSML document as a single-line string.
+ */
+function buildAzureSSML(text: string, voice: string, lang?: string): string {
+  const entities: Record<string, string> = {
+    "&": "&amp;",
+    "<": "&lt;",
+    ">": "&gt;",
+    '"': "&quot;",
+    "'": "&apos;",
+  };
+  // One escaper for text and attributes: the attributes are single-quoted,
+  // so an unescaped "'" in voice/lang would end the attribute early.
+  const escapeXml = (value: string): string =>
+    value
+      .replace(/[\u0000-\u0008\u000B\u000C\u000E-\u001F]/g, "")
+      .replace(/[&<>"']/g, (ch) => entities[ch] ?? ch);
+
+  const langAttr = escapeXml(lang || "en-US");
+  const voiceAttr = escapeXml(voice);
+
+  return `<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='${langAttr}'><voice name='${voiceAttr}'>${escapeXml(text)}</voice></speak>`;
+}
+
+/** Azure-specific settings this provider reads from the TTS config object. */
+type AzureProviderSettings = {
+  apiKey?: string;
+  region?: string;
+  baseUrl?: string;
+  voice?: string;
+  lang?: string;
+  outputFormat?: string;
+  timeoutMs?: number;
+};
+
+/** Narrow a loosely-typed config to its `azure` section (empty object when absent). */
+function readAzureSettings(config: unknown): AzureProviderSettings {
+  const azure = (config as { azure?: AzureProviderSettings | null } | null | undefined)?.azure;
+  return azure ?? {};
+}
+
+/** Pick a file extension that matches the container of an Azure output format name. */
+function azureFileExtension(outputFormat: string): string {
+  const format = outputFormat.toLowerCase();
+  if (format.includes("mp3")) {
+    return ".mp3";
+  }
+  if (format.startsWith("ogg-")) {
+    return ".ogg";
+  }
+  if (format.startsWith("webm-")) {
+    return ".webm";
+  }
+  // Everything else keeps the previous ".wav" label.
+  // NOTE(review): headerless "raw-*" formats are not real WAV files — confirm
+  // how downstream consumers want those labelled.
+  return ".wav";
+}
+
+/**
+ * Build the built-in Azure Speech provider plugin.
+ *
+ * Settings come from the `azure` section of the config handed to each call.
+ * The API key falls back to `AZURE_SPEECH_API_KEY` and the region to
+ * `AZURE_SPEECH_REGION`, then "eastus".
+ *
+ * @returns A provider whose `listVoices` and `synthesize` throw when no API
+ *   key is available; `synthesize` also throws when no voice is configured
+ *   or when Azure answers with a non-2xx status.
+ */
+export function buildAzureSpeechProvider(): SpeechProviderPlugin {
+  return {
+    id: "azure",
+    label: "Azure Speech",
+    aliases: ["azure-tts"],
+    listVoices: async (req) => {
+      const settings = readAzureSettings(req.config);
+      const apiKey = req.apiKey || settings.apiKey || process.env.AZURE_SPEECH_API_KEY;
+      if (!apiKey) {
+        throw new Error("Azure Speech API key missing");
+      }
+      return listAzureVoices({
+        apiKey,
+        region: settings.region?.trim() || process.env.AZURE_SPEECH_REGION?.trim(),
+        // Blank overrides are dropped so they cannot shadow the region.
+        baseUrl: settings.baseUrl?.trim() || undefined,
+      });
+    },
+    isConfigured: ({ config }) =>
+      Boolean(readAzureSettings(config).apiKey || process.env.AZURE_SPEECH_API_KEY),
+    synthesize: async (req) => {
+      const settings = readAzureSettings(req.config);
+      const apiKey = settings.apiKey || process.env.AZURE_SPEECH_API_KEY;
+      if (!apiKey) {
+        throw new Error("Azure Speech API key missing");
+      }
+
+      const voice = settings.voice?.trim();
+      if (!voice) {
+        throw new Error("Azure voice not configured");
+      }
+
+      const region =
+        settings.region?.trim() || process.env.AZURE_SPEECH_REGION?.trim() || "eastus";
+      // Use baseUrl when it is non-blank, otherwise derive the host from region.
+      const baseUrl = settings.baseUrl?.trim();
+      const endpoint = baseUrl
+        ? `${normalizeAzureBaseUrl(baseUrl)}/cognitiveservices/v1`
+        : `https://${region}.tts.speech.microsoft.com/cognitiveservices/v1`;
+
+      // An empty format string would be sent as an empty header; use the default instead.
+      const outputFormat = settings.outputFormat?.trim() || DEFAULT_AZURE_OUTPUT_FORMAT;
+
+      const response = await fetch(endpoint, {
+        method: "POST",
+        headers: {
+          "Ocp-Apim-Subscription-Key": apiKey,
+          "Content-Type": "application/ssml+xml",
+          "X-Microsoft-OutputFormat": outputFormat,
+        },
+        body: buildAzureSSML(req.text, voice, settings.lang),
+        signal: AbortSignal.timeout(settings.timeoutMs ?? 30000),
+      });
+
+      if (!response.ok) {
+        throw new Error(`Azure TTS failed: ${response.status} ${response.statusText}`);
+      }
+
+      const audioBuffer = await response.arrayBuffer();
+      return {
+        audioBuffer: Buffer.from(audioBuffer),
+        outputFormat,
+        fileExtension: azureFileExtension(outputFormat),
+        // NOTE(review): reported unconditionally, as before — confirm whether
+        // this should depend on the chosen output format.
+        voiceCompatible: true,
+      };
+    },
+  };
+}
--- a/src/tts/tts.ts
+++ b/src/tts/tts.ts
@ -59,6 +59,7 @@ const DEFAULT_OPENAI_VOICE = "alloy";
 const DEFAULT_EDGE_VOICE = "en-US-MichelleNeural";
 const DEFAULT_EDGE_LANG = "en-US";
 const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
+const DEFAULT_AZURE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";

 const DEFAULT_ELEVENLABS_VOICE_SETTINGS = {
  stability: 0.5,
@ -117,7 +118,17 @@ export type ResolvedTtsConfig = {
    speed?: number;
    instructions?: string;
  };
+  /** Resolved Azure Speech settings. */
+  azure: {
+    apiKey?: string;
+    region: string;
+    baseUrl: string;
+    voice: string;
+    lang: string;
+    outputFormat: string;
+    timeoutMs?: number;
+  };
   edge: {
    enabled: boolean;
    voice: string;
    lang: string;
@ -177,7 +188,11 @@ export type TtsDirectiveOverrides = {
    voice?: string;
    outputFormat?: string;
  };
+  /** Azure Speech overrides. */
+  azure?: {
+    voice?: string;
+    lang?: string;
+    outputFormat?: string;
+  };
 };

 export type TtsDirectiveParseResult = {
  cleanedText: string;
@ -324,6 +339,18 @@ export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig {
      speed: raw.openai?.speed,
      instructions: raw.openai?.instructions?.trim() || undefined,
    },
+    azure: {
+      apiKey: normalizeResolvedSecretInputString({
+        value: raw.azure?.apiKey,
+        path: "messages.tts.azure.apiKey",
+      }),
+      region: raw.azure?.region?.trim() || process.env.AZURE_SPEECH_REGION || "eastus",
+      baseUrl: raw.azure?.baseUrl?.trim() || "",
+      voice: raw.azure?.voice || "",
+      lang: raw.azure?.lang?.trim() || "en-US",
+      outputFormat: raw.azure?.outputFormat?.trim() || DEFAULT_AZURE_OUTPUT_FORMAT,
+      timeoutMs: raw.azure?.timeoutMs,
+    },
    edge: {
      enabled: rawMicrosoft.enabled ?? true,
      voice: rawMicrosoft.voice?.trim() || DEFAULT_EDGE_VOICE,