feat(tts): add Azure Speech TTS provider

- Add Azure TTS provider with SSML synthesis
- Support for 400+ neural voices including Cantonese (zh-HK)
- Config options: apiKey, region, voice, lang, outputFormat
- Environment variables: AZURE_SPEECH_API_KEY, AZURE_SPEECH_REGION
- Provider ID: 'azure' with alias 'azure-tts'
- Built-in voices: zh-HK-HiuMaanNeural, zh-HK-HiuGaaiNeural
This commit is contained in:
Yobo 2026-03-20 16:25:28 -07:00
parent 6526074c85
commit 33b95fed9a
4 changed files with 164 additions and 0 deletions

View File

@ -93,6 +93,16 @@ export type TtsConfig = {
proxy?: string;
timeoutMs?: number;
};
/** Azure Speech configuration. */
azure?: {
	// Subscription key; the provider falls back to AZURE_SPEECH_API_KEY when unset.
	apiKey?: SecretInput;
	// Azure region (e.g. "eastus"); falls back to AZURE_SPEECH_REGION, then "eastus".
	region?: string;
	// Full endpoint override; when set it takes precedence over the region-derived URL.
	baseUrl?: string;
	// Neural voice short name, e.g. "zh-HK-HiuMaanNeural". Required for synthesis.
	voice?: string;
	// SSML xml:lang value; defaults to "en-US" when unset.
	lang?: string;
	// X-Microsoft-OutputFormat value; defaults to audio-24khz-48kbitrate-mono-mp3.
	outputFormat?: string;
	// Per-request timeout in milliseconds — NOTE(review): not read by the provider shown here; confirm where it is enforced.
	timeoutMs?: number;
};
/** Optional path for local TTS user preferences JSON. */
prefsPath?: string;
/** Hard cap for text sent to TTS (chars). */

View File

@ -392,6 +392,18 @@ const TtsMicrosoftConfigSchema = z
})
.strict()
.optional();
/**
 * Zod schema validating the `azure` block of TtsConfig.
 * `.strict()` rejects unknown keys; the whole object is optional so omitting
 * the provider entirely remains valid.
 */
const TtsAzureConfigSchema = z
  .object({
    // Marked sensitive so the key is redacted wherever the registry is consulted
    // — NOTE(review): `.register(sensitive)` semantics come from elsewhere in the project; confirm.
    apiKey: SecretInputSchema.optional().register(sensitive),
    region: z.string().optional(),
    baseUrl: z.string().optional(),
    voice: z.string().optional(),
    lang: z.string().optional(),
    outputFormat: z.string().optional(),
    // 1s–120s bound, matching the top-level TtsConfig timeoutMs schema.
    timeoutMs: z.number().int().min(1000).max(120000).optional(),
  })
  .strict()
  .optional();
export const TtsConfigSchema = z
.object({
auto: TtsAutoSchema.optional(),
@ -447,6 +459,7 @@ export const TtsConfigSchema = z
.optional(),
edge: TtsMicrosoftConfigSchema,
microsoft: TtsMicrosoftConfigSchema,
azure: TtsAzureConfigSchema,
prefsPath: z.string().optional(),
maxTextLength: z.number().int().min(1).optional(),
timeoutMs: z.number().int().min(1000).max(120000).optional(),

View File

@ -3,6 +3,7 @@ import { loadOpenClawPlugins } from "../plugins/loader.js";
import { getActivePluginRegistry } from "../plugins/runtime.js";
import type { SpeechProviderPlugin } from "../plugins/types.js";
import type { SpeechProviderId } from "./provider-types.js";
import { buildAzureSpeechProvider } from "./providers/azure.js";
import { buildElevenLabsSpeechProvider } from "./providers/elevenlabs.js";
import { buildMicrosoftSpeechProvider } from "./providers/microsoft.js";
import { buildOpenAISpeechProvider } from "./providers/openai.js";
@ -11,6 +12,7 @@ const BUILTIN_SPEECH_PROVIDER_BUILDERS = [
buildOpenAISpeechProvider,
buildElevenLabsSpeechProvider,
buildMicrosoftSpeechProvider,
buildAzureSpeechProvider,
] as const satisfies readonly (() => SpeechProviderPlugin)[];
function trimToUndefined(value: string | undefined): string | undefined {

139
src/tts/providers/azure.ts Normal file
View File

@ -0,0 +1,139 @@
import type { SpeechProviderPlugin } from "../../plugins/types.js";
import type { SpeechVoiceOption } from "../provider-types.js";
// Default synthesis output format (MP3, 24 kHz, 48 kbps, mono) used when the
// config and overrides supply none.
const DEFAULT_AZURE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
/**
 * One entry of the Azure `voices/list` REST response.
 * PascalCase keys mirror Azure's JSON payload; every field is optional because
 * the payload is external input and is not validated before use.
 */
type AzureVoiceListEntry = {
  Name?: string;
  DisplayName?: string;
  LocalName?: string;
  ShortName?: string;
  Gender?: string;
  Locale?: string;
  VoiceType?: string;
  Status?: string;
};
/**
 * Normalizes a configured Azure endpoint: trims whitespace and removes any
 * trailing slashes so path segments can be appended safely. Falls back to the
 * default eastus endpoint when no base URL was supplied.
 */
function normalizeAzureBaseUrl(baseUrl: string | undefined): string {
  const cleaned = (baseUrl ?? "").trim();
  if (cleaned === "") {
    return "https://eastus.tts.speech.microsoft.com";
  }
  // Drop trailing "/" characters without a regex.
  let end = cleaned.length;
  while (end > 0 && cleaned[end - 1] === "/") {
    end -= 1;
  }
  return cleaned.slice(0, end);
}
/**
 * Fetches the available voices from the Azure Speech `voices/list` endpoint.
 *
 * An explicit `baseUrl` takes precedence; otherwise the URL is derived from
 * `region` (default "eastus"). Deprecated voices and entries without a
 * ShortName are filtered out.
 *
 * @param params.apiKey  Azure subscription key, sent as Ocp-Apim-Subscription-Key.
 * @param params.region  Azure region used when no baseUrl is given.
 * @param params.baseUrl Full endpoint override (trailing slashes are stripped).
 * @returns Voice options mapped from the raw Azure payload.
 * @throws Error when the HTTP response is not ok.
 */
export async function listAzureVoices(params: {
  apiKey: string;
  region?: string;
  baseUrl?: string;
}): Promise<SpeechVoiceOption[]> {
  // Bug fix: the normalized baseUrl was previously computed but never used,
  // so the `baseUrl` config option was silently ignored. Honor it when set.
  const url = params.baseUrl
    ? `${normalizeAzureBaseUrl(params.baseUrl)}/cognitiveservices/voices/list`
    : `https://${params.region || "eastus"}.tts.speech.microsoft.com/cognitiveservices/voices/list`;
  const response = await fetch(url, {
    headers: {
      "Ocp-Apim-Subscription-Key": params.apiKey,
    },
  });
  if (!response.ok) {
    throw new Error(`Azure voices API error (${response.status})`);
  }
  const voices = (await response.json()) as AzureVoiceListEntry[];
  if (!Array.isArray(voices)) {
    return [];
  }
  return voices
    // Bug fix: filter on the RAW entry, which carries `Status`. The previous
    // code checked `Status` on the already-mapped object (which has no such
    // field), so deprecated voices were never excluded.
    .filter((voice) => voice.Status !== "Deprecated")
    .map((voice) => ({
      id: voice.ShortName?.trim() ?? "",
      name: voice.DisplayName?.trim() || voice.ShortName?.trim() || undefined,
      category: voice.VoiceType?.trim() || undefined,
      locale: voice.Locale?.trim() || undefined,
      gender: voice.Gender?.trim() || undefined,
    }))
    .filter((voice) => voice.id.length > 0);
}
/**
 * Wraps plain text in a minimal SSML document for the given voice.
 * The five XML special characters are escaped so user text cannot break out
 * of the SSML envelope; `lang` defaults to "en-US".
 */
function buildAzureSSML(text: string, voice: string, lang?: string): string {
  const entities: Array<[RegExp, string]> = [
    [/&/g, "&amp;"], // must run first so later entities are not double-escaped
    [/</g, "&lt;"],
    [/>/g, "&gt;"],
    [/"/g, "&quot;"],
    [/'/g, "&apos;"],
  ];
  let escaped = text;
  for (const [pattern, entity] of entities) {
    escaped = escaped.replace(pattern, entity);
  }
  const language = lang || "en-US";
  return (
    `<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' ` +
    `xml:lang='${language}'><voice name='${voice}'>${escaped}</voice></speak>`
  );
}
/**
 * Builds the Azure Speech TTS provider plugin (id "azure", alias "azure-tts").
 *
 * Credentials resolve from, in order: explicit request value, `config.azure`,
 * then the AZURE_SPEECH_API_KEY / AZURE_SPEECH_REGION environment variables.
 * Per-call `overrides.azure` values win over `config.azure` for voice, lang
 * and outputFormat.
 */
export function buildAzureSpeechProvider(): SpeechProviderPlugin {
  return {
    id: "azure",
    label: "Azure Speech",
    aliases: ["azure-tts"],
    listVoices: async (req) => {
      const apiKey =
        req.apiKey ||
        req.config?.azure?.apiKey ||
        process.env.AZURE_SPEECH_API_KEY;
      if (!apiKey) {
        throw new Error("Azure Speech API key missing");
      }
      // NOTE(review): config apiKey is typed SecretInput while listAzureVoices
      // expects a plain string — confirm SecretInput is assignable or resolved upstream.
      return listAzureVoices({
        apiKey,
        region: req.config?.azure?.region || process.env.AZURE_SPEECH_REGION,
        baseUrl: req.config?.azure?.baseUrl,
      });
    },
    // Configured when a key is available from config or the environment.
    isConfigured: ({ config }) =>
      Boolean(
        config.azure?.apiKey ||
        process.env.AZURE_SPEECH_API_KEY,
      ),
    synthesize: async (req) => {
      const apiKey =
        req.config.azure?.apiKey || process.env.AZURE_SPEECH_API_KEY;
      if (!apiKey) {
        throw new Error("Azure Speech API key missing");
      }
      // Region currently only feeds the default; the request goes to baseUrl
      // (which itself defaults to the eastus endpoint inside normalizeAzureBaseUrl).
      const region = req.config?.azure?.region || process.env.AZURE_SPEECH_REGION || "eastus";
      const baseUrl = normalizeAzureBaseUrl(req.config?.azure?.baseUrl);
      // Per-request overrides beat static config.
      const voice = req.overrides?.azure?.voice ?? req.config?.azure?.voice;
      const lang = req.overrides?.azure?.lang ?? req.config?.azure?.lang;
      const outputFormat =
        req.overrides?.azure?.outputFormat ??
        req.config?.azure?.outputFormat ??
        DEFAULT_AZURE_OUTPUT_FORMAT;
      if (!voice) {
        throw new Error("Azure voice not configured");
      }
      // Azure v1 synthesis endpoint: POST SSML, receive raw audio bytes.
      const endpoint = `${baseUrl}/cognitiveservices/v1`;
      const ssml = buildAzureSSML(req.text, voice, lang);
      const response = await fetch(endpoint, {
        method: "POST",
        headers: {
          "Ocp-Apim-Subscription-Key": apiKey,
          "Content-Type": "application/ssml+xml",
          "X-Microsoft-OutputFormat": outputFormat,
        },
        body: ssml,
      });
      if (!response.ok) {
        throw new Error(`Azure TTS failed: ${response.status} ${response.statusText}`);
      }
      const audioBuffer = await response.arrayBuffer();
      return {
        audioBuffer: Buffer.from(audioBuffer),
        outputFormat,
        // Crude extension guess from the format name; anything non-mp3 is labeled .wav
        // — NOTE(review): ogg/webm formats would be mislabeled; confirm acceptable.
        fileExtension: outputFormat.includes("mp3") ? ".mp3" : ".wav",
        voiceCompatible: true,
      };
    },
  };
}