Merge ea9ffd2659e29e612fa77245b150ffee34439632 into 5e417b44e1540f528d2ae63e3e20229a902d1db2
This commit is contained in:
commit
8e924c76fc
@ -93,6 +93,16 @@ export type TtsConfig = {
|
||||
proxy?: string;
|
||||
timeoutMs?: number;
|
||||
};
|
||||
/** Azure Speech configuration. */
|
||||
azure?: {
|
||||
apiKey?: SecretInput;
|
||||
region?: string;
|
||||
baseUrl?: string;
|
||||
voice?: string;
|
||||
lang?: string;
|
||||
outputFormat?: string;
|
||||
timeoutMs?: number;
|
||||
};
|
||||
/** Optional path for local TTS user preferences JSON. */
|
||||
prefsPath?: string;
|
||||
/** Hard cap for text sent to TTS (chars). */
|
||||
|
||||
@ -392,6 +392,18 @@ const TtsMicrosoftConfigSchema = z
|
||||
})
|
||||
.strict()
|
||||
.optional();
|
||||
/**
 * Config schema for the Azure Speech TTS provider section
 * (`messages.tts.azure`). Every field is optional; `.strict()` rejects
 * unknown keys and the section as a whole may be omitted.
 */
const TtsAzureConfigSchema = z
  .object({
    // Registered as sensitive so the key is redacted when config is surfaced.
    apiKey: SecretInputSchema.optional().register(sensitive),
    region: z.string().optional(),
    baseUrl: z.string().optional(),
    voice: z.string().optional(),
    lang: z.string().optional(),
    outputFormat: z.string().optional(),
    // Request timeout in milliseconds, bounded to 1s–120s (same bounds as
    // the top-level TtsConfigSchema.timeoutMs).
    timeoutMs: z.number().int().min(1000).max(120000).optional(),
  })
  .strict()
  .optional();
|
||||
export const TtsConfigSchema = z
|
||||
.object({
|
||||
auto: TtsAutoSchema.optional(),
|
||||
@ -447,6 +459,7 @@ export const TtsConfigSchema = z
|
||||
.optional(),
|
||||
edge: TtsMicrosoftConfigSchema,
|
||||
microsoft: TtsMicrosoftConfigSchema,
|
||||
azure: TtsAzureConfigSchema,
|
||||
prefsPath: z.string().optional(),
|
||||
maxTextLength: z.number().int().min(1).optional(),
|
||||
timeoutMs: z.number().int().min(1000).max(120000).optional(),
|
||||
|
||||
@ -3,6 +3,7 @@ import { loadOpenClawPlugins } from "../plugins/loader.js";
|
||||
import { getActivePluginRegistry } from "../plugins/runtime.js";
|
||||
import type { SpeechProviderPlugin } from "../plugins/types.js";
|
||||
import type { SpeechProviderId } from "./provider-types.js";
|
||||
import { buildAzureSpeechProvider } from "./providers/azure.js";
|
||||
import { buildElevenLabsSpeechProvider } from "./providers/elevenlabs.js";
|
||||
import { buildMicrosoftSpeechProvider } from "./providers/microsoft.js";
|
||||
import { buildOpenAISpeechProvider } from "./providers/openai.js";
|
||||
@ -11,6 +12,7 @@ const BUILTIN_SPEECH_PROVIDER_BUILDERS = [
|
||||
buildOpenAISpeechProvider,
|
||||
buildElevenLabsSpeechProvider,
|
||||
buildMicrosoftSpeechProvider,
|
||||
buildAzureSpeechProvider,
|
||||
] as const satisfies readonly (() => SpeechProviderPlugin)[];
|
||||
|
||||
function trimToUndefined(value: string | undefined): string | undefined {
|
||||
|
||||
125
src/tts/providers/azure.test.ts
Normal file
125
src/tts/providers/azure.test.ts
Normal file
@ -0,0 +1,125 @@
|
||||
import { afterEach, describe, expect, it, vi } from "vitest";
|
||||
import { listAzureVoices } from "./azure.js";
|
||||
|
||||
// Unit tests for listAzureVoices. Each test swaps `globalThis.fetch` for a
// mock and afterEach restores the original, so no network access happens.
describe("listAzureVoices", () => {
  // Keep a reference to the real fetch so afterEach can restore it.
  const originalFetch = globalThis.fetch;

  afterEach(() => {
    globalThis.fetch = originalFetch;
    vi.restoreAllMocks();
  });

  // Happy path: Azure metadata fields map onto the SpeechVoiceOption shape.
  it("maps Azure voice metadata into speech voice options", async () => {
    globalThis.fetch = vi.fn().mockResolvedValue(
      new Response(
        JSON.stringify([
          {
            Name: "Microsoft Server Speech Text to Speech Voice (zh-HK, HiuMaanNeural)",
            DisplayName: "HiuMaan Neural (zh-HK)",
            LocalName: "HiuMaan",
            ShortName: "zh-HK-HiuMaanNeural",
            Gender: "Female",
            Locale: "zh-HK",
            VoiceType: "Neural",
            Status: "Available",
          },
          {
            Name: "Microsoft Server Speech Text to Speech Voice (zh-CN, XiaoxiaoNeural)",
            DisplayName: "Xiaoxiao Neural (zh-CN)",
            ShortName: "zh-CN-XiaoxiaoNeural",
            Gender: "Female",
            Locale: "zh-CN",
            VoiceType: "Neural",
            Status: "Available",
          },
        ]),
        { status: 200 },
      ),
    ) as typeof globalThis.fetch;

    const voices = await listAzureVoices({
      apiKey: "test-key",
      region: "eastus",
    });

    expect(voices).toEqual([
      {
        id: "zh-HK-HiuMaanNeural",
        name: "HiuMaan Neural (zh-HK)",
        category: "Neural",
        locale: "zh-HK",
        gender: "Female",
      },
      {
        id: "zh-CN-XiaoxiaoNeural",
        name: "Xiaoxiao Neural (zh-CN)",
        category: "Neural",
        locale: "zh-CN",
        gender: "Female",
      },
    ]);
    // The region must be baked into the host and the key sent as a header.
    expect(globalThis.fetch).toHaveBeenCalledWith(
      "https://eastus.tts.speech.microsoft.com/cognitiveservices/voices/list",
      expect.objectContaining({
        headers: expect.objectContaining({
          "Ocp-Apim-Subscription-Key": "test-key",
        }),
      }),
    );
  });

  // Entries with Status "Deprecated" must not be surfaced to callers.
  it("filters out deprecated voices", async () => {
    globalThis.fetch = vi.fn().mockResolvedValue(
      new Response(
        JSON.stringify([
          {
            ShortName: "zh-HK-HiuMaanNeural",
            Gender: "Female",
            Locale: "zh-HK",
            Status: "Available",
          },
          {
            ShortName: "zh-HK-OldVoice",
            Gender: "Male",
            Locale: "zh-HK",
            Status: "Deprecated",
          },
        ]),
        { status: 200 },
      ),
    ) as typeof globalThis.fetch;

    const voices = await listAzureVoices({
      apiKey: "test-key",
    });

    expect(voices).toHaveLength(1);
    expect(voices[0].id).toBe("zh-HK-HiuMaanNeural");
  });

  // Non-OK HTTP responses surface as an error carrying the status code.
  it("throws on Azure voice list failures", async () => {
    globalThis.fetch = vi
      .fn()
      .mockResolvedValue(new Response("nope", { status: 503 })) as typeof globalThis.fetch;

    await expect(
      listAzureVoices({ apiKey: "test-key", region: "eastus" }),
    ).rejects.toThrow("Azure voices API error (503)");
  });

  // An explicit baseUrl takes precedence over the region-derived host.
  it("uses custom baseUrl when provided", async () => {
    globalThis.fetch = vi.fn().mockResolvedValue(
      new Response(JSON.stringify([]), { status: 200 }),
    ) as typeof globalThis.fetch;

    await listAzureVoices({
      apiKey: "test-key",
      baseUrl: "https://custom.region.tts.speech.microsoft.com",
    });

    expect(globalThis.fetch).toHaveBeenCalledWith(
      "https://custom.region.tts.speech.microsoft.com/cognitiveservices/voices/list",
      expect.any(Object),
    );
  });
});
|
||||
147
src/tts/providers/azure.ts
Normal file
147
src/tts/providers/azure.ts
Normal file
@ -0,0 +1,147 @@
|
||||
import type { SpeechProviderPlugin } from "../../plugins/types.js";
|
||||
import type { SpeechVoiceOption } from "../provider-types.js";
|
||||
|
||||
// Default audio output format for Azure synthesis (24 kHz, 48 kbit/s mono MP3),
// used when `azure.outputFormat` is not configured.
const DEFAULT_AZURE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
|
||||
|
||||
/**
 * Shape of one entry in the Azure Speech `voices/list` response.
 * Every field is optional because the response is untrusted external JSON;
 * only the fields this module touches are declared.
 */
type AzureVoiceListEntry = {
  Name?: string; // Full voice resource name (not read by this module).
  DisplayName?: string; // Human-readable name; preferred SpeechVoiceOption.name.
  LocalName?: string; // Localized name (not read by this module).
  ShortName?: string; // Stable identifier (e.g. "zh-CN-XiaoxiaoNeural"); used as the voice id.
  Gender?: string;
  Locale?: string; // Language tag, e.g. "zh-HK".
  VoiceType?: string; // e.g. "Neural"; mapped to SpeechVoiceOption.category.
  Status?: string; // Entries with "Deprecated" are filtered out.
};
|
||||
|
||||
function normalizeAzureBaseUrl(baseUrl: string | undefined): string {
|
||||
const trimmed = baseUrl?.trim();
|
||||
if (!trimmed) {
|
||||
return "https://eastus.tts.speech.microsoft.com";
|
||||
}
|
||||
return trimmed.replace(/\/+$/, "");
|
||||
}
|
||||
|
||||
export async function listAzureVoices(params: {
|
||||
apiKey: string;
|
||||
region?: string;
|
||||
baseUrl?: string;
|
||||
}): Promise<SpeechVoiceOption[]> {
|
||||
const region = params.region || "eastus";
|
||||
// Use baseUrl if provided, otherwise derive from region
|
||||
const url = params.baseUrl
|
||||
? `${normalizeAzureBaseUrl(params.baseUrl)}/cognitiveservices/voices/list`
|
||||
: `https://${region}.tts.speech.microsoft.com/cognitiveservices/voices/list`;
|
||||
|
||||
const response = await fetch(url, {
|
||||
headers: {
|
||||
"Ocp-Apim-Subscription-Key": params.apiKey,
|
||||
},
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`Azure voices API error (${response.status})`);
|
||||
}
|
||||
|
||||
const voices = (await response.json()) as AzureVoiceListEntry[];
|
||||
// Filter deprecated voices BEFORE mapping (Status field is available here)
|
||||
return Array.isArray(voices)
|
||||
? voices
|
||||
.filter((voice) => voice.Status !== "Deprecated")
|
||||
.map((voice) => ({
|
||||
id: voice.ShortName?.trim() ?? "",
|
||||
name: voice.DisplayName?.trim() || voice.ShortName?.trim() || undefined,
|
||||
category: voice.VoiceType?.trim() || undefined,
|
||||
locale: voice.Locale?.trim() || undefined,
|
||||
gender: voice.Gender?.trim() || undefined,
|
||||
}))
|
||||
.filter((voice) => voice.id.length > 0)
|
||||
: [];
|
||||
}
|
||||
|
||||
function buildAzureSSML(text: string, voice: string, lang?: string): string {
|
||||
const escapedText = text
|
||||
.replace(/&/g, "&")
|
||||
.replace(/</g, "<")
|
||||
.replace(/>/g, ">")
|
||||
.replace(/"/g, """)
|
||||
.replace(/'/g, "'");
|
||||
|
||||
return `<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='${lang || "en-US"}'><voice name='${voice}'>${escapedText}</voice></speak>`;
|
||||
}
|
||||
|
||||
export function buildAzureSpeechProvider(): SpeechProviderPlugin {
|
||||
return {
|
||||
id: "azure",
|
||||
label: "Azure Speech",
|
||||
aliases: ["azure-tts"],
|
||||
listVoices: async (req) => {
|
||||
const apiKey =
|
||||
req.apiKey ||
|
||||
(req.config as any)?.azure?.apiKey ||
|
||||
process.env.AZURE_SPEECH_API_KEY;
|
||||
if (!apiKey) {
|
||||
throw new Error("Azure Speech API key missing");
|
||||
}
|
||||
return listAzureVoices({
|
||||
apiKey,
|
||||
region: (req.config as any)?.azure?.region || process.env.AZURE_SPEECH_REGION,
|
||||
baseUrl: (req.config as any)?.azure?.baseUrl,
|
||||
});
|
||||
},
|
||||
isConfigured: ({ config }) =>
|
||||
Boolean(
|
||||
(config as any)?.azure?.apiKey ||
|
||||
process.env.AZURE_SPEECH_API_KEY,
|
||||
),
|
||||
synthesize: async (req) => {
|
||||
const apiKey =
|
||||
(req.config as any)?.azure?.apiKey || process.env.AZURE_SPEECH_API_KEY;
|
||||
if (!apiKey) {
|
||||
throw new Error("Azure Speech API key missing");
|
||||
}
|
||||
|
||||
const region = (req.config as any)?.azure?.region || process.env.AZURE_SPEECH_REGION || "eastus";
|
||||
const baseUrl = (req.config as any)?.azure?.baseUrl;
|
||||
// Use baseUrl if provided, otherwise derive from region
|
||||
const endpoint = baseUrl
|
||||
? `${normalizeAzureBaseUrl(baseUrl)}/cognitiveservices/v1`
|
||||
: `https://${region}.tts.speech.microsoft.com/cognitiveservices/v1`;
|
||||
|
||||
const voice = (req.config as any)?.azure?.voice;
|
||||
const lang = (req.config as any)?.azure?.lang;
|
||||
const outputFormat =
|
||||
(req.config as any)?.azure?.outputFormat ??
|
||||
DEFAULT_AZURE_OUTPUT_FORMAT;
|
||||
|
||||
if (!voice) {
|
||||
throw new Error("Azure voice not configured");
|
||||
}
|
||||
|
||||
const ssml = buildAzureSSML(req.text, voice, lang);
|
||||
|
||||
const response = await fetch(endpoint, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Ocp-Apim-Subscription-Key": apiKey,
|
||||
"Content-Type": "application/ssml+xml",
|
||||
"X-Microsoft-OutputFormat": outputFormat,
|
||||
},
|
||||
body: ssml,
|
||||
signal: AbortSignal.timeout((req.config as any)?.azure?.timeoutMs ?? 30000),
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`Azure TTS failed: ${response.status} ${response.statusText}`);
|
||||
}
|
||||
|
||||
const audioBuffer = await response.arrayBuffer();
|
||||
return {
|
||||
audioBuffer: Buffer.from(audioBuffer),
|
||||
outputFormat,
|
||||
fileExtension: outputFormat.includes("mp3") ? ".mp3" : ".wav",
|
||||
voiceCompatible: true,
|
||||
};
|
||||
},
|
||||
};
|
||||
}
|
||||
@ -59,6 +59,7 @@ const DEFAULT_OPENAI_VOICE = "alloy";
|
||||
const DEFAULT_EDGE_VOICE = "en-US-MichelleNeural";
|
||||
const DEFAULT_EDGE_LANG = "en-US";
|
||||
const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
|
||||
const DEFAULT_AZURE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
|
||||
|
||||
const DEFAULT_ELEVENLABS_VOICE_SETTINGS = {
|
||||
stability: 0.5,
|
||||
@ -117,7 +118,17 @@ export type ResolvedTtsConfig = {
|
||||
speed?: number;
|
||||
instructions?: string;
|
||||
};
|
||||
edge: {
|
||||
|
||||
azure: {
|
||||
apiKey?: string;
|
||||
region: string;
|
||||
baseUrl: string;
|
||||
voice: string;
|
||||
lang: string;
|
||||
outputFormat: string;
|
||||
timeoutMs?: number;
|
||||
};
|
||||
edge: {
|
||||
enabled: boolean;
|
||||
voice: string;
|
||||
lang: string;
|
||||
@ -177,7 +188,11 @@ export type TtsDirectiveOverrides = {
|
||||
voice?: string;
|
||||
outputFormat?: string;
|
||||
};
|
||||
};
|
||||
azure?: {
|
||||
voice?: string;
|
||||
lang?: string;
|
||||
outputFormat?: string;
|
||||
};
|
||||
|
||||
export type TtsDirectiveParseResult = {
|
||||
cleanedText: string;
|
||||
@ -324,6 +339,18 @@ export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig {
|
||||
speed: raw.openai?.speed,
|
||||
instructions: raw.openai?.instructions?.trim() || undefined,
|
||||
},
|
||||
azure: {
|
||||
apiKey: normalizeResolvedSecretInputString({
|
||||
value: raw.azure?.apiKey,
|
||||
path: "messages.tts.azure.apiKey",
|
||||
}),
|
||||
region: raw.azure?.region?.trim() || process.env.AZURE_SPEECH_REGION || "eastus",
|
||||
baseUrl: raw.azure?.baseUrl?.trim() || "",
|
||||
voice: raw.azure?.voice || "",
|
||||
lang: raw.azure?.lang?.trim() || "en-US",
|
||||
outputFormat: raw.azure?.outputFormat?.trim() || DEFAULT_AZURE_OUTPUT_FORMAT,
|
||||
timeoutMs: raw.azure?.timeoutMs,
|
||||
},
|
||||
edge: {
|
||||
enabled: rawMicrosoft.enabled ?? true,
|
||||
voice: rawMicrosoft.voice?.trim() || DEFAULT_EDGE_VOICE,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user