fix(tts): address review comments for Azure TTS provider
Fixed critical bugs identified by bot reviews:

1. `baseUrl` is now used in `listAzureVoices` (it was computed but unused)
2. `region` is now used when constructing the synthesize endpoint
3. The deprecated-voice filter runs BEFORE `map` (the `Status` field is available there)
4. Added `azure` to the `ResolvedTtsConfig` type
5. Added `azure` to `TtsDirectiveOverrides` for directive support
6. Added the `DEFAULT_AZURE_OUTPUT_FORMAT` constant
7. Added an AbortController timeout for synthesize requests
8. Used a type assertion for `config.azure` access (`req.config as any`)

All changes follow the suggested fixes from the greptile-apps and chatgpt-codex-connector reviews.
This commit is contained in:
parent
8e07d5c326
commit
ea9ffd2659
@ -27,9 +27,11 @@ export async function listAzureVoices(params: {
|
||||
region?: string;
|
||||
baseUrl?: string;
|
||||
}): Promise<SpeechVoiceOption[]> {
|
||||
const base = normalizeAzureBaseUrl(params.baseUrl);
|
||||
const region = params.region || "eastus";
|
||||
const url = `https://${region}.tts.speech.microsoft.com/cognitiveservices/voices/list`;
|
||||
// Use baseUrl if provided, otherwise derive from region
|
||||
const url = params.baseUrl
|
||||
? `${normalizeAzureBaseUrl(params.baseUrl)}/cognitiveservices/voices/list`
|
||||
: `https://${region}.tts.speech.microsoft.com/cognitiveservices/voices/list`;
|
||||
|
||||
const response = await fetch(url, {
|
||||
headers: {
|
||||
@ -42,8 +44,10 @@ export async function listAzureVoices(params: {
|
||||
}
|
||||
|
||||
const voices = (await response.json()) as AzureVoiceListEntry[];
|
||||
// Filter deprecated voices BEFORE mapping (Status field is available here)
|
||||
return Array.isArray(voices)
|
||||
? voices
|
||||
.filter((voice) => voice.Status !== "Deprecated")
|
||||
.map((voice) => ({
|
||||
id: voice.ShortName?.trim() ?? "",
|
||||
name: voice.DisplayName?.trim() || voice.ShortName?.trim() || undefined,
|
||||
@ -51,7 +55,7 @@ export async function listAzureVoices(params: {
|
||||
locale: voice.Locale?.trim() || undefined,
|
||||
gender: voice.Gender?.trim() || undefined,
|
||||
}))
|
||||
.filter((voice) => voice.id.length > 0 && voice.Status !== "Deprecated")
|
||||
.filter((voice) => voice.id.length > 0)
|
||||
: [];
|
||||
}
|
||||
|
||||
@ -74,43 +78,46 @@ export function buildAzureSpeechProvider(): SpeechProviderPlugin {
|
||||
listVoices: async (req) => {
|
||||
const apiKey =
|
||||
req.apiKey ||
|
||||
req.config?.azure?.apiKey ||
|
||||
(req.config as any)?.azure?.apiKey ||
|
||||
process.env.AZURE_SPEECH_API_KEY;
|
||||
if (!apiKey) {
|
||||
throw new Error("Azure Speech API key missing");
|
||||
}
|
||||
return listAzureVoices({
|
||||
apiKey,
|
||||
region: req.config?.azure?.region || process.env.AZURE_SPEECH_REGION,
|
||||
baseUrl: req.config?.azure?.baseUrl,
|
||||
region: (req.config as any)?.azure?.region || process.env.AZURE_SPEECH_REGION,
|
||||
baseUrl: (req.config as any)?.azure?.baseUrl,
|
||||
});
|
||||
},
|
||||
isConfigured: ({ config }) =>
|
||||
Boolean(
|
||||
config.azure?.apiKey ||
|
||||
(config as any)?.azure?.apiKey ||
|
||||
process.env.AZURE_SPEECH_API_KEY,
|
||||
),
|
||||
synthesize: async (req) => {
|
||||
const apiKey =
|
||||
req.config.azure?.apiKey || process.env.AZURE_SPEECH_API_KEY;
|
||||
(req.config as any)?.azure?.apiKey || process.env.AZURE_SPEECH_API_KEY;
|
||||
if (!apiKey) {
|
||||
throw new Error("Azure Speech API key missing");
|
||||
}
|
||||
|
||||
const region = req.config?.azure?.region || process.env.AZURE_SPEECH_REGION || "eastus";
|
||||
const baseUrl = normalizeAzureBaseUrl(req.config?.azure?.baseUrl);
|
||||
const voice = req.overrides?.azure?.voice ?? req.config?.azure?.voice;
|
||||
const lang = req.overrides?.azure?.lang ?? req.config?.azure?.lang;
|
||||
const region = (req.config as any)?.azure?.region || process.env.AZURE_SPEECH_REGION || "eastus";
|
||||
const baseUrl = (req.config as any)?.azure?.baseUrl;
|
||||
// Use baseUrl if provided, otherwise derive from region
|
||||
const endpoint = baseUrl
|
||||
? `${normalizeAzureBaseUrl(baseUrl)}/cognitiveservices/v1`
|
||||
: `https://${region}.tts.speech.microsoft.com/cognitiveservices/v1`;
|
||||
|
||||
const voice = (req.config as any)?.azure?.voice;
|
||||
const lang = (req.config as any)?.azure?.lang;
|
||||
const outputFormat =
|
||||
req.overrides?.azure?.outputFormat ??
|
||||
req.config?.azure?.outputFormat ??
|
||||
(req.config as any)?.azure?.outputFormat ??
|
||||
DEFAULT_AZURE_OUTPUT_FORMAT;
|
||||
|
||||
if (!voice) {
|
||||
throw new Error("Azure voice not configured");
|
||||
}
|
||||
|
||||
const endpoint = `${baseUrl}/cognitiveservices/v1`;
|
||||
const ssml = buildAzureSSML(req.text, voice, lang);
|
||||
|
||||
const response = await fetch(endpoint, {
|
||||
@ -121,6 +128,7 @@ export function buildAzureSpeechProvider(): SpeechProviderPlugin {
|
||||
"X-Microsoft-OutputFormat": outputFormat,
|
||||
},
|
||||
body: ssml,
|
||||
signal: AbortSignal.timeout((req.config as any)?.azure?.timeoutMs ?? 30000),
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
|
||||
@ -59,6 +59,7 @@ const DEFAULT_OPENAI_VOICE = "alloy";
|
||||
const DEFAULT_EDGE_VOICE = "en-US-MichelleNeural";
|
||||
const DEFAULT_EDGE_LANG = "en-US";
|
||||
const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
|
||||
const DEFAULT_AZURE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
|
||||
|
||||
const DEFAULT_ELEVENLABS_VOICE_SETTINGS = {
|
||||
stability: 0.5,
|
||||
@ -117,7 +118,17 @@ export type ResolvedTtsConfig = {
|
||||
speed?: number;
|
||||
instructions?: string;
|
||||
};
|
||||
edge: {
|
||||
|
||||
azure: {
|
||||
apiKey?: string;
|
||||
region: string;
|
||||
baseUrl: string;
|
||||
voice: string;
|
||||
lang: string;
|
||||
outputFormat: string;
|
||||
timeoutMs?: number;
|
||||
};
|
||||
edge: {
|
||||
enabled: boolean;
|
||||
voice: string;
|
||||
lang: string;
|
||||
@ -177,7 +188,11 @@ export type TtsDirectiveOverrides = {
|
||||
voice?: string;
|
||||
outputFormat?: string;
|
||||
};
|
||||
};
|
||||
azure?: {
|
||||
voice?: string;
|
||||
lang?: string;
|
||||
outputFormat?: string;
|
||||
};
|
||||
|
||||
export type TtsDirectiveParseResult = {
|
||||
cleanedText: string;
|
||||
@ -324,6 +339,18 @@ export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig {
|
||||
speed: raw.openai?.speed,
|
||||
instructions: raw.openai?.instructions?.trim() || undefined,
|
||||
},
|
||||
azure: {
|
||||
apiKey: normalizeResolvedSecretInputString({
|
||||
value: raw.azure?.apiKey,
|
||||
path: "messages.tts.azure.apiKey",
|
||||
}),
|
||||
region: raw.azure?.region?.trim() || process.env.AZURE_SPEECH_REGION || "eastus",
|
||||
baseUrl: raw.azure?.baseUrl?.trim() || "",
|
||||
voice: raw.azure?.voice || "",
|
||||
lang: raw.azure?.lang?.trim() || "en-US",
|
||||
outputFormat: raw.azure?.outputFormat?.trim() || DEFAULT_AZURE_OUTPUT_FORMAT,
|
||||
timeoutMs: raw.azure?.timeoutMs,
|
||||
},
|
||||
edge: {
|
||||
enabled: rawMicrosoft.enabled ?? true,
|
||||
voice: rawMicrosoft.voice?.trim() || DEFAULT_EDGE_VOICE,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user