From 5350f5b03596e1ee7b7407ea4ac7630b8272909b Mon Sep 17 00:00:00 2001 From: smthfoxy Date: Sat, 28 Feb 2026 13:41:22 +0800 Subject: [PATCH] fix(tts): use opus format and enable voice bubbles for feishu and whatsapp (#27366) * fix(tts): use opus format and enable voice bubbles for feishu and whatsapp Previously only Telegram received opus output and had `shouldVoice=true`. Feishu and WhatsApp also support voice-bubble playback and require opus audio, but were falling back to mp3 with `audioAsVoice=false`. - Extract VOICE_BUBBLE_CHANNELS set (telegram, feishu, whatsapp) - resolveOutputFormat: return TELEGRAM_OUTPUT (opus) for all voice-bubble channels - shouldVoice: enable for all voice-bubble channels, not just telegram - Update test to cover feishu and whatsapp cases * Changelog: add TTS voice-bubble channel coverage note --------- Co-authored-by: Ning Hu Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com> --- CHANGELOG.md | 1 + src/tts/tts.test.ts | 20 +++++++++++++++++++- src/tts/tts.ts | 8 ++++++-- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5953d01fee1..a3be77ed840 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,7 @@ Docs: https://docs.openclaw.ai - Feishu/Docx convert fallback chunking: recursively split oversized markdown chunks (including long no-heading sections) when `document.convert` hits content limits, while keeping fenced-code-aware split boundaries whenever possible. (#14402) Thanks @lml2468. - Feishu/Inbound media regression coverage: add explicit tests for message resource type mapping (`image` stays `image`, non-image maps to `file`) to prevent reintroducing unsupported Feishu `type=audio` fetches. (#16311, #8746) Thanks @Yaxuan42. - Feishu/API quota controls: add `typingIndicator` and `resolveSenderNames` config flags (top-level and per-account) so operators can disable typing reactions and sender-name lookup requests while keeping default behavior unchanged. (#10513) Thanks @BigUncle. +- TTS/Voice bubbles: use opus output and enable `audioAsVoice` routing for Feishu and WhatsApp (in addition to Telegram) so supported channels receive voice-bubble playback instead of file-style audio attachments. (#27366) Thanks @smthfoxy. - Security/Feishu webhook ingress: bound unauthenticated webhook rate-limit state with stale-window pruning and a hard key cap to prevent unbounded pre-auth memory growth from rotating source keys. (#26050) Thanks @bmendonca3. - Security/Compaction audit: remove the post-compaction audit injection message. (#28507) Thanks @fuller-stack-dev and @vincentkoc. - Telegram/Reply media context: include replied media files in inbound context when replying to media, defer reply-media downloads to debounce flush, gate reply-media fetch behind DM authorization, and preserve replied media when non-vision sticker fallback runs (including cached-sticker paths). (#28488) Thanks @obviyus. diff --git a/src/tts/tts.test.ts b/src/tts/tts.test.ts index 559c52bb7e3..d6bc88db4fa 100644 --- a/src/tts/tts.test.ts +++ b/src/tts/tts.test.ts @@ -154,7 +154,7 @@ describe("tts", () => { }); describe("resolveOutputFormat", () => { - it("selects opus for Telegram and mp3 for other channels", () => { + it("selects opus for voice-bubble channels (telegram/feishu/whatsapp) and mp3 for others", () => { const cases = [ { channel: "telegram", @@ -165,6 +165,24 @@ describe("tts", () => { voiceCompatible: true, }, }, + { + channel: "feishu", + expected: { + openai: "opus", + elevenlabs: "opus_48000_64", + extension: ".opus", + voiceCompatible: true, + }, + }, + { + channel: "whatsapp", + expected: { + openai: "opus", + elevenlabs: "opus_48000_64", + extension: ".opus", + voiceCompatible: true, + }, + }, { channel: "discord", expected: { diff --git a/src/tts/tts.ts b/src/tts/tts.ts index 3130cf396b8..c11cfaf1d87 100644 --- a/src/tts/tts.ts +++ b/src/tts/tts.ts @@ -480,8 +480,11 @@ export function setLastTtsAttempt(entry: TtsStatusEntry | undefined): void { lastTtsAttempt = entry; } +/** Channels that require opus audio and support voice-bubble playback */ +const VOICE_BUBBLE_CHANNELS = new Set(["telegram", "feishu", "whatsapp"]); + function resolveOutputFormat(channelId?: string | null) { - if (channelId === "telegram") { + if (channelId && VOICE_BUBBLE_CHANNELS.has(channelId)) { return TELEGRAM_OUTPUT; } return DEFAULT_OUTPUT; @@ -911,7 +914,8 @@ export async function maybeApplyTtsToPayload(params: { }; const channelId = resolveChannelId(params.channel); - const shouldVoice = channelId === "telegram" && result.voiceCompatible === true; + const shouldVoice = + channelId !== null && VOICE_BUBBLE_CHANNELS.has(channelId) && result.voiceCompatible === true; const finalPayload = { ...nextPayload, mediaUrl: result.audioPath,