From 5350f5b03596e1ee7b7407ea4ac7630b8272909b Mon Sep 17 00:00:00 2001
From: smthfoxy <ning.hu@gmail.com>
Date: Sat, 28 Feb 2026 13:41:22 +0800
Subject: [PATCH] fix(tts): use opus format and enable voice bubbles for feishu
 and whatsapp (#27366)

* fix(tts): use opus format and enable voice bubbles for feishu and whatsapp

Previously only Telegram received opus output and had `shouldVoice=true`.
Feishu and WhatsApp also support voice-bubble playback and require opus audio,
but were falling back to mp3 with `audioAsVoice=false`.

- Extract VOICE_BUBBLE_CHANNELS set (telegram, feishu, whatsapp)
- resolveOutputFormat: return TELEGRAM_OUTPUT (opus) for all voice-bubble channels
- shouldVoice: enable for all voice-bubble channels, not just telegram
- Update test to cover feishu and whatsapp cases

* Changelog: add TTS voice-bubble channel coverage note

---------

Co-authored-by: Ning Hu <ninghu@Nings-MacBook-Pro.local>
Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com>
---
 CHANGELOG.md        |  1 +
 src/tts/tts.test.ts | 20 +++++++++++++++++++-
 src/tts/tts.ts      |  8 ++++++--
 3 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5953d01fee1..a3be77ed840 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -39,6 +39,7 @@ Docs: https://docs.openclaw.ai
 - Feishu/Docx convert fallback chunking: recursively split oversized markdown chunks (including long no-heading sections) when `document.convert` hits content limits, while keeping fenced-code-aware split boundaries whenever possible. (#14402) Thanks @lml2468.
 - Feishu/Inbound media regression coverage: add explicit tests for message resource type mapping (`image` stays `image`, non-image maps to `file`) to prevent reintroducing unsupported Feishu `type=audio` fetches. (#16311, #8746) Thanks @Yaxuan42.
 - Feishu/API quota controls: add `typingIndicator` and `resolveSenderNames` config flags (top-level and per-account) so operators can disable typing reactions and sender-name lookup requests while keeping default behavior unchanged. (#10513) Thanks @BigUncle.
+- TTS/Voice bubbles: use opus output and enable `audioAsVoice` routing for Feishu and WhatsApp (in addition to Telegram) so supported channels receive voice-bubble playback instead of file-style audio attachments. (#27366) Thanks @smthfoxy.
 - Security/Feishu webhook ingress: bound unauthenticated webhook rate-limit state with stale-window pruning and a hard key cap to prevent unbounded pre-auth memory growth from rotating source keys. (#26050) Thanks @bmendonca3.
 - Security/Compaction audit: remove the post-compaction audit injection message. (#28507) Thanks @fuller-stack-dev and @vincentkoc.
 - Telegram/Reply media context: include replied media files in inbound context when replying to media, defer reply-media downloads to debounce flush, gate reply-media fetch behind DM authorization, and preserve replied media when non-vision sticker fallback runs (including cached-sticker paths). (#28488) Thanks @obviyus.
diff --git a/src/tts/tts.test.ts b/src/tts/tts.test.ts
index 559c52bb7e3..d6bc88db4fa 100644
--- a/src/tts/tts.test.ts
+++ b/src/tts/tts.test.ts
@@ -154,7 +154,7 @@ describe("tts", () => {
   });
 
   describe("resolveOutputFormat", () => {
-    it("selects opus for Telegram and mp3 for other channels", () => {
+    it("selects opus for voice-bubble channels (telegram/feishu/whatsapp) and mp3 for others", () => {
       const cases = [
         {
           channel: "telegram",
@@ -165,6 +165,24 @@ describe("tts", () => {
             voiceCompatible: true,
           },
         },
+        {
+          channel: "feishu",
+          expected: {
+            openai: "opus",
+            elevenlabs: "opus_48000_64",
+            extension: ".opus",
+            voiceCompatible: true,
+          },
+        },
+        {
+          channel: "whatsapp",
+          expected: {
+            openai: "opus",
+            elevenlabs: "opus_48000_64",
+            extension: ".opus",
+            voiceCompatible: true,
+          },
+        },
         {
           channel: "discord",
           expected: {
diff --git a/src/tts/tts.ts b/src/tts/tts.ts
index 3130cf396b8..c11cfaf1d87 100644
--- a/src/tts/tts.ts
+++ b/src/tts/tts.ts
@@ -480,8 +480,11 @@ export function setLastTtsAttempt(entry: TtsStatusEntry | undefined): void {
   lastTtsAttempt = entry;
 }
 
+/** Channels that require opus audio and support voice-bubble playback */
+const VOICE_BUBBLE_CHANNELS = new Set(["telegram", "feishu", "whatsapp"]);
+
 function resolveOutputFormat(channelId?: string | null) {
-  if (channelId === "telegram") {
+  if (channelId && VOICE_BUBBLE_CHANNELS.has(channelId)) {
     return TELEGRAM_OUTPUT;
   }
   return DEFAULT_OUTPUT;
@@ -911,7 +914,8 @@ export async function maybeApplyTtsToPayload(params: {
     };
 
     const channelId = resolveChannelId(params.channel);
-    const shouldVoice = channelId === "telegram" && result.voiceCompatible === true;
+    const shouldVoice =
+      channelId !== null && VOICE_BUBBLE_CHANNELS.has(channelId) && result.voiceCompatible === true;
     const finalPayload = {
       ...nextPayload,
       mediaUrl: result.audioPath,