From c2a0cf0c28a2596b995ca94952b6f278b0767dac Mon Sep 17 00:00:00 2001 From: zerone0x Date: Mon, 16 Feb 2026 21:09:02 +0800 Subject: [PATCH] fix(tts): update tool description to prevent duplicate audio delivery (#18046) Merged via /review-pr -> /prepare-pr -> /merge-pr. Prepared head SHA: 70c096abaa37fa506c4d86526e696828bf1acd86 Co-authored-by: zerone0x <39543393+zerone0x@users.noreply.github.com> Co-authored-by: sebslight <19554889+sebslight@users.noreply.github.com> Reviewed-by: @sebslight --- CHANGELOG.md | 1 + src/agents/tools/tts-tool.ts | 2 +- .../reply/dispatch-from-config.test.ts | 31 +++++++++-- src/auto-reply/reply/dispatch-from-config.ts | 53 ++++++++++++------- 4 files changed, 63 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 37391c7c3a4..e49d930be46 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -59,6 +59,7 @@ Docs: https://docs.openclaw.ai - Web UI/Agents: hide `BOOTSTRAP.md` in the Agents Files list after onboarding is completed, avoiding confusing missing-file warnings for completed workspaces. (#17491) Thanks @gumadeiras. - Memory/QMD: scope managed collection names per agent and precreate glob-backed collection directories before registration, preventing cross-agent collection clobbering and startup ENOENT failures in fresh workspaces. (#17194) Thanks @jonathanadams96. - Auto-reply/WhatsApp/TUI/Web: when a final assistant message is `NO_REPLY` and a messaging tool send succeeded, mirror the delivered messaging-tool text into session-visible assistant output so TUI/Web no longer show `NO_REPLY` placeholders. (#7010) Thanks @Morrowind-Xie. +- Auto-reply/TTS: keep tool-result media delivery enabled in group chats and native command sessions (while still suppressing tool summary text) so `NO_REPLY` follow-ups do not drop successful TTS audio. (#17991) Thanks @zerone0x. - Cron: infer `payload.kind="agentTurn"` for model-only `cron.update` payload patches, so partial agent-turn updates do not fail validation when `kind` is omitted. (#15664) Thanks @rodrigouroz. - TUI: make searchable-select filtering and highlight rendering ANSI-aware so queries ignore hidden escape codes and no longer corrupt ANSI styling sequences during match highlighting. (#4519) Thanks @bee4come. - TUI/Windows: coalesce rapid single-line submit bursts in Git Bash into one multiline message as a fallback when bracketed paste is unavailable, preventing pasted multiline text from being split into multiple sends. (#4986) Thanks @adamkane. diff --git a/src/agents/tools/tts-tool.ts b/src/agents/tools/tts-tool.ts index 1add5054db6..9296c649698 100644 --- a/src/agents/tools/tts-tool.ts +++ b/src/agents/tools/tts-tool.ts @@ -21,7 +21,7 @@ export function createTtsTool(opts?: { label: "TTS", name: "tts", description: - "Convert text to speech and return a MEDIA: path. Use when the user requests audio or TTS is enabled. Copy the MEDIA line exactly.", + "Convert text to speech. Audio is delivered automatically from the tool result — reply with NO_REPLY after a successful call to avoid duplicate messages.", parameters: TtsToolSchema, execute: async (_toolCallId, args) => { const params = args as Record; diff --git a/src/auto-reply/reply/dispatch-from-config.test.ts b/src/auto-reply/reply/dispatch-from-config.test.ts index 4cc6657d2a2..09203ddba35 100644 --- a/src/auto-reply/reply/dispatch-from-config.test.ts +++ b/src/auto-reply/reply/dispatch-from-config.test.ts @@ -165,7 +165,7 @@ describe("dispatchReplyFromConfig", () => { expect(dispatcher.sendFinalReply).toHaveBeenCalledTimes(1); }); - it("does not provide onToolResult in group sessions", async () => { + it("suppresses group tool summaries but still forwards tool media", async () => { mocks.tryFastAbortFromMessage.mockResolvedValue({ handled: false, aborted: false, @@ -182,11 +182,23 @@ describe("dispatchReplyFromConfig", () => { opts: GetReplyOptions | undefined, _cfg: OpenClawConfig, ) => { - expect(opts?.onToolResult).toBeUndefined(); + expect(opts?.onToolResult).toBeDefined(); + await opts?.onToolResult?.({ text: "🔧 exec: ls" }); + await opts?.onToolResult?.({ + text: "NO_REPLY", + mediaUrls: ["https://example.com/tts-group.opus"], + }); return { text: "hi" } satisfies ReplyPayload; }; await dispatchReplyFromConfig({ ctx, cfg, dispatcher, replyResolver }); + + expect(dispatcher.sendToolResult).toHaveBeenCalledTimes(1); + const sent = (dispatcher.sendToolResult as ReturnType).mock.calls[0]?.[0] as + | ReplyPayload + | undefined; + expect(sent?.mediaUrls).toEqual(["https://example.com/tts-group.opus"]); + expect(sent?.text).toBeUndefined(); expect(dispatcher.sendFinalReply).toHaveBeenCalledTimes(1); }); @@ -219,7 +231,7 @@ describe("dispatchReplyFromConfig", () => { expect(dispatcher.sendFinalReply).toHaveBeenCalledTimes(1); }); - it("does not provide onToolResult for native slash commands", async () => { + it("suppresses native tool summaries but still forwards tool media", async () => { mocks.tryFastAbortFromMessage.mockResolvedValue({ handled: false, aborted: false, @@ -237,11 +249,22 @@ describe("dispatchReplyFromConfig", () => { opts: GetReplyOptions | undefined, _cfg: OpenClawConfig, ) => { - expect(opts?.onToolResult).toBeUndefined(); + expect(opts?.onToolResult).toBeDefined(); + await opts?.onToolResult?.({ text: "🔧 tools/sessions_send" }); + await opts?.onToolResult?.({ + mediaUrl: "https://example.com/tts-native.opus", + }); return { text: "hi" } satisfies ReplyPayload; }; await dispatchReplyFromConfig({ ctx, cfg, dispatcher, replyResolver }); + + expect(dispatcher.sendToolResult).toHaveBeenCalledTimes(1); + const sent = (dispatcher.sendToolResult as ReturnType).mock.calls[0]?.[0] as + | ReplyPayload + | undefined; + expect(sent?.mediaUrl).toBe("https://example.com/tts-native.opus"); + expect(sent?.text).toBeUndefined(); expect(dispatcher.sendFinalReply).toHaveBeenCalledTimes(1); }); diff --git a/src/auto-reply/reply/dispatch-from-config.ts b/src/auto-reply/reply/dispatch-from-config.ts index 45bd75040aa..b605a0ff2d7 100644 --- a/src/auto-reply/reply/dispatch-from-config.ts +++ b/src/auto-reply/reply/dispatch-from-config.ts @@ -293,30 +293,45 @@ export async function dispatchReplyFromConfig(params: { const shouldSendToolSummaries = ctx.ChatType !== "group" && ctx.CommandSource !== "native"; + const resolveToolDeliveryPayload = (payload: ReplyPayload): ReplyPayload | null => { + if (shouldSendToolSummaries) { + return payload; + } + // Group/native flows intentionally suppress tool summary text, but media-only + // tool results (for example TTS audio) must still be delivered. + const hasMedia = Boolean(payload.mediaUrl) || (payload.mediaUrls?.length ?? 0) > 0; + if (!hasMedia) { + return null; + } + return { ...payload, text: undefined }; + }; + const replyResult = await (params.replyResolver ?? getReplyFromConfig)( ctx, { ...params.replyOptions, - onToolResult: shouldSendToolSummaries - ? (payload: ReplyPayload) => { - const run = async () => { - const ttsPayload = await maybeApplyTtsToPayload({ - payload, - cfg, - channel: ttsChannel, - kind: "tool", - inboundAudio, - ttsAuto: sessionTtsAuto, - }); - if (shouldRouteToOriginating) { - await sendPayloadAsync(ttsPayload, undefined, false); - } else { - dispatcher.sendToolResult(ttsPayload); - } - }; - return run(); + onToolResult: (payload: ReplyPayload) => { + const run = async () => { + const ttsPayload = await maybeApplyTtsToPayload({ + payload, + cfg, + channel: ttsChannel, + kind: "tool", + inboundAudio, + ttsAuto: sessionTtsAuto, + }); + const deliveryPayload = resolveToolDeliveryPayload(ttsPayload); + if (!deliveryPayload) { + return; } - : undefined, + if (shouldRouteToOriginating) { + await sendPayloadAsync(deliveryPayload, undefined, false); + } else { + dispatcher.sendToolResult(deliveryPayload); + } + }; + return run(); + }, onBlockReply: (payload: ReplyPayload, context) => { const run = async () => { // Accumulate block text for TTS generation after streaming