From fd2f47a5e49664880ca70231991b3c73ea478c65 Mon Sep 17 00:00:00 2001 From: Eulices Lopez Date: Tue, 17 Mar 2026 12:25:51 -0400 Subject: [PATCH] fix: add placeholder transcript for silent voice notes --- src/media-understanding/apply.test.ts | 19 +++++++++++++--- src/media-understanding/apply.ts | 32 +++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 3 deletions(-) diff --git a/src/media-understanding/apply.test.ts b/src/media-understanding/apply.test.ts index bea9c6bc2bb..61649e7a3aa 100644 --- a/src/media-understanding/apply.test.ts +++ b/src/media-understanding/apply.test.ts @@ -443,8 +443,7 @@ describe("applyMediaUnderstanding", () => { expect(ctx.Body).toBe("[Audio]\nTranscript:\nwhatsapp transcript"); }); - it("skips URL-only audio when remote file is too small", async () => { - // Override the default mock to return a tiny buffer (below MIN_AUDIO_FILE_BYTES) + it("injects a placeholder transcript when URL-only audio is too small", async () => { mockedFetchRemoteMedia.mockResolvedValueOnce({ buffer: Buffer.alloc(100), contentType: "audio/ogg", @@ -483,7 +482,21 @@ describe("applyMediaUnderstanding", () => { }); expect(transcribeAudio).not.toHaveBeenCalled(); - expect(result.appliedAudio).toBe(false); + expect(result.appliedAudio).toBe(true); + expect(result.outputs).toEqual([ + expect.objectContaining({ + kind: "audio.transcription", + text: "[Voice note was empty or contained only silence — no speech detected]", + provider: "openclaw", + model: "synthetic-empty-audio", + }), + ]); + expect(ctx.Transcript).toBe( + "[Voice note was empty or contained only silence — no speech detected]", + ); + expect(ctx.Body).toBe( + "[Audio]\nTranscript:\n[Voice note was empty or contained only silence — no speech detected]", + ); }); it("skips audio transcription when attachment exceeds maxBytes", async () => { diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts index 7721dae16b0..65a02930661 100644 --- a/src/media-understanding/apply.ts +++ b/src/media-understanding/apply.ts @@ -43,6 +43,8 @@ export type ApplyMediaUnderstandingResult = { }; const CAPABILITY_ORDER: MediaUnderstandingCapability[] = ["image", "audio", "video"]; +const EMPTY_VOICE_NOTE_PLACEHOLDER = + "[Voice note was empty or contained only silence — no speech detected]"; const EXTRA_TEXT_MIMES = [ "application/xml", "text/xml", @@ -283,6 +285,32 @@ function resolveTextMimeFromName(name?: string): string | undefined { return TEXT_EXT_MIME.get(ext); } +function buildSyntheticSkippedAudioOutputs( + decisions: MediaUnderstandingDecision[], +): MediaUnderstandingOutput[] { + const audioDecision = decisions.find((decision) => decision.capability === "audio"); + if (!audioDecision) { + return []; + } + return audioDecision.attachments.flatMap((attachment) => { + const reason = attachment.attempts + .map((attempt) => attempt.reason?.trim()) + .find((value): value is string => Boolean(value)); + if (!reason?.startsWith("tooSmall")) { + return []; + } + return [ + { + kind: "audio.transcription" as const, + attachmentIndex: attachment.attachmentIndex, + text: EMPTY_VOICE_NOTE_PLACEHOLDER, + provider: "openclaw", + model: "synthetic-empty-audio", + }, + ]; + }); +} + function isBinaryMediaMime(mime?: string): boolean { if (!mime) { return false; @@ -495,6 +523,10 @@ export async function applyMediaUnderstanding(params: { decisions.push(entry.decision); } + if (!outputs.some((output) => output.kind === "audio.transcription")) { + outputs.push(...buildSyntheticSkippedAudioOutputs(decisions)); + } + if (decisions.length > 0) { ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions]; }