From fd2f47a5e49664880ca70231991b3c73ea478c65 Mon Sep 17 00:00:00 2001 From: Eulices Lopez Date: Tue, 17 Mar 2026 12:25:51 -0400 Subject: [PATCH 1/3] fix: add placeholder transcript for silent voice notes --- src/media-understanding/apply.test.ts | 19 +++++++++++++--- src/media-understanding/apply.ts | 32 +++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 3 deletions(-) diff --git a/src/media-understanding/apply.test.ts b/src/media-understanding/apply.test.ts index bea9c6bc2bb..61649e7a3aa 100644 --- a/src/media-understanding/apply.test.ts +++ b/src/media-understanding/apply.test.ts @@ -443,8 +443,7 @@ describe("applyMediaUnderstanding", () => { expect(ctx.Body).toBe("[Audio]\nTranscript:\nwhatsapp transcript"); }); - it("skips URL-only audio when remote file is too small", async () => { - // Override the default mock to return a tiny buffer (below MIN_AUDIO_FILE_BYTES) + it("injects a placeholder transcript when URL-only audio is too small", async () => { mockedFetchRemoteMedia.mockResolvedValueOnce({ buffer: Buffer.alloc(100), contentType: "audio/ogg", @@ -483,7 +482,21 @@ describe("applyMediaUnderstanding", () => { }); expect(transcribeAudio).not.toHaveBeenCalled(); - expect(result.appliedAudio).toBe(false); + expect(result.appliedAudio).toBe(true); + expect(result.outputs).toEqual([ + expect.objectContaining({ + kind: "audio.transcription", + text: "[Voice note was empty or contained only silence — no speech detected]", + provider: "openclaw", + model: "synthetic-empty-audio", + }), + ]); + expect(ctx.Transcript).toBe( + "[Voice note was empty or contained only silence — no speech detected]", + ); + expect(ctx.Body).toBe( + "[Audio]\nTranscript:\n[Voice note was empty or contained only silence — no speech detected]", + ); }); it("skips audio transcription when attachment exceeds maxBytes", async () => { diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts index 7721dae16b0..65a02930661 100644 --- a/src/media-understanding/apply.ts +++ b/src/media-understanding/apply.ts @@ -43,6 +43,8 @@ export type ApplyMediaUnderstandingResult = { }; const CAPABILITY_ORDER: MediaUnderstandingCapability[] = ["image", "audio", "video"]; +const EMPTY_VOICE_NOTE_PLACEHOLDER = + "[Voice note was empty or contained only silence — no speech detected]"; const EXTRA_TEXT_MIMES = [ "application/xml", "text/xml", @@ -283,6 +285,32 @@ function resolveTextMimeFromName(name?: string): string | undefined { return TEXT_EXT_MIME.get(ext); } +function buildSyntheticSkippedAudioOutputs( + decisions: MediaUnderstandingDecision[], +): MediaUnderstandingOutput[] { + const audioDecision = decisions.find((decision) => decision.capability === "audio"); + if (!audioDecision) { + return []; + } + return audioDecision.attachments.flatMap((attachment) => { + const reason = attachment.attempts + .map((attempt) => attempt.reason?.trim()) + .find((value): value is string => Boolean(value)); + if (!reason?.startsWith("tooSmall")) { + return []; + } + return [ + { + kind: "audio.transcription" as const, + attachmentIndex: attachment.attachmentIndex, + text: EMPTY_VOICE_NOTE_PLACEHOLDER, + provider: "openclaw", + model: "synthetic-empty-audio", + }, + ]; + }); +} + function isBinaryMediaMime(mime?: string): boolean { if (!mime) { return false; @@ -495,6 +523,10 @@ export async function applyMediaUnderstanding(params: { decisions.push(entry.decision); } + if (!outputs.some((output) => output.kind === "audio.transcription")) { + outputs.push(...buildSyntheticSkippedAudioOutputs(decisions)); + } + if (decisions.length > 0) { ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions]; } From bac89c2ec8c5daa2aa0a1aaa941d14ab6a1cbcef Mon Sep 17 00:00:00 2001 From: Eulices Lopez Date: Tue, 17 Mar 2026 12:43:19 -0400 Subject: [PATCH 2/3] fix: handle placeholder transcripts per skipped attachment --- src/media-understanding/apply.test.ts | 45 +++++++++++++++++++++++++++ src/media-understanding/apply.ts | 20 +++++++----- 2 files changed, 58 insertions(+), 7 deletions(-) diff --git a/src/media-understanding/apply.test.ts b/src/media-understanding/apply.test.ts index 61649e7a3aa..0152e1d0a70 100644 --- a/src/media-understanding/apply.test.ts +++ b/src/media-understanding/apply.test.ts @@ -499,6 +499,51 @@ describe("applyMediaUnderstanding", () => { ); }); + it("injects a placeholder transcript when local-path audio is too small", async () => { + const ctx = await createAudioCtx({ + fileName: "tiny.ogg", + mediaType: "audio/ogg", + content: Buffer.alloc(100), + }); + const transcribeAudio = vi.fn(async () => ({ text: "should-not-run" })); + const cfg: OpenClawConfig = { + tools: { + media: { + audio: { + enabled: true, + maxBytes: 1024 * 1024, + models: [{ provider: "groq" }], + }, + }, + }, + }; + + const result = await applyMediaUnderstanding({ + ctx, + cfg, + providers: { + groq: { id: "groq", transcribeAudio }, + }, + }); + + expect(transcribeAudio).not.toHaveBeenCalled(); + expect(result.appliedAudio).toBe(true); + expect(result.outputs).toEqual([ + expect.objectContaining({ + kind: "audio.transcription", + text: "[Voice note was empty or contained only silence — no speech detected]", + provider: "openclaw", + model: "synthetic-empty-audio", + }), + ]); + expect(ctx.Transcript).toBe( + "[Voice note was empty or contained only silence — no speech detected]", + ); + expect(ctx.Body).toBe( + "[Audio]\nTranscript:\n[Voice note was empty or contained only silence — no speech detected]", + ); + }); + it("skips audio transcription when attachment exceeds maxBytes", async () => { const ctx = await createAudioCtx({ fileName: "large.wav", diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts index 65a02930661..1f97eaba5d7 100644 --- a/src/media-understanding/apply.ts +++ b/src/media-understanding/apply.ts @@ -293,10 +293,10 @@ function buildSyntheticSkippedAudioOutputs( return []; } return audioDecision.attachments.flatMap((attachment) => { - const reason = attachment.attempts - .map((attempt) => attempt.reason?.trim()) - .find((value): value is string => Boolean(value)); - if (!reason?.startsWith("tooSmall")) { + const hasTooSmallAttempt = attachment.attempts.some((attempt) => + attempt.reason?.trim().startsWith("tooSmall"), + ); + if (!hasTooSmallAttempt) { return []; } return [ @@ -523,9 +523,15 @@ export async function applyMediaUnderstanding(params: { decisions.push(entry.decision); } - if (!outputs.some((output) => output.kind === "audio.transcription")) { - outputs.push(...buildSyntheticSkippedAudioOutputs(decisions)); - } + const audioOutputAttachmentIndexes = new Set( + outputs + .filter((output) => output.kind === "audio.transcription") + .map((output) => output.attachmentIndex), + ); + const syntheticSkippedAudioOutputs = buildSyntheticSkippedAudioOutputs(decisions).filter( + (output) => !audioOutputAttachmentIndexes.has(output.attachmentIndex), + ); + outputs.push(...syntheticSkippedAudioOutputs); if (decisions.length > 0) { ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions]; From d6648e77bc6b4ddd731c811500d3dfa227a1bb67 Mon Sep 17 00:00:00 2001 From: Eulices Lopez Date: Wed, 18 Mar 2026 01:52:58 -0400 Subject: [PATCH 3/3] fix: preserve synthetic transcript attachment order --- src/media-understanding/apply.ts | 1 + src/media-understanding/format.test.ts | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts index 1f97eaba5d7..d18bdbb282b 100644 --- a/src/media-understanding/apply.ts +++ b/src/media-understanding/apply.ts @@ -532,6 +532,7 @@ export async function applyMediaUnderstanding(params: { (output) => !audioOutputAttachmentIndexes.has(output.attachmentIndex), ); outputs.push(...syntheticSkippedAudioOutputs); + outputs.sort((left, right) => left.attachmentIndex - right.attachmentIndex); if (decisions.length > 0) { ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions]; diff --git a/src/media-understanding/format.test.ts b/src/media-understanding/format.test.ts index 172ecadf985..57ea5ebf83b 100644 --- a/src/media-understanding/format.test.ts +++ b/src/media-understanding/format.test.ts @@ -88,4 +88,29 @@ describe("formatMediaUnderstandingBody", () => { }); expect(body).toBe("[Image]\nDescription:\na cat"); }); + + it("labels audio transcripts by their attachment order", () => { + const body = formatMediaUnderstandingBody({ + outputs: [ + { + kind: "audio.transcription", + attachmentIndex: 0, + text: "first clip was silent", + provider: "openclaw", + }, + { + kind: "audio.transcription", + attachmentIndex: 1, + text: "second clip has speech", + provider: "groq", + }, + ], + }); + expect(body).toBe( + [ + "[Audio 1/2]\nTranscript:\nfirst clip was silent", + "[Audio 2/2]\nTranscript:\nsecond clip has speech", + ].join("\n\n"), + ); + }); });