From bac89c2ec8c5daa2aa0a1aaa941d14ab6a1cbcef Mon Sep 17 00:00:00 2001
From: Eulices Lopez
Date: Tue, 17 Mar 2026 12:43:19 -0400
Subject: [PATCH] fix: handle placeholder transcripts per skipped attachment

---
 src/media-understanding/apply.test.ts | 45 +++++++++++++++++++++++++++
 src/media-understanding/apply.ts      | 20 +++++++-----
 2 files changed, 58 insertions(+), 7 deletions(-)

diff --git a/src/media-understanding/apply.test.ts b/src/media-understanding/apply.test.ts
index 61649e7a3aa..0152e1d0a70 100644
--- a/src/media-understanding/apply.test.ts
+++ b/src/media-understanding/apply.test.ts
@@ -499,6 +499,51 @@ describe("applyMediaUnderstanding", () => {
     );
   });
 
+  it("injects a placeholder transcript when local-path audio is too small", async () => {
+    const ctx = await createAudioCtx({
+      fileName: "tiny.ogg",
+      mediaType: "audio/ogg",
+      content: Buffer.alloc(100),
+    });
+    const transcribeAudio = vi.fn(async () => ({ text: "should-not-run" }));
+    const cfg: OpenClawConfig = {
+      tools: {
+        media: {
+          audio: {
+            enabled: true,
+            maxBytes: 1024 * 1024,
+            models: [{ provider: "groq" }],
+          },
+        },
+      },
+    };
+
+    const result = await applyMediaUnderstanding({
+      ctx,
+      cfg,
+      providers: {
+        groq: { id: "groq", transcribeAudio },
+      },
+    });
+
+    expect(transcribeAudio).not.toHaveBeenCalled();
+    expect(result.appliedAudio).toBe(true);
+    expect(result.outputs).toEqual([
+      expect.objectContaining({
+        kind: "audio.transcription",
+        text: "[Voice note was empty or contained only silence — no speech detected]",
+        provider: "openclaw",
+        model: "synthetic-empty-audio",
+      }),
+    ]);
+    expect(ctx.Transcript).toBe(
+      "[Voice note was empty or contained only silence — no speech detected]",
+    );
+    expect(ctx.Body).toBe(
+      "[Audio]\nTranscript:\n[Voice note was empty or contained only silence — no speech detected]",
+    );
+  });
+
   it("skips audio transcription when attachment exceeds maxBytes", async () => {
     const ctx = await createAudioCtx({
       fileName: "large.wav",
diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts
index 65a02930661..1f97eaba5d7 100644
--- a/src/media-understanding/apply.ts
+++ b/src/media-understanding/apply.ts
@@ -293,10 +293,10 @@ function buildSyntheticSkippedAudioOutputs(
     return [];
   }
   return audioDecision.attachments.flatMap((attachment) => {
-    const reason = attachment.attempts
-      .map((attempt) => attempt.reason?.trim())
-      .find((value): value is string => Boolean(value));
-    if (!reason?.startsWith("tooSmall")) {
+    const hasTooSmallAttempt = attachment.attempts.some((attempt) =>
+      attempt.reason?.trim().startsWith("tooSmall"),
+    );
+    if (!hasTooSmallAttempt) {
       return [];
     }
     return [
@@ -523,9 +523,15 @@ export async function applyMediaUnderstanding(params: {
     decisions.push(entry.decision);
   }
 
-  if (!outputs.some((output) => output.kind === "audio.transcription")) {
-    outputs.push(...buildSyntheticSkippedAudioOutputs(decisions));
-  }
+  const audioOutputAttachmentIndexes = new Set(
+    outputs
+      .filter((output) => output.kind === "audio.transcription")
+      .map((output) => output.attachmentIndex),
+  );
+  const syntheticSkippedAudioOutputs = buildSyntheticSkippedAudioOutputs(decisions).filter(
+    (output) => !audioOutputAttachmentIndexes.has(output.attachmentIndex),
+  );
+  outputs.push(...syntheticSkippedAudioOutputs);
 
   if (decisions.length > 0) {
     ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions];
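
Reviewer note: the second apply.ts hunk replaces an all-or-nothing guard (skip every placeholder as soon as any transcription exists) with per-attachment dedup keyed on attachmentIndex. Below is a minimal standalone sketch of that behavior; the MediaOutput shape and the helper name withSyntheticPlaceholders are hypothetical, inferred from the diff rather than taken from the real OpenClaw types.

// Sketch only, not part of the patch. Models the dedup step: a synthetic
// placeholder survives only when its attachment produced no real transcript.
type MediaOutput = {
  kind: string;
  attachmentIndex: number;
  text: string;
};

function withSyntheticPlaceholders(
  outputs: MediaOutput[],
  synthetic: MediaOutput[],
): MediaOutput[] {
  // Attachment indexes that already have a real transcription output.
  const transcribed = new Set(
    outputs
      .filter((output) => output.kind === "audio.transcription")
      .map((output) => output.attachmentIndex),
  );
  // Keep a placeholder only for attachments with no real transcript.
  const missing = synthetic.filter(
    (output) => !transcribed.has(output.attachmentIndex),
  );
  return [...outputs, ...missing];
}

// Example: attachment 0 was transcribed, attachment 1 was skipped as tooSmall.
// Under the old guard, attachment 1 got no placeholder because attachment 0
// produced a transcription; with this dedup it gets its own placeholder.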