diff --git a/src/media-understanding/apply.test.ts b/src/media-understanding/apply.test.ts index bea9c6bc2bb..0152e1d0a70 100644 --- a/src/media-understanding/apply.test.ts +++ b/src/media-understanding/apply.test.ts @@ -443,8 +443,7 @@ describe("applyMediaUnderstanding", () => { expect(ctx.Body).toBe("[Audio]\nTranscript:\nwhatsapp transcript"); }); - it("skips URL-only audio when remote file is too small", async () => { - // Override the default mock to return a tiny buffer (below MIN_AUDIO_FILE_BYTES) + it("injects a placeholder transcript when URL-only audio is too small", async () => { mockedFetchRemoteMedia.mockResolvedValueOnce({ buffer: Buffer.alloc(100), contentType: "audio/ogg", @@ -483,7 +482,66 @@ describe("applyMediaUnderstanding", () => { }); expect(transcribeAudio).not.toHaveBeenCalled(); - expect(result.appliedAudio).toBe(false); + expect(result.appliedAudio).toBe(true); + expect(result.outputs).toEqual([ + expect.objectContaining({ + kind: "audio.transcription", + text: "[Voice note was empty or contained only silence — no speech detected]", + provider: "openclaw", + model: "synthetic-empty-audio", + }), + ]); + expect(ctx.Transcript).toBe( + "[Voice note was empty or contained only silence — no speech detected]", + ); + expect(ctx.Body).toBe( + "[Audio]\nTranscript:\n[Voice note was empty or contained only silence — no speech detected]", + ); + }); + + it("injects a placeholder transcript when local-path audio is too small", async () => { + const ctx = await createAudioCtx({ + fileName: "tiny.ogg", + mediaType: "audio/ogg", + content: Buffer.alloc(100), + }); + const transcribeAudio = vi.fn(async () => ({ text: "should-not-run" })); + const cfg: OpenClawConfig = { + tools: { + media: { + audio: { + enabled: true, + maxBytes: 1024 * 1024, + models: [{ provider: "groq" }], + }, + }, + }, + }; + + const result = await applyMediaUnderstanding({ + ctx, + cfg, + providers: { + groq: { id: "groq", transcribeAudio }, + }, + }); + + expect(transcribeAudio).not.toHaveBeenCalled(); + expect(result.appliedAudio).toBe(true); + expect(result.outputs).toEqual([ + expect.objectContaining({ + kind: "audio.transcription", + text: "[Voice note was empty or contained only silence — no speech detected]", + provider: "openclaw", + model: "synthetic-empty-audio", + }), + ]); + expect(ctx.Transcript).toBe( + "[Voice note was empty or contained only silence — no speech detected]", + ); + expect(ctx.Body).toBe( + "[Audio]\nTranscript:\n[Voice note was empty or contained only silence — no speech detected]", + ); }); it("skips audio transcription when attachment exceeds maxBytes", async () => { diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts index 7721dae16b0..d18bdbb282b 100644 --- a/src/media-understanding/apply.ts +++ b/src/media-understanding/apply.ts @@ -43,6 +43,8 @@ export type ApplyMediaUnderstandingResult = { }; const CAPABILITY_ORDER: MediaUnderstandingCapability[] = ["image", "audio", "video"]; +const EMPTY_VOICE_NOTE_PLACEHOLDER = + "[Voice note was empty or contained only silence — no speech detected]"; const EXTRA_TEXT_MIMES = [ "application/xml", "text/xml", @@ -283,6 +285,32 @@ function resolveTextMimeFromName(name?: string): string | undefined { return TEXT_EXT_MIME.get(ext); } +function buildSyntheticSkippedAudioOutputs( + decisions: MediaUnderstandingDecision[], +): MediaUnderstandingOutput[] { + const audioDecision = decisions.find((decision) => decision.capability === "audio"); + if (!audioDecision) { + return []; + } + return audioDecision.attachments.flatMap((attachment) => { + const hasTooSmallAttempt = attachment.attempts.some((attempt) => + attempt.reason?.trim().startsWith("tooSmall"), + ); + if (!hasTooSmallAttempt) { + return []; + } + return [ + { + kind: "audio.transcription" as const, + attachmentIndex: attachment.attachmentIndex, + text: EMPTY_VOICE_NOTE_PLACEHOLDER, + provider: "openclaw", + model: "synthetic-empty-audio", + }, + ]; + }); +} + function isBinaryMediaMime(mime?: string): boolean { if (!mime) { return false; @@ -495,6 +523,17 @@ export async function applyMediaUnderstanding(params: { decisions.push(entry.decision); } + const audioOutputAttachmentIndexes = new Set( + outputs + .filter((output) => output.kind === "audio.transcription") + .map((output) => output.attachmentIndex), + ); + const syntheticSkippedAudioOutputs = buildSyntheticSkippedAudioOutputs(decisions).filter( + (output) => !audioOutputAttachmentIndexes.has(output.attachmentIndex), + ); + outputs.push(...syntheticSkippedAudioOutputs); + outputs.sort((left, right) => left.attachmentIndex - right.attachmentIndex); + if (decisions.length > 0) { ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions]; } diff --git a/src/media-understanding/format.test.ts b/src/media-understanding/format.test.ts index 172ecadf985..57ea5ebf83b 100644 --- a/src/media-understanding/format.test.ts +++ b/src/media-understanding/format.test.ts @@ -88,4 +88,29 @@ describe("formatMediaUnderstandingBody", () => { }); expect(body).toBe("[Image]\nDescription:\na cat"); }); + + it("labels audio transcripts by their attachment order", () => { + const body = formatMediaUnderstandingBody({ + outputs: [ + { + kind: "audio.transcription", + attachmentIndex: 0, + text: "first clip was silent", + provider: "openclaw", + }, + { + kind: "audio.transcription", + attachmentIndex: 1, + text: "second clip has speech", + provider: "groq", + }, + ], + }); + expect(body).toBe( + [ + "[Audio 1/2]\nTranscript:\nfirst clip was silent", + "[Audio 2/2]\nTranscript:\nsecond clip has speech", + ].join("\n\n"), + ); + }); });