Merge d6648e77bc6b4ddd731c811500d3dfa227a1bb67 into 9fb78453e088cd7b553d7779faa0de5c83708e70

This commit is contained in:
Eulices 2026-03-20 22:02:18 -07:00 committed by GitHub
commit 6e9fd851fd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 125 additions and 3 deletions

View File

@ -443,8 +443,7 @@ describe("applyMediaUnderstanding", () => {
expect(ctx.Body).toBe("[Audio]\nTranscript:\nwhatsapp transcript");
});
it("skips URL-only audio when remote file is too small", async () => {
// Override the default mock to return a tiny buffer (below MIN_AUDIO_FILE_BYTES)
it("injects a placeholder transcript when URL-only audio is too small", async () => {
mockedFetchRemoteMedia.mockResolvedValueOnce({
buffer: Buffer.alloc(100),
contentType: "audio/ogg",
@ -483,7 +482,66 @@ describe("applyMediaUnderstanding", () => {
});
expect(transcribeAudio).not.toHaveBeenCalled();
expect(result.appliedAudio).toBe(false);
expect(result.appliedAudio).toBe(true);
expect(result.outputs).toEqual([
expect.objectContaining({
kind: "audio.transcription",
text: "[Voice note was empty or contained only silence — no speech detected]",
provider: "openclaw",
model: "synthetic-empty-audio",
}),
]);
expect(ctx.Transcript).toBe(
"[Voice note was empty or contained only silence — no speech detected]",
);
expect(ctx.Body).toBe(
"[Audio]\nTranscript:\n[Voice note was empty or contained only silence — no speech detected]",
);
});
it("injects a placeholder transcript when local-path audio is too small", async () => {
  // 100 bytes of attachment content is below the minimum audio size, so the
  // provider transcriber must never run; a synthetic transcript is injected.
  const ctx = await createAudioCtx({
    fileName: "tiny.ogg",
    mediaType: "audio/ogg",
    content: Buffer.alloc(100),
  });
  const transcribeAudio = vi.fn(async () => ({ text: "should-not-run" }));
  const cfg: OpenClawConfig = {
    tools: {
      media: {
        audio: {
          enabled: true,
          maxBytes: 1024 * 1024,
          models: [{ provider: "groq" }],
        },
      },
    },
  };

  const outcome = await applyMediaUnderstanding({
    ctx,
    cfg,
    providers: { groq: { id: "groq", transcribeAudio } },
  });

  const placeholder =
    "[Voice note was empty or contained only silence — no speech detected]";
  expect(transcribeAudio).not.toHaveBeenCalled();
  expect(outcome.appliedAudio).toBe(true);
  expect(outcome.outputs).toEqual([
    expect.objectContaining({
      kind: "audio.transcription",
      text: placeholder,
      provider: "openclaw",
      model: "synthetic-empty-audio",
    }),
  ]);
  expect(ctx.Transcript).toBe(placeholder);
  expect(ctx.Body).toBe(`[Audio]\nTranscript:\n${placeholder}`);
});
it("skips audio transcription when attachment exceeds maxBytes", async () => {

View File

@ -43,6 +43,8 @@ export type ApplyMediaUnderstandingResult = {
};
// NOTE(review): presumably the order in which media capabilities are applied —
// confirm at the use site (not visible in this chunk).
const CAPABILITY_ORDER: MediaUnderstandingCapability[] = ["image", "audio", "video"];
// Synthetic transcript text injected for audio attachments whose transcription
// was skipped because the file was too small (see buildSyntheticSkippedAudioOutputs).
const EMPTY_VOICE_NOTE_PLACEHOLDER =
  "[Voice note was empty or contained only silence — no speech detected]";
const EXTRA_TEXT_MIMES = [
"application/xml",
"text/xml",
@ -283,6 +285,32 @@ function resolveTextMimeFromName(name?: string): string | undefined {
return TEXT_EXT_MIME.get(ext);
}
/**
 * Produces placeholder "empty voice note" transcription outputs for audio
 * attachments that recorded at least one attempt skipped with a reason
 * starting with "tooSmall".
 *
 * @param decisions - All media-understanding decisions; only the first one
 *   with capability "audio" is inspected.
 * @returns One synthetic output per qualifying attachment, or an empty array
 *   when there is no audio decision or nothing was skipped for size.
 */
function buildSyntheticSkippedAudioOutputs(
  decisions: MediaUnderstandingDecision[],
): MediaUnderstandingOutput[] {
  const audio = decisions.find((d) => d.capability === "audio");
  if (!audio) {
    return [];
  }
  const synthetic: MediaUnderstandingOutput[] = [];
  for (const attachment of audio.attachments) {
    const skippedForSize = attachment.attempts.some(
      (attempt) => attempt.reason?.trim().startsWith("tooSmall") ?? false,
    );
    if (!skippedForSize) {
      continue;
    }
    synthetic.push({
      kind: "audio.transcription" as const,
      attachmentIndex: attachment.attachmentIndex,
      text: EMPTY_VOICE_NOTE_PLACEHOLDER,
      provider: "openclaw",
      model: "synthetic-empty-audio",
    });
  }
  return synthetic;
}
function isBinaryMediaMime(mime?: string): boolean {
if (!mime) {
return false;
@ -495,6 +523,17 @@ export async function applyMediaUnderstanding(params: {
decisions.push(entry.decision);
}
const audioOutputAttachmentIndexes = new Set(
outputs
.filter((output) => output.kind === "audio.transcription")
.map((output) => output.attachmentIndex),
);
const syntheticSkippedAudioOutputs = buildSyntheticSkippedAudioOutputs(decisions).filter(
(output) => !audioOutputAttachmentIndexes.has(output.attachmentIndex),
);
outputs.push(...syntheticSkippedAudioOutputs);
outputs.sort((left, right) => left.attachmentIndex - right.attachmentIndex);
if (decisions.length > 0) {
ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions];
}

View File

@ -88,4 +88,29 @@ describe("formatMediaUnderstandingBody", () => {
});
expect(body).toBe("[Image]\nDescription:\na cat");
});
it("labels audio transcripts by their attachment order", () => {
const body = formatMediaUnderstandingBody({
outputs: [
{
kind: "audio.transcription",
attachmentIndex: 0,
text: "first clip was silent",
provider: "openclaw",
},
{
kind: "audio.transcription",
attachmentIndex: 1,
text: "second clip has speech",
provider: "groq",
},
],
});
expect(body).toBe(
[
"[Audio 1/2]\nTranscript:\nfirst clip was silent",
"[Audio 2/2]\nTranscript:\nsecond clip has speech",
].join("\n\n"),
);
});
});