Merge d6648e77bc6b4ddd731c811500d3dfa227a1bb67 into 9fb78453e088cd7b553d7779faa0de5c83708e70
This commit is contained in:
commit
6e9fd851fd
@ -443,8 +443,7 @@ describe("applyMediaUnderstanding", () => {
|
||||
expect(ctx.Body).toBe("[Audio]\nTranscript:\nwhatsapp transcript");
|
||||
});
|
||||
|
||||
it("skips URL-only audio when remote file is too small", async () => {
|
||||
// Override the default mock to return a tiny buffer (below MIN_AUDIO_FILE_BYTES)
|
||||
it("injects a placeholder transcript when URL-only audio is too small", async () => {
|
||||
mockedFetchRemoteMedia.mockResolvedValueOnce({
|
||||
buffer: Buffer.alloc(100),
|
||||
contentType: "audio/ogg",
|
||||
@ -483,7 +482,66 @@ describe("applyMediaUnderstanding", () => {
|
||||
});
|
||||
|
||||
expect(transcribeAudio).not.toHaveBeenCalled();
|
||||
expect(result.appliedAudio).toBe(false);
|
||||
expect(result.appliedAudio).toBe(true);
|
||||
expect(result.outputs).toEqual([
|
||||
expect.objectContaining({
|
||||
kind: "audio.transcription",
|
||||
text: "[Voice note was empty or contained only silence — no speech detected]",
|
||||
provider: "openclaw",
|
||||
model: "synthetic-empty-audio",
|
||||
}),
|
||||
]);
|
||||
expect(ctx.Transcript).toBe(
|
||||
"[Voice note was empty or contained only silence — no speech detected]",
|
||||
);
|
||||
expect(ctx.Body).toBe(
|
||||
"[Audio]\nTranscript:\n[Voice note was empty or contained only silence — no speech detected]",
|
||||
);
|
||||
});
|
||||
|
||||
it("injects a placeholder transcript when local-path audio is too small", async () => {
  // The placeholder text is asserted three times below: in the output entry,
  // on ctx.Transcript, and embedded in ctx.Body.
  const placeholder = "[Voice note was empty or contained only silence — no speech detected]";

  // The provider stub must never run; its return value would be a real transcript.
  const transcribeAudio = vi.fn(async () => ({ text: "should-not-run" }));
  const cfg: OpenClawConfig = {
    tools: {
      media: {
        audio: { enabled: true, maxBytes: 1024 * 1024, models: [{ provider: "groq" }] },
      },
    },
  };

  // A 100-byte local attachment (presumably below the minimum audio size — confirm
  // against the threshold used by the implementation).
  const ctx = await createAudioCtx({
    fileName: "tiny.ogg",
    mediaType: "audio/ogg",
    content: Buffer.alloc(100),
  });

  const result = await applyMediaUnderstanding({
    ctx,
    cfg,
    providers: { groq: { id: "groq", transcribeAudio } },
  });

  expect(transcribeAudio).not.toHaveBeenCalled();
  expect(result.appliedAudio).toBe(true);
  expect(result.outputs).toEqual([
    expect.objectContaining({
      kind: "audio.transcription",
      text: placeholder,
      provider: "openclaw",
      model: "synthetic-empty-audio",
    }),
  ]);
  expect(ctx.Transcript).toBe(placeholder);
  expect(ctx.Body).toBe(`[Audio]\nTranscript:\n${placeholder}`);
});
|
||||
|
||||
it("skips audio transcription when attachment exceeds maxBytes", async () => {
|
||||
|
||||
@ -43,6 +43,8 @@ export type ApplyMediaUnderstandingResult = {
|
||||
};
|
||||
|
||||
const CAPABILITY_ORDER: MediaUnderstandingCapability[] = ["image", "audio", "video"];
|
||||
const EMPTY_VOICE_NOTE_PLACEHOLDER =
|
||||
"[Voice note was empty or contained only silence — no speech detected]";
|
||||
const EXTRA_TEXT_MIMES = [
|
||||
"application/xml",
|
||||
"text/xml",
|
||||
@ -283,6 +285,32 @@ function resolveTextMimeFromName(name?: string): string | undefined {
|
||||
return TEXT_EXT_MIME.get(ext);
|
||||
}
|
||||
|
||||
/**
 * Builds placeholder transcription outputs for audio attachments that were
 * skipped as too small.
 *
 * Only the first decision whose capability is "audio" is inspected. An
 * attachment qualifies when at least one of its attempts carries a reason
 * that, once trimmed, starts with "tooSmall".
 *
 * @param decisions - Decisions to search for an audio entry.
 * @returns One synthetic "audio.transcription" output per qualifying
 *   attachment, in attachment order; an empty array when there is no audio
 *   decision or no qualifying attachment.
 */
function buildSyntheticSkippedAudioOutputs(
  decisions: MediaUnderstandingDecision[],
): MediaUnderstandingOutput[] {
  const audio = decisions.find(({ capability }) => capability === "audio");
  if (!audio) {
    return [];
  }

  // Attempts without a reason yield `undefined` from the optional chain and
  // therefore never match.
  const skippedAsTooSmall = audio.attachments.filter(({ attempts }) =>
    attempts.some(({ reason }) => reason?.trim().startsWith("tooSmall")),
  );

  return skippedAsTooSmall.map(({ attachmentIndex }) => ({
    kind: "audio.transcription" as const,
    attachmentIndex,
    text: EMPTY_VOICE_NOTE_PLACEHOLDER,
    provider: "openclaw",
    model: "synthetic-empty-audio",
  }));
}
|
||||
|
||||
function isBinaryMediaMime(mime?: string): boolean {
|
||||
if (!mime) {
|
||||
return false;
|
||||
@ -495,6 +523,17 @@ export async function applyMediaUnderstanding(params: {
|
||||
decisions.push(entry.decision);
|
||||
}
|
||||
|
||||
const audioOutputAttachmentIndexes = new Set(
|
||||
outputs
|
||||
.filter((output) => output.kind === "audio.transcription")
|
||||
.map((output) => output.attachmentIndex),
|
||||
);
|
||||
const syntheticSkippedAudioOutputs = buildSyntheticSkippedAudioOutputs(decisions).filter(
|
||||
(output) => !audioOutputAttachmentIndexes.has(output.attachmentIndex),
|
||||
);
|
||||
outputs.push(...syntheticSkippedAudioOutputs);
|
||||
outputs.sort((left, right) => left.attachmentIndex - right.attachmentIndex);
|
||||
|
||||
if (decisions.length > 0) {
|
||||
ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions];
|
||||
}
|
||||
|
||||
@ -88,4 +88,29 @@ describe("formatMediaUnderstandingBody", () => {
|
||||
});
|
||||
expect(body).toBe("[Image]\nDescription:\na cat");
|
||||
});
|
||||
|
||||
it("labels audio transcripts by their attachment order", () => {
  // Each transcript should get its own "[Audio i/n]" section, with sections
  // separated by a blank line.
  const expectedSections = [
    "[Audio 1/2]\nTranscript:\nfirst clip was silent",
    "[Audio 2/2]\nTranscript:\nsecond clip has speech",
  ];

  const body = formatMediaUnderstandingBody({
    outputs: [
      {
        kind: "audio.transcription",
        attachmentIndex: 0,
        text: "first clip was silent",
        provider: "openclaw",
      },
      {
        kind: "audio.transcription",
        attachmentIndex: 1,
        text: "second clip has speech",
        provider: "groq",
      },
    ],
  });

  expect(body).toBe(expectedSections.join("\n\n"));
});
|
||||
});
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user