fix: handle placeholder transcripts per skipped attachment

Eulices Lopez 2026-03-17 12:43:19 -04:00 committed by Eulices Lopez
parent fd2f47a5e4
commit bac89c2ec8
2 changed files with 58 additions and 7 deletions
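Previously, synthetic placeholder outputs were only injected when no audio transcription output existed at all, so a skipped attachment got no placeholder whenever another attachment transcribed successfully. The change below keys placeholders by attachment index instead. A minimal sketch of that de-duplication, with a simplified output shape assumed for illustration (the real types live in the media-understanding module):

// Sketch only: "AudioOutput" is a simplified stand-in for the module's
// real output type, reduced to the fields the de-dup logic needs.
interface AudioOutput {
  kind: "audio.transcription";
  attachmentIndex: number;
  text: string;
}

function mergeSyntheticPlaceholders(
  outputs: AudioOutput[],
  synthetic: AudioOutput[], // placeholders built from skipped-attachment decisions
): AudioOutput[] {
  // Indexes that already have a real transcription result.
  const covered = new Set(outputs.map((output) => output.attachmentIndex));
  // Inject a placeholder only for attachments with no output of their own,
  // rather than skipping all placeholders when any transcription succeeded.
  return [...outputs, ...synthetic.filter((s) => !covered.has(s.attachmentIndex))];
}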


@@ -499,6 +499,51 @@ describe("applyMediaUnderstanding", () => {
     );
   });
 
+  it("injects a placeholder transcript when local-path audio is too small", async () => {
+    const ctx = await createAudioCtx({
+      fileName: "tiny.ogg",
+      mediaType: "audio/ogg",
+      content: Buffer.alloc(100),
+    });
+    const transcribeAudio = vi.fn(async () => ({ text: "should-not-run" }));
+    const cfg: OpenClawConfig = {
+      tools: {
+        media: {
+          audio: {
+            enabled: true,
+            maxBytes: 1024 * 1024,
+            models: [{ provider: "groq" }],
+          },
+        },
+      },
+    };
+    const result = await applyMediaUnderstanding({
+      ctx,
+      cfg,
+      providers: {
+        groq: { id: "groq", transcribeAudio },
+      },
+    });
+    expect(transcribeAudio).not.toHaveBeenCalled();
+    expect(result.appliedAudio).toBe(true);
+    expect(result.outputs).toEqual([
+      expect.objectContaining({
+        kind: "audio.transcription",
+        text: "[Voice note was empty or contained only silence — no speech detected]",
+        provider: "openclaw",
+        model: "synthetic-empty-audio",
+      }),
+    ]);
+    expect(ctx.Transcript).toBe(
+      "[Voice note was empty or contained only silence — no speech detected]",
+    );
+    expect(ctx.Body).toBe(
+      "[Audio]\nTranscript:\n[Voice note was empty or contained only silence — no speech detected]",
+    );
+  });
+
   it("skips audio transcription when attachment exceeds maxBytes", async () => {
     const ctx = await createAudioCtx({
       fileName: "large.wav",


@@ -293,10 +293,10 @@ function buildSyntheticSkippedAudioOutputs(
     return [];
   }
   return audioDecision.attachments.flatMap((attachment) => {
-    const reason = attachment.attempts
-      .map((attempt) => attempt.reason?.trim())
-      .find((value): value is string => Boolean(value));
-    if (!reason?.startsWith("tooSmall")) {
+    const hasTooSmallAttempt = attachment.attempts.some((attempt) =>
+      attempt.reason?.trim().startsWith("tooSmall"),
+    );
+    if (!hasTooSmallAttempt) {
       return [];
     }
     return [
@@ -523,9 +523,15 @@ export async function applyMediaUnderstanding(params: {
     decisions.push(entry.decision);
   }
-  if (!outputs.some((output) => output.kind === "audio.transcription")) {
-    outputs.push(...buildSyntheticSkippedAudioOutputs(decisions));
-  }
+  const audioOutputAttachmentIndexes = new Set(
+    outputs
+      .filter((output) => output.kind === "audio.transcription")
+      .map((output) => output.attachmentIndex),
+  );
+  const syntheticSkippedAudioOutputs = buildSyntheticSkippedAudioOutputs(decisions).filter(
+    (output) => !audioOutputAttachmentIndexes.has(output.attachmentIndex),
+  );
+  outputs.push(...syntheticSkippedAudioOutputs);
   if (decisions.length > 0) {
     ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions];