fix: add placeholder transcript for silent voice notes

This commit is contained in:
Eulices Lopez 2026-03-17 12:25:51 -04:00 committed by Eulices Lopez
parent dc86b6d72a
commit fd2f47a5e4
2 changed files with 48 additions and 3 deletions

View File

@ -443,8 +443,7 @@ describe("applyMediaUnderstanding", () => {
expect(ctx.Body).toBe("[Audio]\nTranscript:\nwhatsapp transcript");
});
it("skips URL-only audio when remote file is too small", async () => {
// Override the default mock to return a tiny buffer (below MIN_AUDIO_FILE_BYTES)
it("injects a placeholder transcript when URL-only audio is too small", async () => {
mockedFetchRemoteMedia.mockResolvedValueOnce({
buffer: Buffer.alloc(100),
contentType: "audio/ogg",
@ -483,7 +482,21 @@ describe("applyMediaUnderstanding", () => {
});
expect(transcribeAudio).not.toHaveBeenCalled();
expect(result.appliedAudio).toBe(false);
expect(result.appliedAudio).toBe(true);
expect(result.outputs).toEqual([
expect.objectContaining({
kind: "audio.transcription",
text: "[Voice note was empty or contained only silence — no speech detected]",
provider: "openclaw",
model: "synthetic-empty-audio",
}),
]);
expect(ctx.Transcript).toBe(
"[Voice note was empty or contained only silence — no speech detected]",
);
expect(ctx.Body).toBe(
"[Audio]\nTranscript:\n[Voice note was empty or contained only silence — no speech detected]",
);
});
it("skips audio transcription when attachment exceeds maxBytes", async () => {

View File

@ -43,6 +43,8 @@ export type ApplyMediaUnderstandingResult = {
};
const CAPABILITY_ORDER: MediaUnderstandingCapability[] = ["image", "audio", "video"];
const EMPTY_VOICE_NOTE_PLACEHOLDER =
"[Voice note was empty or contained only silence — no speech detected]";
const EXTRA_TEXT_MIMES = [
"application/xml",
"text/xml",
@ -283,6 +285,32 @@ function resolveTextMimeFromName(name?: string): string | undefined {
return TEXT_EXT_MIME.get(ext);
}
/**
 * Builds placeholder "audio.transcription" outputs for audio attachments that
 * were skipped because the file was too small (silent / empty voice notes).
 *
 * Only the decision with capability "audio" is consulted; an attachment
 * qualifies when its first non-blank attempt reason starts with "tooSmall".
 * Returns an empty array when there is no audio decision or nothing qualifies.
 */
function buildSyntheticSkippedAudioOutputs(
  decisions: MediaUnderstandingDecision[],
): MediaUnderstandingOutput[] {
  const audio = decisions.find((d) => d.capability === "audio");
  if (!audio) {
    return [];
  }
  const synthetic: MediaUnderstandingOutput[] = [];
  for (const attachment of audio.attachments) {
    // First non-blank attempt reason (trimmed), in attempt order.
    let firstReason: string | undefined;
    for (const attempt of attachment.attempts) {
      const trimmed = attempt.reason?.trim();
      if (trimmed) {
        firstReason = trimmed;
        break;
      }
    }
    // Only "tooSmall…" skips earn a synthetic placeholder transcript.
    if (firstReason === undefined || !firstReason.startsWith("tooSmall")) {
      continue;
    }
    synthetic.push({
      kind: "audio.transcription" as const,
      attachmentIndex: attachment.attachmentIndex,
      text: EMPTY_VOICE_NOTE_PLACEHOLDER,
      provider: "openclaw",
      model: "synthetic-empty-audio",
    });
  }
  return synthetic;
}
function isBinaryMediaMime(mime?: string): boolean {
if (!mime) {
return false;
@ -495,6 +523,10 @@ export async function applyMediaUnderstanding(params: {
decisions.push(entry.decision);
}
if (!outputs.some((output) => output.kind === "audio.transcription")) {
outputs.push(...buildSyntheticSkippedAudioOutputs(decisions));
}
if (decisions.length > 0) {
ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions];
}