fix: add placeholder transcript for silent voice notes
This commit is contained in:
parent
dc86b6d72a
commit
fd2f47a5e4
@ -443,8 +443,7 @@ describe("applyMediaUnderstanding", () => {
|
||||
expect(ctx.Body).toBe("[Audio]\nTranscript:\nwhatsapp transcript");
|
||||
});
|
||||
|
||||
it("skips URL-only audio when remote file is too small", async () => {
|
||||
// Override the default mock to return a tiny buffer (below MIN_AUDIO_FILE_BYTES)
|
||||
it("injects a placeholder transcript when URL-only audio is too small", async () => {
|
||||
mockedFetchRemoteMedia.mockResolvedValueOnce({
|
||||
buffer: Buffer.alloc(100),
|
||||
contentType: "audio/ogg",
|
||||
@ -483,7 +482,21 @@ describe("applyMediaUnderstanding", () => {
|
||||
});
|
||||
|
||||
expect(transcribeAudio).not.toHaveBeenCalled();
|
||||
expect(result.appliedAudio).toBe(false);
|
||||
expect(result.appliedAudio).toBe(true);
|
||||
expect(result.outputs).toEqual([
|
||||
expect.objectContaining({
|
||||
kind: "audio.transcription",
|
||||
text: "[Voice note was empty or contained only silence — no speech detected]",
|
||||
provider: "openclaw",
|
||||
model: "synthetic-empty-audio",
|
||||
}),
|
||||
]);
|
||||
expect(ctx.Transcript).toBe(
|
||||
"[Voice note was empty or contained only silence — no speech detected]",
|
||||
);
|
||||
expect(ctx.Body).toBe(
|
||||
"[Audio]\nTranscript:\n[Voice note was empty or contained only silence — no speech detected]",
|
||||
);
|
||||
});
|
||||
|
||||
it("skips audio transcription when attachment exceeds maxBytes", async () => {
|
||||
|
||||
@ -43,6 +43,8 @@ export type ApplyMediaUnderstandingResult = {
|
||||
};
|
||||
|
||||
const CAPABILITY_ORDER: MediaUnderstandingCapability[] = ["image", "audio", "video"];
|
||||
/**
 * Transcript text used for synthetic audio transcription outputs, i.e. for
 * audio attachments whose skip reason starts with "tooSmall"
 * (see buildSyntheticSkippedAudioOutputs).
 */
const EMPTY_VOICE_NOTE_PLACEHOLDER =
|
||||
"[Voice note was empty or contained only silence — no speech detected]";
|
||||
const EXTRA_TEXT_MIMES = [
|
||||
"application/xml",
|
||||
"text/xml",
|
||||
@ -283,6 +285,32 @@ function resolveTextMimeFromName(name?: string): string | undefined {
|
||||
return TEXT_EXT_MIME.get(ext);
|
||||
}
|
||||
|
||||
/**
 * Builds placeholder transcription outputs for audio attachments that were
 * skipped because the audio was too small.
 *
 * Only the first decision whose capability is "audio" is considered. For each
 * of its attachments, the first non-blank (trimmed) attempt reason is
 * inspected; when that reason begins with "tooSmall", a single synthetic
 * "audio.transcription" output carrying EMPTY_VOICE_NOTE_PLACEHOLDER is
 * produced for the attachment.
 *
 * @param decisions - Decisions to scan for an audio entry.
 * @returns One synthetic output per qualifying attachment, in attachment
 *   order; an empty array when there is no audio decision or no attachment
 *   qualifies.
 */
function buildSyntheticSkippedAudioOutputs(
  decisions: MediaUnderstandingDecision[],
): MediaUnderstandingOutput[] {
  const synthetic: MediaUnderstandingOutput[] = [];

  for (const decision of decisions) {
    if (decision.capability !== "audio") {
      continue;
    }

    for (const attachment of decision.attachments) {
      // Locate the first attempt that recorded a non-blank reason.
      let firstReason: string | undefined;
      for (const attempt of attachment.attempts) {
        const trimmed = attempt.reason?.trim();
        if (trimmed) {
          firstReason = trimmed;
          break;
        }
      }

      if (firstReason === undefined || !firstReason.startsWith("tooSmall")) {
        continue;
      }

      synthetic.push({
        kind: "audio.transcription",
        attachmentIndex: attachment.attachmentIndex,
        text: EMPTY_VOICE_NOTE_PLACEHOLDER,
        provider: "openclaw",
        model: "synthetic-empty-audio",
      });
    }

    // Only the first audio decision contributes outputs.
    break;
  }

  return synthetic;
}
|
||||
|
||||
function isBinaryMediaMime(mime?: string): boolean {
|
||||
if (!mime) {
|
||||
return false;
|
||||
@ -495,6 +523,10 @@ export async function applyMediaUnderstanding(params: {
|
||||
decisions.push(entry.decision);
|
||||
}
|
||||
|
||||
if (!outputs.some((output) => output.kind === "audio.transcription")) {
|
||||
outputs.push(...buildSyntheticSkippedAudioOutputs(decisions));
|
||||
}
|
||||
|
||||
if (decisions.length > 0) {
|
||||
ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions];
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user