fix: add placeholder transcript for silent voice notes

This commit is contained in:
Eulices Lopez 2026-03-17 12:25:51 -04:00 committed by Eulices Lopez
parent dc86b6d72a
commit fd2f47a5e4
2 changed files with 48 additions and 3 deletions

View File

@ -443,8 +443,7 @@ describe("applyMediaUnderstanding", () => {
expect(ctx.Body).toBe("[Audio]\nTranscript:\nwhatsapp transcript");
});
it("skips URL-only audio when remote file is too small", async () => {
// Override the default mock to return a tiny buffer (below MIN_AUDIO_FILE_BYTES)
it("injects a placeholder transcript when URL-only audio is too small", async () => {
mockedFetchRemoteMedia.mockResolvedValueOnce({
buffer: Buffer.alloc(100),
contentType: "audio/ogg",
@ -483,7 +482,21 @@ describe("applyMediaUnderstanding", () => {
});
expect(transcribeAudio).not.toHaveBeenCalled();
expect(result.appliedAudio).toBe(false);
expect(result.appliedAudio).toBe(true);
expect(result.outputs).toEqual([
expect.objectContaining({
kind: "audio.transcription",
text: "[Voice note was empty or contained only silence — no speech detected]",
provider: "openclaw",
model: "synthetic-empty-audio",
}),
]);
expect(ctx.Transcript).toBe(
"[Voice note was empty or contained only silence — no speech detected]",
);
expect(ctx.Body).toBe(
"[Audio]\nTranscript:\n[Voice note was empty or contained only silence — no speech detected]",
);
});
it("skips audio transcription when attachment exceeds maxBytes", async () => {

View File

@ -43,6 +43,8 @@ export type ApplyMediaUnderstandingResult = {
};
const CAPABILITY_ORDER: MediaUnderstandingCapability[] = ["image", "audio", "video"];
const EMPTY_VOICE_NOTE_PLACEHOLDER =
"[Voice note was empty or contained only silence — no speech detected]";
const EXTRA_TEXT_MIMES = [
"application/xml",
"text/xml",
@ -283,6 +285,32 @@ function resolveTextMimeFromName(name?: string): string | undefined {
return TEXT_EXT_MIME.get(ext);
}
/**
 * Builds placeholder "audio.transcription" outputs for audio attachments that
 * were skipped because the file was too small (silent / empty voice notes).
 *
 * Only the decision with capability "audio" is consulted; an attachment
 * qualifies when its first non-blank attempt reason starts with "tooSmall".
 * Returns an empty array when there is no audio decision or nothing qualifies.
 */
function buildSyntheticSkippedAudioOutputs(
  decisions: MediaUnderstandingDecision[],
): MediaUnderstandingOutput[] {
  const audio = decisions.find((d) => d.capability === "audio");
  if (!audio) {
    return [];
  }
  const synthetic: MediaUnderstandingOutput[] = [];
  for (const attachment of audio.attachments) {
    // First non-blank attempt reason (trimmed), in attempt order.
    let firstReason: string | undefined;
    for (const attempt of attachment.attempts) {
      const trimmed = attempt.reason?.trim();
      if (trimmed) {
        firstReason = trimmed;
        break;
      }
    }
    // Only "tooSmall…" skips earn a synthetic placeholder transcript.
    if (firstReason === undefined || !firstReason.startsWith("tooSmall")) {
      continue;
    }
    synthetic.push({
      kind: "audio.transcription" as const,
      attachmentIndex: attachment.attachmentIndex,
      text: EMPTY_VOICE_NOTE_PLACEHOLDER,
      provider: "openclaw",
      model: "synthetic-empty-audio",
    });
  }
  return synthetic;
}
function isBinaryMediaMime(mime?: string): boolean {
if (!mime) {
return false;
@ -495,6 +523,10 @@ export async function applyMediaUnderstanding(params: {
decisions.push(entry.decision);
}
if (!outputs.some((output) => output.kind === "audio.transcription")) {
outputs.push(...buildSyntheticSkippedAudioOutputs(decisions));
}
if (decisions.length > 0) {
ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions];
}