From fd2f47a5e49664880ca70231991b3c73ea478c65 Mon Sep 17 00:00:00 2001
From: Eulices Lopez <eulices@users.noreply.github.com>
Date: Tue, 17 Mar 2026 12:25:51 -0400
Subject: [PATCH] fix: add placeholder transcript for silent voice notes

---
 src/media-understanding/apply.test.ts | 19 +++++++++++++---
 src/media-understanding/apply.ts      | 32 +++++++++++++++++++++++++++
 2 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/src/media-understanding/apply.test.ts b/src/media-understanding/apply.test.ts
index bea9c6bc2bb..61649e7a3aa 100644
--- a/src/media-understanding/apply.test.ts
+++ b/src/media-understanding/apply.test.ts
@@ -443,8 +443,7 @@ describe("applyMediaUnderstanding", () => {
     expect(ctx.Body).toBe("[Audio]\nTranscript:\nwhatsapp transcript");
   });
 
-  it("skips URL-only audio when remote file is too small", async () => {
-    // Override the default mock to return a tiny buffer (below MIN_AUDIO_FILE_BYTES)
+  it("injects a placeholder transcript when URL-only audio is too small", async () => {
     mockedFetchRemoteMedia.mockResolvedValueOnce({
       buffer: Buffer.alloc(100),
       contentType: "audio/ogg",
@@ -483,7 +482,21 @@ describe("applyMediaUnderstanding", () => {
     });
 
     expect(transcribeAudio).not.toHaveBeenCalled();
-    expect(result.appliedAudio).toBe(false);
+    expect(result.appliedAudio).toBe(true);
+    expect(result.outputs).toEqual([
+      expect.objectContaining({
+        kind: "audio.transcription",
+        text: "[Voice note was empty or contained only silence — no speech detected]",
+        provider: "openclaw",
+        model: "synthetic-empty-audio",
+      }),
+    ]);
+    expect(ctx.Transcript).toBe(
+      "[Voice note was empty or contained only silence — no speech detected]",
+    );
+    expect(ctx.Body).toBe(
+      "[Audio]\nTranscript:\n[Voice note was empty or contained only silence — no speech detected]",
+    );
   });
 
   it("skips audio transcription when attachment exceeds maxBytes", async () => {
diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts
index 7721dae16b0..65a02930661 100644
--- a/src/media-understanding/apply.ts
+++ b/src/media-understanding/apply.ts
@@ -43,6 +43,8 @@ export type ApplyMediaUnderstandingResult = {
 };
 
 const CAPABILITY_ORDER: MediaUnderstandingCapability[] = ["image", "audio", "video"];
+const EMPTY_VOICE_NOTE_PLACEHOLDER =
+  "[Voice note was empty or contained only silence — no speech detected]";
 const EXTRA_TEXT_MIMES = [
   "application/xml",
   "text/xml",
@@ -283,6 +285,32 @@ function resolveTextMimeFromName(name?: string): string | undefined {
   return TEXT_EXT_MIME.get(ext);
 }
 
+/** Emits a placeholder transcription for each audio attachment flagged "tooSmall". */
+function buildSyntheticSkippedAudioOutputs(
+  decisions: MediaUnderstandingDecision[],
+): MediaUnderstandingOutput[] {
+  const synthetic: MediaUnderstandingOutput[] = [];
+  const audio = decisions.find((entry) => entry.capability === "audio");
+  if (audio === undefined) {
+    return synthetic;
+  }
+  for (const attachment of audio.attachments) {
+    // Judge each attachment by its first attempt that carries a non-blank reason.
+    const trimmed = attachment.attempts.map((attempt) => attempt.reason?.trim());
+    const firstReason = trimmed.find((text) => Boolean(text));
+    if (firstReason?.startsWith("tooSmall")) {
+      synthetic.push({
+        kind: "audio.transcription",
+        attachmentIndex: attachment.attachmentIndex,
+        text: EMPTY_VOICE_NOTE_PLACEHOLDER,
+        provider: "openclaw",
+        model: "synthetic-empty-audio",
+      });
+    }
+  }
+  return synthetic;
+}
+
 function isBinaryMediaMime(mime?: string): boolean {
   if (!mime) {
     return false;
@@ -495,6 +523,10 @@ export async function applyMediaUnderstanding(params: {
       decisions.push(entry.decision);
     }
 
+    if (!outputs.some((output) => output.kind === "audio.transcription")) {
+      outputs.push(...buildSyntheticSkippedAudioOutputs(decisions));
+    }
+
     if (decisions.length > 0) {
       ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions];
     }