From fd2f47a5e49664880ca70231991b3c73ea478c65 Mon Sep 17 00:00:00 2001
From: Eulices Lopez <eulices@users.noreply.github.com>
Date: Tue, 17 Mar 2026 12:25:51 -0400
Subject: [PATCH 1/3] fix: add placeholder transcript for silent voice notes

---
 src/media-understanding/apply.test.ts | 19 +++++++++++++---
 src/media-understanding/apply.ts      | 32 +++++++++++++++++++++++++++
 2 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/src/media-understanding/apply.test.ts b/src/media-understanding/apply.test.ts
index bea9c6bc2bb..61649e7a3aa 100644
--- a/src/media-understanding/apply.test.ts
+++ b/src/media-understanding/apply.test.ts
@@ -443,8 +443,7 @@ describe("applyMediaUnderstanding", () => {
     expect(ctx.Body).toBe("[Audio]\nTranscript:\nwhatsapp transcript");
   });
 
-  it("skips URL-only audio when remote file is too small", async () => {
-    // Override the default mock to return a tiny buffer (below MIN_AUDIO_FILE_BYTES)
+  it("injects a placeholder transcript when URL-only audio is too small", async () => {
     mockedFetchRemoteMedia.mockResolvedValueOnce({
       buffer: Buffer.alloc(100),
       contentType: "audio/ogg",
@@ -483,7 +482,21 @@ describe("applyMediaUnderstanding", () => {
     });
 
     expect(transcribeAudio).not.toHaveBeenCalled();
-    expect(result.appliedAudio).toBe(false);
+    expect(result.appliedAudio).toBe(true);
+    expect(result.outputs).toEqual([
+      expect.objectContaining({
+        kind: "audio.transcription",
+        text: "[Voice note was empty or contained only silence — no speech detected]",
+        provider: "openclaw",
+        model: "synthetic-empty-audio",
+      }),
+    ]);
+    expect(ctx.Transcript).toBe(
+      "[Voice note was empty or contained only silence — no speech detected]",
+    );
+    expect(ctx.Body).toBe(
+      "[Audio]\nTranscript:\n[Voice note was empty or contained only silence — no speech detected]",
+    );
   });
 
   it("skips audio transcription when attachment exceeds maxBytes", async () => {
diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts
index 7721dae16b0..65a02930661 100644
--- a/src/media-understanding/apply.ts
+++ b/src/media-understanding/apply.ts
@@ -43,6 +43,8 @@ export type ApplyMediaUnderstandingResult = {
 };
 
 const CAPABILITY_ORDER: MediaUnderstandingCapability[] = ["image", "audio", "video"];
+const EMPTY_VOICE_NOTE_PLACEHOLDER =
+  "[Voice note was empty or contained only silence — no speech detected]";
 const EXTRA_TEXT_MIMES = [
   "application/xml",
   "text/xml",
@@ -283,6 +285,32 @@ function resolveTextMimeFromName(name?: string): string | undefined {
   return TEXT_EXT_MIME.get(ext);
 }
 
+function buildSyntheticSkippedAudioOutputs(
+  decisions: MediaUnderstandingDecision[],
+): MediaUnderstandingOutput[] {
+  const audioDecision = decisions.find((decision) => decision.capability === "audio");
+  if (!audioDecision) {
+    return [];
+  }
+  return audioDecision.attachments.flatMap((attachment) => {
+    const reason = attachment.attempts
+      .map((attempt) => attempt.reason?.trim())
+      .find((value): value is string => Boolean(value));
+    if (!reason?.startsWith("tooSmall")) {
+      return [];
+    }
+    return [
+      {
+        kind: "audio.transcription" as const,
+        attachmentIndex: attachment.attachmentIndex,
+        text: EMPTY_VOICE_NOTE_PLACEHOLDER,
+        provider: "openclaw",
+        model: "synthetic-empty-audio",
+      },
+    ];
+  });
+}
+
 function isBinaryMediaMime(mime?: string): boolean {
   if (!mime) {
     return false;
@@ -495,6 +523,10 @@ export async function applyMediaUnderstanding(params: {
       decisions.push(entry.decision);
     }
 
+    if (!outputs.some((output) => output.kind === "audio.transcription")) {
+      outputs.push(...buildSyntheticSkippedAudioOutputs(decisions));
+    }
+
     if (decisions.length > 0) {
       ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions];
     }

From bac89c2ec8c5daa2aa0a1aaa941d14ab6a1cbcef Mon Sep 17 00:00:00 2001
From: Eulices Lopez <eulices@users.noreply.github.com>
Date: Tue, 17 Mar 2026 12:43:19 -0400
Subject: [PATCH 2/3] fix: handle placeholder transcripts per skipped
 attachment

---
 src/media-understanding/apply.test.ts | 45 +++++++++++++++++++++++++++
 src/media-understanding/apply.ts      | 20 +++++++-----
 2 files changed, 58 insertions(+), 7 deletions(-)

diff --git a/src/media-understanding/apply.test.ts b/src/media-understanding/apply.test.ts
index 61649e7a3aa..0152e1d0a70 100644
--- a/src/media-understanding/apply.test.ts
+++ b/src/media-understanding/apply.test.ts
@@ -499,6 +499,51 @@ describe("applyMediaUnderstanding", () => {
     );
   });
 
+  it("injects a placeholder transcript when local-path audio is too small", async () => {
+    const ctx = await createAudioCtx({
+      fileName: "tiny.ogg",
+      mediaType: "audio/ogg",
+      content: Buffer.alloc(100),
+    });
+    const transcribeAudio = vi.fn(async () => ({ text: "should-not-run" }));
+    const cfg: OpenClawConfig = {
+      tools: {
+        media: {
+          audio: {
+            enabled: true,
+            maxBytes: 1024 * 1024,
+            models: [{ provider: "groq" }],
+          },
+        },
+      },
+    };
+
+    const result = await applyMediaUnderstanding({
+      ctx,
+      cfg,
+      providers: {
+        groq: { id: "groq", transcribeAudio },
+      },
+    });
+
+    expect(transcribeAudio).not.toHaveBeenCalled();
+    expect(result.appliedAudio).toBe(true);
+    expect(result.outputs).toEqual([
+      expect.objectContaining({
+        kind: "audio.transcription",
+        text: "[Voice note was empty or contained only silence — no speech detected]",
+        provider: "openclaw",
+        model: "synthetic-empty-audio",
+      }),
+    ]);
+    expect(ctx.Transcript).toBe(
+      "[Voice note was empty or contained only silence — no speech detected]",
+    );
+    expect(ctx.Body).toBe(
+      "[Audio]\nTranscript:\n[Voice note was empty or contained only silence — no speech detected]",
+    );
+  });
+
   it("skips audio transcription when attachment exceeds maxBytes", async () => {
     const ctx = await createAudioCtx({
       fileName: "large.wav",
diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts
index 65a02930661..1f97eaba5d7 100644
--- a/src/media-understanding/apply.ts
+++ b/src/media-understanding/apply.ts
@@ -293,10 +293,10 @@ function buildSyntheticSkippedAudioOutputs(
     return [];
   }
   return audioDecision.attachments.flatMap((attachment) => {
-    const reason = attachment.attempts
-      .map((attempt) => attempt.reason?.trim())
-      .find((value): value is string => Boolean(value));
-    if (!reason?.startsWith("tooSmall")) {
+    const hasTooSmallAttempt = attachment.attempts.some((attempt) =>
+      attempt.reason?.trim().startsWith("tooSmall"),
+    );
+    if (!hasTooSmallAttempt) {
       return [];
     }
     return [
@@ -523,9 +523,15 @@ export async function applyMediaUnderstanding(params: {
       decisions.push(entry.decision);
     }
 
-    if (!outputs.some((output) => output.kind === "audio.transcription")) {
-      outputs.push(...buildSyntheticSkippedAudioOutputs(decisions));
-    }
+    const audioOutputAttachmentIndexes = new Set(
+      outputs
+        .filter((output) => output.kind === "audio.transcription")
+        .map((output) => output.attachmentIndex),
+    );
+    const syntheticSkippedAudioOutputs = buildSyntheticSkippedAudioOutputs(decisions).filter(
+      (output) => !audioOutputAttachmentIndexes.has(output.attachmentIndex),
+    );
+    outputs.push(...syntheticSkippedAudioOutputs);
 
     if (decisions.length > 0) {
       ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions];

From d6648e77bc6b4ddd731c811500d3dfa227a1bb67 Mon Sep 17 00:00:00 2001
From: Eulices Lopez <eulices@users.noreply.github.com>
Date: Wed, 18 Mar 2026 01:52:58 -0400
Subject: [PATCH 3/3] fix: preserve synthetic transcript attachment order

---
 src/media-understanding/apply.ts       |  1 +
 src/media-understanding/format.test.ts | 25 +++++++++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts
index 1f97eaba5d7..d18bdbb282b 100644
--- a/src/media-understanding/apply.ts
+++ b/src/media-understanding/apply.ts
@@ -532,6 +532,7 @@ export async function applyMediaUnderstanding(params: {
       (output) => !audioOutputAttachmentIndexes.has(output.attachmentIndex),
     );
     outputs.push(...syntheticSkippedAudioOutputs);
+    outputs.sort((left, right) => left.attachmentIndex - right.attachmentIndex);
 
     if (decisions.length > 0) {
       ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions];
diff --git a/src/media-understanding/format.test.ts b/src/media-understanding/format.test.ts
index 172ecadf985..57ea5ebf83b 100644
--- a/src/media-understanding/format.test.ts
+++ b/src/media-understanding/format.test.ts
@@ -88,4 +88,29 @@ describe("formatMediaUnderstandingBody", () => {
     });
     expect(body).toBe("[Image]\nDescription:\na cat");
   });
+
+  it("labels audio transcripts by their attachment order", () => {
+    const body = formatMediaUnderstandingBody({
+      outputs: [
+        {
+          kind: "audio.transcription",
+          attachmentIndex: 0,
+          text: "first clip was silent",
+          provider: "openclaw",
+        },
+        {
+          kind: "audio.transcription",
+          attachmentIndex: 1,
+          text: "second clip has speech",
+          provider: "groq",
+        },
+      ],
+    });
+    expect(body).toBe(
+      [
+        "[Audio 1/2]\nTranscript:\nfirst clip was silent",
+        "[Audio 2/2]\nTranscript:\nsecond clip has speech",
+      ].join("\n\n"),
+    );
+  });
 });