diff --git a/src/media-understanding/apply.test.ts b/src/media-understanding/apply.test.ts
index bea9c6bc2bb..0152e1d0a70 100644
--- a/src/media-understanding/apply.test.ts
+++ b/src/media-understanding/apply.test.ts
@@ -443,8 +443,7 @@ describe("applyMediaUnderstanding", () => {
     expect(ctx.Body).toBe("[Audio]\nTranscript:\nwhatsapp transcript");
   });
 
-  it("skips URL-only audio when remote file is too small", async () => {
-    // Override the default mock to return a tiny buffer (below MIN_AUDIO_FILE_BYTES)
+  it("injects a placeholder transcript when URL-only audio is too small", async () => {
     mockedFetchRemoteMedia.mockResolvedValueOnce({
       buffer: Buffer.alloc(100),
       contentType: "audio/ogg",
@@ -483,7 +482,66 @@ describe("applyMediaUnderstanding", () => {
     });
 
     expect(transcribeAudio).not.toHaveBeenCalled();
-    expect(result.appliedAudio).toBe(false);
+    expect(result.appliedAudio).toBe(true);
+    expect(result.outputs).toEqual([
+      expect.objectContaining({
+        kind: "audio.transcription",
+        text: "[Voice note was empty or contained only silence — no speech detected]",
+        provider: "openclaw",
+        model: "synthetic-empty-audio",
+      }),
+    ]);
+    expect(ctx.Transcript).toBe(
+      "[Voice note was empty or contained only silence — no speech detected]",
+    );
+    expect(ctx.Body).toBe(
+      "[Audio]\nTranscript:\n[Voice note was empty or contained only silence — no speech detected]",
+    );
+  });
+
+  it("injects a placeholder transcript when local-path audio is too small", async () => {
+    const ctx = await createAudioCtx({
+      fileName: "tiny.ogg",
+      mediaType: "audio/ogg",
+      content: Buffer.alloc(100),
+    });
+    const transcribeAudio = vi.fn(async () => ({ text: "should-not-run" }));
+    const cfg: OpenClawConfig = {
+      tools: {
+        media: {
+          audio: {
+            enabled: true,
+            maxBytes: 1024 * 1024,
+            models: [{ provider: "groq" }],
+          },
+        },
+      },
+    };
+
+    const result = await applyMediaUnderstanding({
+      ctx,
+      cfg,
+      providers: {
+        groq: { id: "groq", transcribeAudio },
+      },
+    });
+
+    expect(transcribeAudio).not.toHaveBeenCalled();
+    expect(result.appliedAudio).toBe(true);
+    expect(result.outputs).toEqual([
+      expect.objectContaining({
+        kind: "audio.transcription",
+        text: "[Voice note was empty or contained only silence — no speech detected]",
+        provider: "openclaw",
+        model: "synthetic-empty-audio",
+      }),
+    ]);
+    expect(ctx.Transcript).toBe(
+      "[Voice note was empty or contained only silence — no speech detected]",
+    );
+    expect(ctx.Body).toBe(
+      "[Audio]\nTranscript:\n[Voice note was empty or contained only silence — no speech detected]",
+    );
   });
 
   it("skips audio transcription when attachment exceeds maxBytes", async () => {
diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts
index 7721dae16b0..d18bdbb282b 100644
--- a/src/media-understanding/apply.ts
+++ b/src/media-understanding/apply.ts
@@ -43,6 +43,8 @@ export type ApplyMediaUnderstandingResult = {
 };
 
 const CAPABILITY_ORDER: MediaUnderstandingCapability[] = ["image", "audio", "video"];
+const EMPTY_VOICE_NOTE_PLACEHOLDER =
+  "[Voice note was empty or contained only silence — no speech detected]";
 const EXTRA_TEXT_MIMES = [
   "application/xml",
   "text/xml",
@@ -283,6 +285,32 @@ function resolveTextMimeFromName(name?: string): string | undefined {
   return TEXT_EXT_MIME.get(ext);
 }
 
+function buildSyntheticSkippedAudioOutputs(
+  decisions: MediaUnderstandingDecision[],
+): MediaUnderstandingOutput[] {
+  const audioDecision = decisions.find((decision) => decision.capability === "audio");
+  if (!audioDecision) {
+    return [];
+  }
+  return audioDecision.attachments.flatMap((attachment) => {
+    const hasTooSmallAttempt = attachment.attempts.some((attempt) =>
+      attempt.reason?.trim().startsWith("tooSmall"),
+    );
+    if (!hasTooSmallAttempt) {
+      return [];
+    }
+    return [
+      {
+        kind: "audio.transcription" as const,
+        attachmentIndex: attachment.attachmentIndex,
+        text: EMPTY_VOICE_NOTE_PLACEHOLDER,
+        provider: "openclaw",
+        model: "synthetic-empty-audio",
+      },
+    ];
+  });
+}
+
 function isBinaryMediaMime(mime?: string): boolean {
   if (!mime) {
     return false;
@@ -495,6 +523,17 @@ export async function applyMediaUnderstanding(params: {
       decisions.push(entry.decision);
     }
 
+    const audioOutputAttachmentIndexes = new Set(
+      outputs
+        .filter((output) => output.kind === "audio.transcription")
+        .map((output) => output.attachmentIndex),
+    );
+    const syntheticSkippedAudioOutputs = buildSyntheticSkippedAudioOutputs(decisions).filter(
+      (output) => !audioOutputAttachmentIndexes.has(output.attachmentIndex),
+    );
+    outputs.push(...syntheticSkippedAudioOutputs);
+    outputs.sort((left, right) => left.attachmentIndex - right.attachmentIndex);
+
     if (decisions.length > 0) {
       ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions];
     }
diff --git a/src/media-understanding/format.test.ts b/src/media-understanding/format.test.ts
index 172ecadf985..57ea5ebf83b 100644
--- a/src/media-understanding/format.test.ts
+++ b/src/media-understanding/format.test.ts
@@ -88,4 +88,29 @@ describe("formatMediaUnderstandingBody", () => {
     });
     expect(body).toBe("[Image]\nDescription:\na cat");
   });
+
+  it("labels audio transcripts by their attachment order", () => {
+    const body = formatMediaUnderstandingBody({
+      outputs: [
+        {
+          kind: "audio.transcription",
+          attachmentIndex: 0,
+          text: "first clip was silent",
+          provider: "openclaw",
+        },
+        {
+          kind: "audio.transcription",
+          attachmentIndex: 1,
+          text: "second clip has speech",
+          provider: "groq",
+        },
+      ],
+    });
+    expect(body).toBe(
+      [
+        "[Audio 1/2]\nTranscript:\nfirst clip was silent",
+        "[Audio 2/2]\nTranscript:\nsecond clip has speech",
+      ].join("\n\n"),
+    );
+  });
 });