From bac89c2ec8c5daa2aa0a1aaa941d14ab6a1cbcef Mon Sep 17 00:00:00 2001
From: Eulices Lopez
Date: Tue, 17 Mar 2026 12:43:19 -0400
Subject: [PATCH] fix: handle placeholder transcripts per skipped attachment

---
 src/media-understanding/apply.test.ts | 45 +++++++++++++++++++++++++++
 src/media-understanding/apply.ts      | 20 +++++++-----
 2 files changed, 58 insertions(+), 7 deletions(-)

diff --git a/src/media-understanding/apply.test.ts b/src/media-understanding/apply.test.ts
index 61649e7a3aa..0152e1d0a70 100644
--- a/src/media-understanding/apply.test.ts
+++ b/src/media-understanding/apply.test.ts
@@ -499,6 +499,51 @@ describe("applyMediaUnderstanding", () => {
     );
   });
 
+  it("injects a placeholder transcript when local-path audio is too small", async () => {
+    const ctx = await createAudioCtx({
+      fileName: "tiny.ogg",
+      mediaType: "audio/ogg",
+      content: Buffer.alloc(100),
+    });
+    const transcribeAudio = vi.fn(async () => ({ text: "should-not-run" }));
+    const cfg: OpenClawConfig = {
+      tools: {
+        media: {
+          audio: {
+            enabled: true,
+            maxBytes: 1024 * 1024,
+            models: [{ provider: "groq" }],
+          },
+        },
+      },
+    };
+
+    const result = await applyMediaUnderstanding({
+      ctx,
+      cfg,
+      providers: {
+        groq: { id: "groq", transcribeAudio },
+      },
+    });
+
+    expect(transcribeAudio).not.toHaveBeenCalled();
+    expect(result.appliedAudio).toBe(true);
+    expect(result.outputs).toEqual([
+      expect.objectContaining({
+        kind: "audio.transcription",
+        text: "[Voice note was empty or contained only silence — no speech detected]",
+        provider: "openclaw",
+        model: "synthetic-empty-audio",
+      }),
+    ]);
+    expect(ctx.Transcript).toBe(
+      "[Voice note was empty or contained only silence — no speech detected]",
+    );
+    expect(ctx.Body).toBe(
+      "[Audio]\nTranscript:\n[Voice note was empty or contained only silence — no speech detected]",
+    );
+  });
+
   it("skips audio transcription when attachment exceeds maxBytes", async () => {
     const ctx = await createAudioCtx({
       fileName: "large.wav",
diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts
index 65a02930661..1f97eaba5d7 100644
--- a/src/media-understanding/apply.ts
+++ b/src/media-understanding/apply.ts
@@ -293,10 +293,10 @@ function buildSyntheticSkippedAudioOutputs(
     return [];
   }
   return audioDecision.attachments.flatMap((attachment) => {
-    const reason = attachment.attempts
-      .map((attempt) => attempt.reason?.trim())
-      .find((value): value is string => Boolean(value));
-    if (!reason?.startsWith("tooSmall")) {
+    const hasTooSmallAttempt = attachment.attempts.some((attempt) =>
+      attempt.reason?.trim().startsWith("tooSmall"),
+    );
+    if (!hasTooSmallAttempt) {
       return [];
     }
     return [
@@ -523,9 +523,15 @@ export async function applyMediaUnderstanding(params: {
     decisions.push(entry.decision);
   }
 
-  if (!outputs.some((output) => output.kind === "audio.transcription")) {
-    outputs.push(...buildSyntheticSkippedAudioOutputs(decisions));
-  }
+  const audioOutputAttachmentIndexes = new Set(
+    outputs
+      .filter((output) => output.kind === "audio.transcription")
+      .map((output) => output.attachmentIndex),
+  );
+  const syntheticSkippedAudioOutputs = buildSyntheticSkippedAudioOutputs(decisions).filter(
+    (output) => !audioOutputAttachmentIndexes.has(output.attachmentIndex),
+  );
+  outputs.push(...syntheticSkippedAudioOutputs);
 
   if (decisions.length > 0) {
     ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions];
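
Reviewer note: the second apply.ts hunk replaces an all-or-nothing guard (skip every placeholder as soon as any transcription exists) with per-attachment dedup keyed on attachmentIndex. Below is a minimal standalone sketch of that behavior; the MediaOutput shape and the helper name withSyntheticPlaceholders are hypothetical, inferred from the diff rather than taken from the real OpenClaw types.

// Sketch only, not part of the patch. Models the dedup step: a synthetic
// placeholder survives only when its attachment produced no real transcript.
type MediaOutput = {
  kind: string;
  attachmentIndex: number;
  text: string;
};

function withSyntheticPlaceholders(
  outputs: MediaOutput[],
  synthetic: MediaOutput[],
): MediaOutput[] {
  // Attachment indexes that already have a real transcription output.
  const transcribed = new Set(
    outputs
      .filter((output) => output.kind === "audio.transcription")
      .map((output) => output.attachmentIndex),
  );
  // Keep a placeholder only for attachments with no real transcript.
  const missing = synthetic.filter(
    (output) => !transcribed.has(output.attachmentIndex),
  );
  return [...outputs, ...missing];
}

// Example: attachment 0 was transcribed, attachment 1 was skipped as tooSmall.
// Under the old guard, attachment 1 got no placeholder because attachment 0
// produced a transcription; with this dedup it gets its own placeholder.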