fix: handle placeholder transcripts per skipped attachment

Eulices Lopez 2026-03-17 12:43:19 -04:00 committed by Eulices Lopez
parent fd2f47a5e4
commit bac89c2ec8
2 changed files with 58 additions and 7 deletions
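Previously, synthetic placeholder outputs were only injected when no audio transcription output existed at all, so a skipped attachment got no placeholder whenever another attachment transcribed successfully. The change below keys placeholders by attachment index instead. A minimal sketch of that de-duplication, with a simplified output shape assumed for illustration (the real types live in the media-understanding module):

// Sketch only: "AudioOutput" is a simplified stand-in for the module's
// real output type, reduced to the fields the de-dup logic needs.
interface AudioOutput {
  kind: "audio.transcription";
  attachmentIndex: number;
  text: string;
}

function mergeSyntheticPlaceholders(
  outputs: AudioOutput[],
  synthetic: AudioOutput[], // placeholders built from skipped-attachment decisions
): AudioOutput[] {
  // Indexes that already have a real transcription result.
  const covered = new Set(outputs.map((output) => output.attachmentIndex));
  // Inject a placeholder only for attachments with no output of their own,
  // rather than skipping all placeholders when any transcription succeeded.
  return [...outputs, ...synthetic.filter((s) => !covered.has(s.attachmentIndex))];
}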


@@ -499,6 +499,51 @@ describe("applyMediaUnderstanding", () => {
     );
   });
 
+  it("injects a placeholder transcript when local-path audio is too small", async () => {
+    const ctx = await createAudioCtx({
+      fileName: "tiny.ogg",
+      mediaType: "audio/ogg",
+      content: Buffer.alloc(100),
+    });
+    const transcribeAudio = vi.fn(async () => ({ text: "should-not-run" }));
+    const cfg: OpenClawConfig = {
+      tools: {
+        media: {
+          audio: {
+            enabled: true,
+            maxBytes: 1024 * 1024,
+            models: [{ provider: "groq" }],
+          },
+        },
+      },
+    };
+    const result = await applyMediaUnderstanding({
+      ctx,
+      cfg,
+      providers: {
+        groq: { id: "groq", transcribeAudio },
+      },
+    });
+    expect(transcribeAudio).not.toHaveBeenCalled();
+    expect(result.appliedAudio).toBe(true);
+    expect(result.outputs).toEqual([
+      expect.objectContaining({
+        kind: "audio.transcription",
+        text: "[Voice note was empty or contained only silence — no speech detected]",
+        provider: "openclaw",
+        model: "synthetic-empty-audio",
+      }),
+    ]);
+    expect(ctx.Transcript).toBe(
+      "[Voice note was empty or contained only silence — no speech detected]",
+    );
+    expect(ctx.Body).toBe(
+      "[Audio]\nTranscript:\n[Voice note was empty or contained only silence — no speech detected]",
+    );
+  });
+
   it("skips audio transcription when attachment exceeds maxBytes", async () => {
     const ctx = await createAudioCtx({
       fileName: "large.wav",


@@ -293,10 +293,10 @@ function buildSyntheticSkippedAudioOutputs(
     return [];
   }
   return audioDecision.attachments.flatMap((attachment) => {
-    const reason = attachment.attempts
-      .map((attempt) => attempt.reason?.trim())
-      .find((value): value is string => Boolean(value));
-    if (!reason?.startsWith("tooSmall")) {
+    const hasTooSmallAttempt = attachment.attempts.some((attempt) =>
+      attempt.reason?.trim().startsWith("tooSmall"),
+    );
+    if (!hasTooSmallAttempt) {
       return [];
     }
     return [
@@ -523,9 +523,15 @@ export async function applyMediaUnderstanding(params: {
     decisions.push(entry.decision);
   }
-  if (!outputs.some((output) => output.kind === "audio.transcription")) {
-    outputs.push(...buildSyntheticSkippedAudioOutputs(decisions));
-  }
+  const audioOutputAttachmentIndexes = new Set(
+    outputs
+      .filter((output) => output.kind === "audio.transcription")
+      .map((output) => output.attachmentIndex),
+  );
+  const syntheticSkippedAudioOutputs = buildSyntheticSkippedAudioOutputs(decisions).filter(
+    (output) => !audioOutputAttachmentIndexes.has(output.attachmentIndex),
+  );
+  outputs.push(...syntheticSkippedAudioOutputs);
   if (decisions.length > 0) {
     ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions];