Merge d6648e77bc6b4ddd731c811500d3dfa227a1bb67 into 9fb78453e088cd7b553d7779faa0de5c83708e70

2026-03-20 22:02:18 -07:00 · 2026-03-20 22:02:18 -07:00 · 6e9fd851fd
commit 6e9fd851fd
parent 9fb78453e0 d6648e77bc
3 changed files with 125 additions and 3 deletions
--- a/src/media-understanding/apply.test.ts
+++ b/src/media-understanding/apply.test.ts
@ -443,8 +443,7 @@ describe("applyMediaUnderstanding", () => {
    expect(ctx.Body).toBe("[Audio]\nTranscript:\nwhatsapp transcript");
  });

-  it("skips URL-only audio when remote file is too small", async () => {
-    // Override the default mock to return a tiny buffer (below MIN_AUDIO_FILE_BYTES)
+  it("injects a placeholder transcript when URL-only audio is too small", async () => {
    mockedFetchRemoteMedia.mockResolvedValueOnce({
      buffer: Buffer.alloc(100),
      contentType: "audio/ogg",
@ -483,7 +482,66 @@ describe("applyMediaUnderstanding", () => {
    });

    expect(transcribeAudio).not.toHaveBeenCalled();
-    expect(result.appliedAudio).toBe(false);
+    expect(result.appliedAudio).toBe(true);
+    expect(result.outputs).toEqual([
+      expect.objectContaining({
+        kind: "audio.transcription",
+        text: "[Voice note was empty or contained only silence — no speech detected]",
+        provider: "openclaw",
+        model: "synthetic-empty-audio",
+      }),
+    ]);
+    expect(ctx.Transcript).toBe(
+      "[Voice note was empty or contained only silence — no speech detected]",
+    );
+    expect(ctx.Body).toBe(
+      "[Audio]\nTranscript:\n[Voice note was empty or contained only silence — no speech detected]",
+    );
+  });
+
+  it("injects a placeholder transcript when local-path audio is too small", async () => {
+    const ctx = await createAudioCtx({
+      fileName: "tiny.ogg",
+      mediaType: "audio/ogg",
+      content: Buffer.alloc(100), // 100 zero-filled bytes; this test expects the file to be treated as too small.
+    });
+    const transcribeAudio = vi.fn(async () => ({ text: "should-not-run" })); // Stub provider; the assertions below require that it is never invoked.
+    const cfg: OpenClawConfig = {
+      tools: {
+        media: {
+          audio: {
+            enabled: true,
+            maxBytes: 1024 * 1024, // 1 MiB, well above the 100-byte fixture.
+            models: [{ provider: "groq" }],
+          },
+        },
+      },
+    };
+
+    const result = await applyMediaUnderstanding({
+      ctx,
+      cfg,
+      providers: {
+        groq: { id: "groq", transcribeAudio },
+      },
+    });
+
+    expect(transcribeAudio).not.toHaveBeenCalled(); // The real provider must be bypassed.
+    expect(result.appliedAudio).toBe(true); // The synthetic output is expected to count as applied audio.
+    expect(result.outputs).toEqual([
+      expect.objectContaining({
+        kind: "audio.transcription",
+        text: "[Voice note was empty or contained only silence — no speech detected]",
+        provider: "openclaw",
+        model: "synthetic-empty-audio",
+      }),
+    ]);
+    expect(ctx.Transcript).toBe(
+      "[Voice note was empty or contained only silence — no speech detected]",
+    );
+    expect(ctx.Body).toBe(
+      "[Audio]\nTranscript:\n[Voice note was empty or contained only silence — no speech detected]",
+    );
  });

  it("skips audio transcription when attachment exceeds maxBytes", async () => {
--- a/src/media-understanding/apply.ts
+++ b/src/media-understanding/apply.ts
@ -43,6 +43,8 @@ export type ApplyMediaUnderstandingResult = {
 };

 const CAPABILITY_ORDER: MediaUnderstandingCapability[] = ["image", "audio", "video"];
+const EMPTY_VOICE_NOTE_PLACEHOLDER =
+  "[Voice note was empty or contained only silence — no speech detected]"; // Transcript text used by buildSyntheticSkippedAudioOutputs for too-small audio.
 const EXTRA_TEXT_MIMES = [
  "application/xml",
  "text/xml",
@ -283,6 +285,32 @@ function resolveTextMimeFromName(name?: string): string | undefined {
  return TEXT_EXT_MIME.get(ext);
 }

+function buildSyntheticSkippedAudioOutputs( // Builds placeholder transcription outputs for audio attachments that have a "tooSmall" attempt.
+  decisions: MediaUnderstandingDecision[], // Decisions to scan; only the first one with capability "audio" is inspected.
+): MediaUnderstandingOutput[] { // One synthetic output per qualifying attachment; empty when none qualify.
+  const audioDecision = decisions.find((decision) => decision.capability === "audio");
+  if (!audioDecision) {
+    return []; // No audio decision present, so there is nothing to synthesize.
+  }
+  return audioDecision.attachments.flatMap((attachment) => {
+    const hasTooSmallAttempt = attachment.attempts.some((attempt) =>
+      attempt.reason?.trim().startsWith("tooSmall"), // A missing reason short-circuits to undefined, which counts as no match.
+    );
+    if (!hasTooSmallAttempt) {
+      return []; // flatMap flattens this away, dropping the attachment from the result.
+    }
+    return [
+      {
+        kind: "audio.transcription" as const, // `as const` keeps the literal type rather than widening to string.
+        attachmentIndex: attachment.attachmentIndex,
+        text: EMPTY_VOICE_NOTE_PLACEHOLDER,
+        provider: "openclaw", // Fixed provider/model labels; this function does not call any transcription provider.
+        model: "synthetic-empty-audio",
+      },
+    ];
+  });
+}
+
 function isBinaryMediaMime(mime?: string): boolean {
  if (!mime) {
    return false;
@ -495,6 +523,17 @@ export async function applyMediaUnderstanding(params: {
      decisions.push(entry.decision);
    }

+    const audioOutputAttachmentIndexes = new Set(
+      outputs
+        .filter((output) => output.kind === "audio.transcription")
+        .map((output) => output.attachmentIndex),
+    );
+    const syntheticSkippedAudioOutputs = buildSyntheticSkippedAudioOutputs(decisions).filter(
+      (output) => !audioOutputAttachmentIndexes.has(output.attachmentIndex),
+    );
+    outputs.push(...syntheticSkippedAudioOutputs);
+    outputs.sort((left, right) => left.attachmentIndex - right.attachmentIndex);
+
    if (decisions.length > 0) {
      ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions];
    }
--- a/src/media-understanding/format.test.ts
+++ b/src/media-understanding/format.test.ts
@ -88,4 +88,29 @@ describe("formatMediaUnderstandingBody", () => {
    });
    expect(body).toBe("[Image]\nDescription:\na cat");
  });
+
+  it("labels audio transcripts by their attachment order", () => { // Two audio outputs are expected to be numbered "[Audio 1/2]" and "[Audio 2/2]".
+    const body = formatMediaUnderstandingBody({
+      outputs: [
+        {
+          kind: "audio.transcription",
+          attachmentIndex: 0,
+          text: "first clip was silent",
+          provider: "openclaw",
+        },
+        {
+          kind: "audio.transcription",
+          attachmentIndex: 1,
+          text: "second clip has speech",
+          provider: "groq",
+        },
+      ],
+    });
+    expect(body).toBe(
+      [
+        "[Audio 1/2]\nTranscript:\nfirst clip was silent",
+        "[Audio 2/2]\nTranscript:\nsecond clip has speech",
+      ].join("\n\n"), // The expected body separates the sections with a blank line.
+    );
+  });
 });