fix: rebuild queued followup media prompts

This commit is contained in:
Joey Krug 2026-03-14 15:45:17 -04:00
parent 6edb3b7e34
commit be3eec46e2
2 changed files with 196 additions and 5 deletions

View File

@ -818,6 +818,138 @@ describe("createFollowupRunner media understanding", () => {
expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "Got it!" }));
});
it("strips the full media line when attachment paths or URLs contain brackets", async () => {
  // Deferred (queued) path: attachment paths/URLs that themselves contain
  // brackets must not confuse the synthetic-media-line stripping.
  const transcriptText = "Bracket-safe transcript";
  const transcription = {
    kind: "audio.transcription",
    text: transcriptText,
    attachmentIndex: 0,
    provider: "whisper",
  };
  applyMediaUnderstandingMock.mockImplementationOnce(
    async (params: { ctx: Record<string, unknown> }) => {
      // Simulate a successful audio transcription mutating the media context.
      params.ctx.MediaUnderstanding = [{ ...transcription }];
      params.ctx.Transcript = transcriptText;
      params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}`;
      return {
        outputs: [{ ...transcription }],
        decisions: [],
        appliedImage: false,
        appliedAudio: true,
        appliedVideo: false,
        appliedFile: false,
      };
    },
  );
  runEmbeddedPiAgentMock.mockResolvedValueOnce({
    payloads: [{ text: "done" }],
    meta: {},
  });
  const runner = createFollowupRunner({
    opts: { onBlockReply: vi.fn(async () => {}) },
    typing: createMockTypingController(),
    typingMode: "instant",
    defaultModel: "anthropic/claude-opus-4-5",
  });
  // Both the local path and the URL embed "[0]" to exercise bracket handling.
  await runner(
    createQueuedRun({
      prompt:
        "[media attached: /tmp/voice[0].ogg (audio/ogg) | https://cdn.example.com/files[0].ogg]\nsome text",
      mediaContext: {
        Body: "some text",
        CommandBody: "some text",
        RawBody: "some text",
        MediaPaths: ["/tmp/voice[0].ogg"],
        MediaUrls: ["https://cdn.example.com/files[0].ogg"],
        MediaTypes: ["audio/ogg"],
      },
    }),
  );
  const lastAgentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
    prompt?: string;
  };
  // The rebuilt prompt carries the transcript and drops the entire media line.
  expect(lastAgentCall?.prompt).toContain(transcriptText);
  expect(lastAgentCall?.prompt).not.toContain("[media attached:");
  expect(lastAgentCall?.prompt).not.toContain("files[0].ogg]");
});
it("only strips leading synthetic media lines and preserves literal user text later in the prompt", async () => {
  const transcriptText = "Transcript with literal token";
  // The user's message deliberately contains the "[media attached: …]" token
  // so we can verify only the leading synthetic line is removed.
  const literalBody = "I literally typed [media attached: keep me] in this message.";
  const transcription = {
    kind: "audio.transcription",
    text: transcriptText,
    attachmentIndex: 0,
    provider: "whisper",
  };
  applyMediaUnderstandingMock.mockImplementationOnce(
    async (params: { ctx: Record<string, unknown> }) => {
      params.ctx.MediaUnderstanding = [{ ...transcription }];
      params.ctx.Transcript = transcriptText;
      params.ctx.Body = literalBody;
      return {
        outputs: [{ ...transcription }],
        decisions: [],
        appliedImage: false,
        appliedAudio: true,
        appliedVideo: false,
        appliedFile: false,
      };
    },
  );
  runEmbeddedPiAgentMock.mockResolvedValueOnce({
    payloads: [{ text: "done" }],
    meta: {},
  });
  const runner = createFollowupRunner({
    opts: { onBlockReply: vi.fn(async () => {}) },
    typing: createMockTypingController(),
    typingMode: "instant",
    defaultModel: "anthropic/claude-opus-4-5",
  });
  await runner(
    createQueuedRun({
      prompt: `[media attached: /tmp/voice.ogg (audio/ogg)]\n${literalBody}`,
      mediaContext: {
        Body: literalBody,
        CommandBody: literalBody,
        RawBody: literalBody,
        MediaPaths: ["/tmp/voice.ogg"],
        MediaTypes: ["audio/ogg"],
      },
    }),
  );
  const lastAgentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
    prompt?: string;
  };
  // The literal user text survives; only the leading synthetic line is gone.
  expect(lastAgentCall?.prompt).toContain(literalBody);
  expect(lastAgentCall?.prompt).not.toContain("[media attached: /tmp/voice.ogg (audio/ogg)]");
});
it("skips media understanding when MediaUnderstanding is already populated", async () => {
runEmbeddedPiAgentMock.mockResolvedValueOnce({
payloads: [{ text: "reply" }],
@ -920,6 +1052,54 @@ describe("createFollowupRunner media understanding", () => {
expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "fallback reply" }));
});
it("rebuilds the prompt when file extraction succeeds without media outputs", async () => {
  const fileBlock = '<file name="notes.txt" mime="text/plain">\nline one\n</file>';
  applyMediaUnderstandingMock.mockImplementationOnce(
    async (params: { ctx: Record<string, unknown> }) => {
      // File extraction mutates the body with an inlined <file> block while
      // producing no media-understanding outputs (appliedFile only).
      params.ctx.Body = `some text\n\n${fileBlock}`;
      return {
        outputs: [],
        decisions: [],
        appliedImage: false,
        appliedAudio: false,
        appliedVideo: false,
        appliedFile: true,
      };
    },
  );
  runEmbeddedPiAgentMock.mockResolvedValueOnce({
    payloads: [{ text: "file processed" }],
    meta: {},
  });
  const runner = createFollowupRunner({
    opts: { onBlockReply: vi.fn(async () => {}) },
    typing: createMockTypingController(),
    typingMode: "instant",
    defaultModel: "anthropic/claude-opus-4-5",
  });
  await runner(
    createQueuedRun({
      prompt: "[media attached: /tmp/notes.txt (text/plain)]\nsome text",
      mediaContext: {
        Body: "some text",
        CommandBody: "some text",
        RawBody: "some text",
        MediaPaths: ["/tmp/notes.txt"],
        MediaTypes: ["text/plain"],
      },
    }),
  );
  const lastAgentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
    prompt?: string;
  };
  // The rebuilt prompt keeps the media note, includes the extracted file
  // block exactly once, and never duplicates it.
  expect(lastAgentCall?.prompt).toContain("[media attached: /tmp/notes.txt (text/plain)]");
  expect(lastAgentCall?.prompt).toContain(fileBlock);
  expect(lastAgentCall?.prompt?.match(/<file\b/g)).toHaveLength(1);
});
it("preserves non-audio media lines when only audio is transcribed", async () => {
applyMediaUnderstandingMock.mockImplementationOnce(
async (params: { ctx: Record<string, unknown> }) => {

View File

@ -43,13 +43,16 @@ import type { TypingController } from "./typing.js";
// Body text substituted when a message carries media but no caption.
// NOTE(review): substitution site is outside this chunk — confirm at callers.
const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]";
// Prefix of the hint appended to prompts about replying with images.
const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool";
// Matches an ENTIRE synthetic "[media attached: …]" line: either the
// count-only form ("[media attached: N files]") or a detail form with an
// optional "i/N" index. Anchored with ^…$ and restricted to a single line
// ([^\r\n]*) so literal user text merely containing the token is untouched.
const LEADING_MEDIA_ATTACHED_LINE_RE =
/^(?:\[media attached: \d+ files\]|\[media attached(?: \d+\/\d+)?: [^\r\n]*\])$/;
// Detects an already-inlined "<file" extraction block (case-insensitive),
// used to avoid rebuilding a prompt that already embeds the file contents.
const FILE_BLOCK_RE = /<file\b/i;
function stripLeadingMediaAttachedLines(prompt: string): string {
const lines = prompt.split("\n");
let index = 0;
while (index < lines.length) {
const trimmed = lines[index]?.trim() ?? "";
if (!trimmed.startsWith("[media attached") || !trimmed.endsWith("]")) {
if (!LEADING_MEDIA_ATTACHED_LINE_RE.test(trimmed)) {
break;
}
index += 1;
@ -252,8 +255,14 @@ export function createFollowupRunner(params: {
);
if (hasMedia) {
try {
const mediaCtx = { ...queued.mediaContext } as MsgContext;
const originalBody = mediaCtx.Body;
const mediaCtx = {
...queued.mediaContext,
Body:
queued.mediaContext.CommandBody ??
queued.mediaContext.RawBody ??
queued.mediaContext.Body,
} as MsgContext;
const originalBody = queued.mediaContext.Body;
const muResult = await applyMediaUnderstanding({
ctx: mediaCtx,
cfg: queued.run.config,
@ -263,7 +272,10 @@ export function createFollowupRunner(params: {
model: queued.run.model,
},
});
if (muResult.outputs.length > 0 || muResult.appliedFile) {
const shouldRebuildPrompt =
muResult.outputs.length > 0 ||
(muResult.appliedFile && !FILE_BLOCK_RE.test(queued.prompt));
if (shouldRebuildPrompt) {
// Rebuild the queued prompt from the mutated media context so the
// deferred path matches the primary path's prompt shape.
const newMediaNote = buildInboundMediaNote(mediaCtx);
@ -273,7 +285,6 @@ export function createFollowupRunner(params: {
updatedBody: mediaCtx.Body,
mediaNote: newMediaNote,
});
logVerbose(
`followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage}, video=${muResult.appliedVideo}, file=${muResult.appliedFile})`,
);