Auto-reply: preserve deferred media understanding output

This commit is contained in:
Joey Krug 2026-03-14 17:53:40 -04:00
parent 67e90527e1
commit 5e0330db6c
4 changed files with 424 additions and 8 deletions

View File

@@ -818,6 +818,69 @@ describe("createFollowupRunner media understanding", () => {
expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "Got it!" }));
});
it("applies media understanding for URL-only attachments", async () => {
const transcriptText = "URL-only transcript";
applyMediaUnderstandingMock.mockImplementationOnce(
async (params: { ctx: Record<string, unknown> }) => {
params.ctx.MediaUnderstanding = [
{
kind: "audio.transcription",
text: transcriptText,
attachmentIndex: 0,
provider: "whisper",
},
];
params.ctx.Transcript = transcriptText;
params.ctx.Body = `[Audio]\nUser text:\nsome text\nTranscript:\n${transcriptText}`;
return {
outputs: [
{
kind: "audio.transcription",
text: transcriptText,
attachmentIndex: 0,
provider: "whisper",
},
],
decisions: [],
appliedImage: false,
appliedAudio: true,
appliedVideo: false,
appliedFile: false,
};
},
);
runEmbeddedPiAgentMock.mockResolvedValueOnce({
payloads: [{ text: "Got it!" }],
meta: {},
});
const runner = createFollowupRunner({
opts: { onBlockReply: vi.fn(async () => {}) },
typing: createMockTypingController(),
typingMode: "instant",
defaultModel: "anthropic/claude-opus-4-5",
});
await runner(
createQueuedRun({
prompt: "[media attached: https://cdn.example.com/voice.ogg (audio/ogg)]\nsome text",
mediaContext: {
Body: "some text",
MediaUrl: "https://cdn.example.com/voice.ogg",
MediaUrls: ["https://cdn.example.com/voice.ogg"],
MediaType: "audio/ogg",
MediaTypes: ["audio/ogg"],
},
}),
);
expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1);
const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
prompt?: string;
};
expect(agentCall?.prompt).toContain(transcriptText);
});
it("strips the full media line when attachment paths or URLs contain brackets", async () => {
const transcriptText = "Bracket-safe transcript";
applyMediaUnderstandingMock.mockImplementationOnce(
@@ -1329,6 +1392,98 @@ describe("createFollowupRunner media understanding", () => {
expect(agentCall?.prompt).not.toContain("/think high summarize this");
});
it("preserves directive-like tokens inside extracted media content", async () => {
const fileBlock =
'<file name="notes.txt" mime="text/plain">\n/model claude-opus should stay\n/queue followup should stay\n</file>';
applyMediaUnderstandingMock.mockImplementationOnce(
async (params: { ctx: Record<string, unknown> }) => {
params.ctx.Body = `/think high summarize this\n\n${fileBlock}`;
return {
outputs: [],
decisions: [],
appliedImage: false,
appliedAudio: false,
appliedVideo: false,
appliedFile: true,
};
},
);
runEmbeddedPiAgentMock.mockResolvedValueOnce({
payloads: [{ text: "processed" }],
meta: {},
});
const runner = createFollowupRunner({
opts: { onBlockReply: vi.fn(async () => {}) },
typing: createMockTypingController(),
typingMode: "instant",
defaultModel: "anthropic/claude-opus-4-5",
});
await runner(
createQueuedRun({
prompt: `[media attached: /tmp/notes.txt]\n${MEDIA_REPLY_HINT}\nsummarize this`,
mediaContext: {
Body: "/think high summarize this",
MediaPaths: ["/tmp/notes.txt"],
MediaTypes: ["text/plain"],
},
}),
);
const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
prompt?: string;
};
expect(agentCall?.prompt).toContain("summarize this");
expect(agentCall?.prompt).not.toContain("/think high summarize this");
expect(agentCall?.prompt).toContain("/model claude-opus should stay");
expect(agentCall?.prompt).toContain("/queue followup should stay");
});
it("rebuilds the prompt when image understanding mutates the body without outputs", async () => {
const description = "[Image]\nDescription:\na mountain at sunset";
applyMediaUnderstandingMock.mockImplementationOnce(
async (params: { ctx: Record<string, unknown> }) => {
params.ctx.Body = description;
return {
outputs: [],
decisions: [],
appliedImage: true,
appliedAudio: false,
appliedVideo: false,
appliedFile: false,
};
},
);
runEmbeddedPiAgentMock.mockResolvedValueOnce({
payloads: [{ text: "processed" }],
meta: {},
});
const runner = createFollowupRunner({
opts: { onBlockReply: vi.fn(async () => {}) },
typing: createMockTypingController(),
typingMode: "instant",
defaultModel: "anthropic/claude-opus-4-5",
});
await runner(
createQueuedRun({
prompt: "[media attached: /tmp/photo.jpg (image/jpeg)]\nsome text",
mediaContext: {
Body: "some text",
MediaPaths: ["/tmp/photo.jpg"],
MediaTypes: ["image/jpeg"],
},
}),
);
const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
prompt?: string;
};
expect(agentCall?.prompt).toContain("a mountain at sunset");
});
it("does not false-positive on user text containing literal '<file' when extracting files", async () => {
const fileBlock = '<file name="data.csv" mime="text/csv">\ncol1,col2\n1,2\n</file>';
applyMediaUnderstandingMock.mockImplementationOnce(
@@ -1360,8 +1515,7 @@ describe("createFollowupRunner media understanding", () => {
// file extraction results from being embedded in the prompt.
await runner(
createQueuedRun({
prompt:
"[media attached: /tmp/data.csv (text/csv)]\ncheck my <file upload please",
prompt: "[media attached: /tmp/data.csv (text/csv)]\ncheck my <file upload please",
mediaContext: {
Body: "check my <file upload please",
CommandBody: "check my <file upload please",
@@ -1380,6 +1534,154 @@ describe("createFollowupRunner media understanding", () => {
expect(agentCall?.prompt).toContain("check my <file upload please");
});
it("preserves directive-like text that appears inside extracted file content", async () => {
const fileBlock =
'<file name="notes.txt" mime="text/plain">\nRun `/think high` literally in the shell example.\n</file>';
applyMediaUnderstandingMock.mockImplementationOnce(
async (params: { ctx: Record<string, unknown> }) => {
params.ctx.Body = `summarize this\n\n${fileBlock}`;
return {
outputs: [],
decisions: [],
appliedImage: false,
appliedAudio: false,
appliedVideo: false,
appliedFile: true,
};
},
);
runEmbeddedPiAgentMock.mockResolvedValueOnce({
payloads: [{ text: "processed" }],
meta: {},
});
const runner = createFollowupRunner({
opts: { onBlockReply: vi.fn(async () => {}) },
typing: createMockTypingController(),
typingMode: "instant",
defaultModel: "anthropic/claude-opus-4-5",
});
await runner(
createQueuedRun({
prompt: `[media attached: /tmp/notes.txt]\n${MEDIA_REPLY_HINT}\nsummarize this`,
mediaContext: {
Body: "/think high summarize this",
CommandBody: "summarize this",
RawBody: "/think high summarize this",
MediaPaths: ["/tmp/notes.txt"],
MediaTypes: ["text/plain"],
},
}),
);
const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
prompt?: string;
};
expect(agentCall?.prompt).toContain("summarize this");
expect(agentCall?.prompt).toContain("Run `/think high` literally in the shell example.");
});
it("rebuilds the prompt when image understanding mutates the body without outputs", async () => {
applyMediaUnderstandingMock.mockImplementationOnce(
async (params: { ctx: Record<string, unknown> }) => {
params.ctx.Body = "some text\n\n[Image summary]\nA whiteboard with action items.";
return {
outputs: [],
decisions: [],
appliedImage: true,
appliedAudio: false,
appliedVideo: false,
appliedFile: false,
};
},
);
runEmbeddedPiAgentMock.mockResolvedValueOnce({
payloads: [{ text: "processed" }],
meta: {},
});
const runner = createFollowupRunner({
opts: { onBlockReply: vi.fn(async () => {}) },
typing: createMockTypingController(),
typingMode: "instant",
defaultModel: "anthropic/claude-opus-4-5",
});
await runner(
createQueuedRun({
prompt: "[media attached: /tmp/board.jpg (image/jpeg)]\nsome text",
mediaContext: {
Body: "some text",
CommandBody: "some text",
RawBody: "some text",
MediaPaths: ["/tmp/board.jpg"],
MediaTypes: ["image/jpeg"],
},
}),
);
const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
prompt?: string;
};
expect(agentCall?.prompt).toContain("[Image summary]");
expect(agentCall?.prompt).toContain("A whiteboard with action items.");
});
it("applies media understanding for URL-only deferred attachments", async () => {
applyMediaUnderstandingMock.mockImplementationOnce(
async (params: { ctx: Record<string, unknown> }) => {
params.ctx.Body = "[Audio]\nTranscript:\nremote transcript";
params.ctx.Transcript = "remote transcript";
return {
outputs: [
{
kind: "audio.transcription",
text: "remote transcript",
attachmentIndex: 0,
provider: "whisper",
},
],
decisions: [],
appliedImage: false,
appliedAudio: true,
appliedVideo: false,
appliedFile: false,
};
},
);
runEmbeddedPiAgentMock.mockResolvedValueOnce({
payloads: [{ text: "processed" }],
meta: {},
});
const runner = createFollowupRunner({
opts: { onBlockReply: vi.fn(async () => {}) },
typing: createMockTypingController(),
typingMode: "instant",
defaultModel: "anthropic/claude-opus-4-5",
});
await runner(
createQueuedRun({
prompt: "[User sent media without caption]",
mediaContext: {
Body: "",
MediaUrl: "https://cdn.example.com/audio.ogg",
MediaUrls: ["https://cdn.example.com/audio.ogg"],
MediaType: "audio/ogg",
MediaTypes: ["audio/ogg"],
},
}),
);
expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1);
const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
prompt?: string;
};
expect(agentCall?.prompt).toContain("remote transcript");
});
it("uses resolved body (CommandBody) as originalBody for accurate prompt replacement", async () => {
const fileBlock = '<file name="report.pdf" mime="application/pdf">\nreport content\n</file>';
applyMediaUnderstandingMock.mockImplementationOnce(

View File

@@ -43,8 +43,7 @@ import type { TypingController } from "./typing.js";
const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]";
const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool";
const LEADING_MEDIA_ATTACHED_LINE_RE =
/^(?:\[media attached: \d+ files\]|\[media attached(?: \d+\/\d+)?: [^\r\n]*\])$/;
const LEADING_MEDIA_ATTACHED_LINE_RE = /^\[media attached(?: \d+\/\d+)?: [^\r\n]*\]$/;
const FILE_BLOCK_RE = /<file\s+name="/i;
function stripLeadingMediaAttachedLines(prompt: string): string {
@@ -87,6 +86,28 @@ function stripInlineDirectives(text: string | undefined): string {
return parseInlineDirectives(text ?? "").cleaned.trim();
}
// Normalizes a media-understanding-mutated body for prompt rebuilding:
// inline directives (e.g. "/think high") are stripped from the portion that
// was the ORIGINAL user text, while the rest of the updated body — typically
// extracted media content appended around it — is left untouched.
// Returns "" when there is no meaningful updated body.
function normalizeUpdatedBody(params: { originalBody?: string; updatedBody?: string }): string {
const updatedBody = params.updatedBody?.trim();
if (!updatedBody) {
return "";
}
const originalBody = params.originalBody?.trim();
if (!originalBody) {
// No original to locate — nothing to selectively strip.
return updatedBody;
}
const cleanedOriginalBody = stripInlineDirectives(originalBody);
if (!cleanedOriginalBody) {
// Original was directives only; keep the updated body as-is.
return updatedBody;
}
if (updatedBody === originalBody) {
// Understanding did not change the body: just return the cleaned text.
return cleanedOriginalBody;
}
// Replace only one occurrence of the original text inside the updated body
// (the last one — presumably to avoid touching earlier extracted content
// that happens to repeat the user text; TODO confirm intent). Falls back to
// the updated body unchanged when the original no longer appears in it.
return (
replaceLastOccurrence(updatedBody, originalBody, cleanedOriginalBody) ?? updatedBody
).trim();
}
function rebuildQueuedPromptWithMediaUnderstanding(params: {
prompt: string;
originalBody?: string;
@@ -98,7 +119,10 @@ function rebuildQueuedPromptWithMediaUnderstanding(params: {
stripped = stripLeadingMediaReplyHint(stripped);
}
const updatedBody = stripInlineDirectives(params.updatedBody);
const updatedBody = normalizeUpdatedBody({
originalBody: params.originalBody,
updatedBody: params.updatedBody,
});
if (!updatedBody) {
return [params.mediaNote?.trim(), stripped].filter(Boolean).join("\n").trim();
}
@@ -250,8 +274,11 @@ export function createFollowupRunner(params: {
if (queued.mediaContext && !queued.mediaContext.MediaUnderstanding?.length) {
const hasMedia = Boolean(
queued.mediaContext.MediaPath?.trim() ||
queued.mediaContext.MediaUrl?.trim() ||
(Array.isArray(queued.mediaContext.MediaPaths) &&
queued.mediaContext.MediaPaths.length > 0),
queued.mediaContext.MediaPaths.length > 0) ||
(Array.isArray(queued.mediaContext.MediaUrls) &&
queued.mediaContext.MediaUrls.length > 0),
);
if (hasMedia) {
try {
@@ -281,6 +308,9 @@ export function createFollowupRunner(params: {
});
const shouldRebuildPrompt =
muResult.outputs.length > 0 ||
muResult.appliedAudio ||
muResult.appliedImage ||
muResult.appliedVideo ||
(muResult.appliedFile && !bodyAlreadyHasFileBlock);
if (shouldRebuildPrompt) {
// Rebuild the queued prompt from the mutated media context so the

View File

@@ -172,6 +172,45 @@ describe("runPreparedReply media-only handling", () => {
expect(call?.followupRun.prompt).toContain("[User sent media without caption]");
});
it("snapshots URL-only attachments into followup mediaContext", async () => {
await runPreparedReply(
baseParams({
ctx: {
Body: "check this attachment",
RawBody: "check this attachment",
CommandBody: "check this attachment",
ThreadHistoryBody: "Earlier message in this thread",
OriginatingChannel: "slack",
OriginatingTo: "C123",
ChatType: "group",
MediaUrl: "https://cdn.example.com/input.png",
MediaUrls: ["https://cdn.example.com/input.png"],
MediaType: "image/png",
MediaTypes: ["image/png"],
},
sessionCtx: {
Body: "check this attachment",
BodyStripped: "check this attachment",
ThreadHistoryBody: "Earlier message in this thread",
Provider: "slack",
ChatType: "group",
OriginatingChannel: "slack",
OriginatingTo: "C123",
},
}),
);
const call = vi.mocked(runReplyAgent).mock.calls[0]?.[0];
expect(call?.followupRun.mediaContext).toEqual(
expect.objectContaining({
MediaUrl: "https://cdn.example.com/input.png",
MediaUrls: ["https://cdn.example.com/input.png"],
MediaType: "image/png",
MediaTypes: ["image/png"],
}),
);
});
it("keeps thread history context on follow-up turns", async () => {
const result = await runPreparedReply(
baseParams({
@@ -186,6 +225,41 @@ describe("runPreparedReply media-only handling", () => {
expect(call?.followupRun.prompt).toContain("Earlier message in this thread");
});
it("snapshots mediaContext for URL-only deferred attachments", async () => {
await runPreparedReply(
baseParams({
ctx: {
Body: "",
RawBody: "",
CommandBody: "",
MediaUrl: "https://cdn.example.com/audio.ogg",
MediaUrls: ["https://cdn.example.com/audio.ogg"],
MediaType: "audio/ogg",
MediaTypes: ["audio/ogg"],
ThreadHistoryBody: "Earlier message in this thread",
OriginatingChannel: "slack",
OriginatingTo: "C123",
ChatType: "group",
},
sessionCtx: {
Body: "",
BodyStripped: "",
ThreadHistoryBody: "Earlier message in this thread",
Provider: "slack",
ChatType: "group",
OriginatingChannel: "slack",
OriginatingTo: "C123",
},
}),
);
const call = vi.mocked(runReplyAgent).mock.calls[0]?.[0];
expect(call?.followupRun.mediaContext?.MediaUrl).toBe("https://cdn.example.com/audio.ogg");
expect(call?.followupRun.mediaContext?.MediaUrls).toEqual([
"https://cdn.example.com/audio.ogg",
]);
});
it("returns the empty-body reply when there is no text and no media", async () => {
const result = await runPreparedReply(
baseParams({

View File

@@ -310,7 +310,14 @@ export async function runPreparedReply(
: [inboundUserContext, baseBodyFinal].filter(Boolean).join("\n\n");
const baseBodyTrimmed = baseBodyForPrompt.trim();
const hasMediaAttachment = Boolean(
sessionCtx.MediaPath || (sessionCtx.MediaPaths && sessionCtx.MediaPaths.length > 0),
sessionCtx.MediaPath ||
sessionCtx.MediaUrl ||
(sessionCtx.MediaPaths && sessionCtx.MediaPaths.length > 0) ||
(sessionCtx.MediaUrls && sessionCtx.MediaUrls.length > 0) ||
ctx.MediaPath?.trim() ||
ctx.MediaUrl?.trim() ||
(Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0) ||
(Array.isArray(ctx.MediaUrls) && ctx.MediaUrls.length > 0),
);
if (!baseBodyTrimmed && !hasMediaAttachment) {
await typing.onReplyStart();
@@ -476,7 +483,10 @@ export async function runPreparedReply(
// followup runner. When MediaUnderstanding is already populated the runner
// knows transcription already succeeded and skips re-application.
const hasMediaAttachments = Boolean(
ctx.MediaPath?.trim() || (Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0),
ctx.MediaPath?.trim() ||
ctx.MediaUrl?.trim() ||
(Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0) ||
(Array.isArray(ctx.MediaUrls) && ctx.MediaUrls.length > 0),
);
const mediaContext = hasMediaAttachments
? {