From 6edb3b7e345573df8fe92485bd6d45e604de29b9 Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sat, 14 Mar 2026 15:29:55 -0400
Subject: [PATCH] fix queued media-understanding prompt rebuild

---
 src/auto-reply/reply/followup-runner.test.ts | 170 ++++++++++++++++++-
 src/auto-reply/reply/followup-runner.ts      | 116 ++++++++++---
 2 files changed, 260 insertions(+), 26 deletions(-)

diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts
index f0a060af4ac..71451526400 100644
--- a/src/auto-reply/reply/followup-runner.test.ts
+++ b/src/auto-reply/reply/followup-runner.test.ts
@@ -67,6 +67,9 @@ beforeEach(() => {
 const baseQueuedRun = (messageProvider = "whatsapp"): FollowupRun =>
   createMockFollowupRun({ run: { messageProvider } });
 
+const MEDIA_REPLY_HINT =
+  "To send an image back, prefer the message tool (media/path/filePath). If you must inline, use MEDIA:https://example.com/image.jpg (spaces ok, quote if needed) or a safe relative path like MEDIA:./image.jpg. Avoid absolute paths (MEDIA:/...) and ~ paths — they are blocked for security. Keep caption in the text body.";
+
 function createQueuedRun(
   overrides: Partial<Omit<FollowupRun, "run">> & {
     run?: Partial<FollowupRun["run"]>;
@@ -741,7 +744,7 @@ describe("createFollowupRunner media understanding", () => {
   it("applies audio transcription when mediaContext has untranscribed audio", async () => {
     const transcriptText = "Hello, this is a voice note.";
     // The real applyMediaUnderstanding mutates the ctx; the mock must do the same
-    // so buildInboundMediaNote sees MediaUnderstanding and suppresses the audio line.
+    // so buildInboundMediaNote and queued prompt rebuilding see the transcribed body.
     applyMediaUnderstandingMock.mockImplementationOnce(
       async (params: { ctx: Record<string, unknown> }) => {
         params.ctx.MediaUnderstanding = [
@@ -753,6 +756,7 @@ describe("createFollowupRunner media understanding", () => {
           },
         ];
         params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nUser text:\nsome text\nTranscript:\n${transcriptText}`;
         return {
           outputs: [
             {
@@ -929,6 +933,7 @@ describe("createFollowupRunner media understanding", () => {
           },
         ];
         params.ctx.Transcript = "voice transcript";
+        params.ctx.Body = "[Audio]\nUser text:\nsome text\nTranscript:\nvoice transcript";
         return {
           outputs: [
             {
@@ -980,4 +985,167 @@ describe("createFollowupRunner media understanding", () => {
     // The transcript should be present
     expect(agentCall?.prompt).toContain("voice transcript");
   });
+
+  it("strips queued media lines when attachment paths or URLs contain a literal closing bracket", async () => {
+    const transcriptText = "Bracket-safe transcript";
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nUser text:\nsome text\nTranscript:\n${transcriptText}`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "ok" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt:
+          "[media attached: /tmp/voice[0].ogg (audio/ogg) | https://cdn.example.com/files[0].ogg?sig=abc]123]\n" +
+          MEDIA_REPLY_HINT +
+          "\n" +
+          "some text",
+        mediaContext: {
+          Body: "some text",
+          MediaPaths: ["/tmp/voice[0].ogg"],
+          MediaUrls: ["https://cdn.example.com/files[0].ogg?sig=abc]123"],
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain(transcriptText);
+    expect(agentCall?.prompt).not.toContain("/tmp/voice[0].ogg");
+    expect(agentCall?.prompt).not.toContain("https://cdn.example.com/files[0].ogg?sig=abc]123");
+    expect(agentCall?.prompt).not.toContain(MEDIA_REPLY_HINT);
+  });
+
+  it("preserves file-only media understanding when outputs are empty", async () => {
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body =
+          '<file name="report.pdf" mime="application/pdf">\nQuarterly report body\n</file>';
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\n[User sent media without caption]`,
+        mediaContext: {
+          Body: "",
+          MediaPaths: ["/tmp/report.pdf"],
+          MediaTypes: ["application/pdf"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("[media attached: /tmp/report.pdf (application/pdf)]");
+    expect(agentCall?.prompt).toContain(MEDIA_REPLY_HINT);
+    expect(agentCall?.prompt).toContain('<file name="report.pdf" mime="application/pdf">');
+    expect(agentCall?.prompt).toContain("Quarterly report body");
+    expect(agentCall?.prompt).not.toContain("[User sent media without caption]");
+  });
+
+  it("replaces the queued body when inline directives were already stripped from the prompt", async () => {
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body =
+          '/think high summarize this\n\n<file name="report.pdf" mime="application/pdf">\nreport\n</file>';
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nsummarize this`,
+        mediaContext: {
+          Body: "/think high summarize this",
+          MediaPaths: ["/tmp/report.pdf"],
+          MediaTypes: ["application/pdf"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("summarize this");
+    expect(agentCall?.prompt).toContain('<file name="report.pdf" mime="application/pdf">');
+    expect(agentCall?.prompt).not.toContain("summarize this\n\n/think high summarize this");
+    expect(agentCall?.prompt).not.toContain("/think high summarize this");
+  });
 });
diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts
index 1e65380a020..ab2169fc09e 100644
--- a/src/auto-reply/reply/followup-runner.ts
+++ b/src/auto-reply/reply/followup-runner.ts
@@ -14,7 +14,6 @@ import type { TypingMode } from "../../config/types.js";
 import { logVerbose } from "../../globals.js";
 import { registerAgentRunContext } from "../../infra/agent-events.js";
 import { applyMediaUnderstanding } from "../../media-understanding/apply.js";
-import { formatMediaUnderstandingBody } from "../../media-understanding/format.js";
 import { defaultRuntime } from "../../runtime.js";
 import { isInternalMessageChannel } from "../../utils/message-channel.js";
 import { stripHeartbeatToken } from "../heartbeat.js";
@@ -23,6 +22,7 @@ import type { MsgContext, OriginatingChannelType } from "../templating.js";
 import { isSilentReplyText, SILENT_REPLY_TOKEN } from "../tokens.js";
 import type { GetReplyOptions, ReplyPayload } from "../types.js";
 import { resolveRunAuthProfile } from "./agent-runner-utils.js";
+import { parseInlineDirectives } from "./directive-handling.js";
 import {
   resolveOriginAccountId,
   resolveOriginMessageProvider,
@@ -41,6 +41,86 @@ import { incrementRunCompactionCount, persistRunSessionUsage } from "./session-r
 import { createTypingSignaler } from "./typing-mode.js";
 import type { TypingController } from "./typing.js";
 
+const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]";
+const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool";
+
+function stripLeadingMediaAttachedLines(prompt: string): string {
+  const lines = prompt.split("\n");
+  let index = 0;
+  while (index < lines.length) {
+    const trimmed = lines[index]?.trim() ?? "";
+    if (!trimmed.startsWith("[media attached") || !trimmed.endsWith("]")) {
+      break;
+    }
+    index += 1;
+  }
+  return lines.slice(index).join("\n").trim();
+}
+
+function stripLeadingMediaReplyHint(prompt: string): string {
+  const lines = prompt.split("\n");
+  if ((lines[0] ?? "").startsWith(MEDIA_REPLY_HINT_PREFIX)) {
+    return lines.slice(1).join("\n").trim();
+  }
+  return prompt.trim();
+}
+
+function replaceLastOccurrence(
+  value: string,
+  search: string,
+  replacement: string,
+): string | undefined {
+  if (!search) {
+    return undefined;
+  }
+  const index = value.lastIndexOf(search);
+  if (index < 0) {
+    return undefined;
+  }
+  return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`;
+}
+
+function stripInlineDirectives(text: string | undefined): string {
+  return parseInlineDirectives(text ?? "").cleaned.trim();
+}
+
+function rebuildQueuedPromptWithMediaUnderstanding(params: {
+  prompt: string;
+  originalBody?: string;
+  updatedBody?: string;
+  mediaNote?: string;
+}): string {
+  let stripped = stripLeadingMediaAttachedLines(params.prompt);
+  if (!params.mediaNote) {
+    stripped = stripLeadingMediaReplyHint(stripped);
+  }
+
+  const updatedBody = stripInlineDirectives(params.updatedBody);
+  if (!updatedBody) {
+    return [params.mediaNote?.trim(), stripped].filter(Boolean).join("\n").trim();
+  }
+
+  const replacementTargets = [
+    params.originalBody?.trim(),
+    stripInlineDirectives(params.originalBody),
+    MEDIA_ONLY_PLACEHOLDER,
+  ].filter(
+    (value, index, list): value is string => Boolean(value) && list.indexOf(value) === index,
+  );
+
+  let rebuilt = stripped;
+  for (const target of replacementTargets) {
+    const replaced = replaceLastOccurrence(rebuilt, target, updatedBody);
+    if (replaced !== undefined) {
+      rebuilt = replaced;
+      return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim();
+    }
+  }
+
+  rebuilt = [rebuilt, updatedBody].filter(Boolean).join("\n\n");
+  return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim();
+}
+
 export function createFollowupRunner(params: {
   opts?: GetReplyOptions;
   typing: TypingController;
@@ -173,6 +253,7 @@ export function createFollowupRunner(params: {
         if (hasMedia) {
           try {
             const mediaCtx = { ...queued.mediaContext } as MsgContext;
+            const originalBody = mediaCtx.Body;
             const muResult = await applyMediaUnderstanding({
               ctx: mediaCtx,
               cfg: queued.run.config,
@@ -182,34 +263,19 @@ export function createFollowupRunner(params: {
                 model: queued.run.model,
               },
             });
-            if (muResult.outputs.length > 0) {
-              // Rebuild the prompt with media understanding results baked in,
-              // matching the primary path's formatting.
+            if (muResult.outputs.length > 0 || muResult.appliedFile) {
+              // Rebuild the queued prompt from the mutated media context so the
+              // deferred path matches the primary path's prompt shape.
               const newMediaNote = buildInboundMediaNote(mediaCtx);
-              const transcriptBody = formatMediaUnderstandingBody({
-                body: undefined,
-                outputs: muResult.outputs,
+              queued.prompt = rebuildQueuedPromptWithMediaUnderstanding({
+                prompt: queued.prompt,
+                originalBody,
+                updatedBody: mediaCtx.Body,
+                mediaNote: newMediaNote,
               });
 
-              // Strip existing [media attached ...] lines from the prompt so
-              // they can be replaced by the updated media note (which excludes
-              // successfully-understood attachments like transcribed audio).
-              const stripped = queued.prompt
-                .replace(/\[media attached: \d+ files\]\n?/g, "")
-                .replace(/\[media attached[^\]]*\]\n?/g, "");
-
-              const parts: string[] = [];
-              if (newMediaNote) {
-                parts.push(newMediaNote);
-              }
-              if (transcriptBody) {
-                parts.push(transcriptBody);
-              }
-              parts.push(stripped.trim());
-              queued.prompt = parts.filter(Boolean).join("\n\n");
-
               logVerbose(
-                `followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage})`,
+                `followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage}, video=${muResult.appliedVideo}, file=${muResult.appliedFile})`,
               );
             }
           } catch (err) {