From 3bf55561cbb972df5c122da75ce7a26e7b947c8b Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sat, 14 Mar 2026 13:40:52 -0400
Subject: [PATCH 01/17] fix: apply media understanding to followup-queued
 messages (#44682)

Voice notes arriving while the agent is mid-turn were queued as
followup messages without audio transcription. The followup runner
called runEmbeddedPiAgent directly, bypassing applyMediaUnderstanding.

This adds a mediaContext field to FollowupRun that snapshots the
original message's media fields. Before the agent run, the followup
runner checks whether media understanding was applied. If not (empty
MediaUnderstanding), it calls applyMediaUnderstanding and rebuilds the
prompt with the transcript, matching the primary path's formatting.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/auto-reply/reply/followup-runner.test.ts | 285 ++++++++++++++++++-
 src/auto-reply/reply/followup-runner.ts      |  65 ++++-
 src/auto-reply/reply/get-reply-run.ts        |  34 ++-
 src/auto-reply/reply/queue.ts                |   5 +-
 src/auto-reply/reply/queue/types.ts          |  37 +++
 5 files changed, 416 insertions(+), 10 deletions(-)
diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts
index 0e93ab156a8..f0a060af4ac 100644
--- a/src/auto-reply/reply/followup-runner.test.ts
+++ b/src/auto-reply/reply/followup-runner.test.ts
@@ -2,7 +2,7 @@ import fs from "node:fs/promises";
 import { tmpdir } from "node:os";
 import path from "node:path";
 import { beforeEach, describe, expect, it, vi } from "vitest";
-import { loadSessionStore, saveSessionStore, type SessionEntry } from "../../config/sessions.js";
+import { loadSessionStore, type SessionEntry, saveSessionStore } from "../../config/sessions.js";
 import type { FollowupRun } from "./queue.js";
 import * as sessionRunAccounting from "./session-run-accounting.js";
 import { createMockFollowupRun, createMockTypingController } from "./test-helpers.js";
@@ -10,6 +10,7 @@ import { createMockFollowupRun, createMockTypingController } from "./test-helper
 const runEmbeddedPiAgentMock = vi.fn();
 const routeReplyMock = vi.fn();
 const isRoutableChannelMock = vi.fn();
+const applyMediaUnderstandingMock = vi.fn();
 
 vi.mock(
   "../../agents/model-fallback.js",
@@ -20,6 +21,10 @@ vi.mock("../../agents/pi-embedded.js", () => ({
   runEmbeddedPiAgent: (params: unknown) => runEmbeddedPiAgentMock(params),
 }));
 
+vi.mock("../../media-understanding/apply.js", () => ({
+  applyMediaUnderstanding: (params: unknown) => applyMediaUnderstandingMock(params),
+}));
+
 vi.mock("./route-reply.js", async (importOriginal) => {
   const actual = await importOriginal<typeof import("./route-reply.js")>();
   return {
@@ -48,13 +53,24 @@ beforeEach(() => {
   isRoutableChannelMock.mockImplementation((ch: string | undefined) =>
     Boolean(ch?.trim() && ROUTABLE_TEST_CHANNELS.has(ch.trim().toLowerCase())),
   );
+  applyMediaUnderstandingMock.mockReset();
+  applyMediaUnderstandingMock.mockResolvedValue({
+    outputs: [],
+    decisions: [],
+    appliedImage: false,
+    appliedAudio: false,
+    appliedVideo: false,
+    appliedFile: false,
+  });
 });
 
 const baseQueuedRun = (messageProvider = "whatsapp"): FollowupRun =>
   createMockFollowupRun({ run: { messageProvider } });
 
 function createQueuedRun(
-  overrides: Partial<Omit<FollowupRun, "run">> & { run?: Partial<FollowupRun["run"]> } = {},
+  overrides: Partial<Omit<FollowupRun, "run">> & {
+    run?: Partial<FollowupRun["run"]>;
+  } = {},
 ): FollowupRun {
   return createMockFollowupRun(overrides);
 }
@@ -401,7 +417,12 @@ describe("createFollowupRunner messaging tool dedupe", () => {
       agentResult: {
         ...makeTextReplyDedupeResult(),
         messagingToolSentTargets: [
-          { tool: "telegram", provider: "telegram", to: "268300329", accountId: "work" },
+          {
+            tool: "telegram",
+            provider: "telegram",
+            to: "268300329",
+            accountId: "work",
+          },
         ],
       },
       queued: {
@@ -451,8 +472,13 @@ describe("createFollowupRunner messaging tool dedupe", () => {
       "sessions.json",
     );
     const sessionKey = "main";
-    const sessionEntry: SessionEntry = { sessionId: "session", updatedAt: Date.now() };
-    const sessionStore: Record<string, SessionEntry> = { [sessionKey]: sessionEntry };
+    const sessionEntry: SessionEntry = {
+      sessionId: "session",
+      updatedAt: Date.now(),
+    };
+    const sessionStore: Record<string, SessionEntry> = {
+      [sessionKey]: sessionEntry,
+    };
     await saveSessionStore(storePath, sessionStore);
 
     const { onBlockReply } = await runMessagingCase({
@@ -704,7 +730,254 @@ describe("createFollowupRunner agentDir forwarding", () => {
     });
 
     expect(runEmbeddedPiAgentMock).toHaveBeenCalledTimes(1);
-    const call = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { agentDir?: string };
+    const call = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      agentDir?: string;
+    };
     expect(call?.agentDir).toBe(agentDir);
   });
 });
+
+describe("createFollowupRunner media understanding", () => {
+  it("applies audio transcription when mediaContext has untranscribed audio", async () => {
+    const transcriptText = "Hello, this is a voice note.";
+    // The real applyMediaUnderstanding mutates the ctx; the mock must do the same
+    // so buildInboundMediaNote sees MediaUnderstanding and suppresses the audio line.
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "Got it!" }],
+      meta: {},
+    });
+
+    const onBlockReply = vi.fn(async () => {});
+    const runner = createFollowupRunner({
+      opts: { onBlockReply },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    const queued = createQueuedRun({
+      prompt: "[media attached: /tmp/voice.ogg (audio/ogg)]\nsome text",
+      mediaContext: {
+        Body: "some text",
+        MediaPaths: ["/tmp/voice.ogg"],
+        MediaTypes: ["audio/ogg"],
+        // MediaUnderstanding is empty — transcription not yet applied
+      },
+    });
+    await runner(queued);
+
+    // applyMediaUnderstanding should have been called
+    expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1);
+    expect(applyMediaUnderstandingMock).toHaveBeenCalledWith(
+      expect.objectContaining({
+        cfg: queued.run.config,
+        agentDir: queued.run.agentDir,
+      }),
+    );
+
+    // The prompt passed to the agent should include the transcript, not the
+    // raw audio attachment line.
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain(transcriptText);
+    expect(agentCall?.prompt).not.toContain("[media attached: /tmp/voice.ogg");
+
+    expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "Got it!" }));
+  });
+
+  it("skips media understanding when MediaUnderstanding is already populated", async () => {
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "reply" }],
+      meta: {},
+    });
+
+    const onBlockReply = vi.fn(async () => {});
+    const runner = createFollowupRunner({
+      opts: { onBlockReply },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    const queued = createQueuedRun({
+      prompt: "[Audio]\nTranscript:\nAlready transcribed.\n\nsome text",
+      mediaContext: {
+        Body: "some text",
+        MediaPaths: ["/tmp/voice.ogg"],
+        MediaTypes: ["audio/ogg"],
+        // MediaUnderstanding already populated — transcription was applied in primary path
+        MediaUnderstanding: [
+          {
+            kind: "audio.transcription",
+            text: "Already transcribed.",
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ],
+      },
+    });
+    await runner(queued);
+
+    // Should NOT re-run media understanding
+    expect(applyMediaUnderstandingMock).not.toHaveBeenCalled();
+
+    // The original prompt should be passed through unchanged
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("Already transcribed.");
+  });
+
+  it("skips media understanding when no mediaContext is present", async () => {
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "reply" }],
+      meta: {},
+    });
+
+    const onBlockReply = vi.fn(async () => {});
+    const runner = createFollowupRunner({
+      opts: { onBlockReply },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    // No mediaContext (plain text message)
+    const queued = createQueuedRun({ prompt: "just text" });
+    await runner(queued);
+
+    expect(applyMediaUnderstandingMock).not.toHaveBeenCalled();
+  });
+
+  it("continues with raw prompt when media understanding fails", async () => {
+    applyMediaUnderstandingMock.mockRejectedValueOnce(new Error("transcription service down"));
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "fallback reply" }],
+      meta: {},
+    });
+
+    const onBlockReply = vi.fn(async () => {});
+    const runner = createFollowupRunner({
+      opts: { onBlockReply },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    const originalPrompt = "[media attached: /tmp/voice.ogg (audio/ogg)]\nsome text";
+    const queued = createQueuedRun({
+      prompt: originalPrompt,
+      mediaContext: {
+        Body: "some text",
+        MediaPaths: ["/tmp/voice.ogg"],
+        MediaTypes: ["audio/ogg"],
+      },
+    });
+    await runner(queued);
+
+    // Should have attempted media understanding
+    expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1);
+
+    // Agent should still run with the original prompt
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toBe(originalPrompt);
+
+    expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "fallback reply" }));
+  });
+
+  it("preserves non-audio media lines when only audio is transcribed", async () => {
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        // Simulate transcription updating the context
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: "voice transcript",
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = "voice transcript";
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: "voice transcript",
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "got both" }],
+      meta: {},
+    });
+
+    const onBlockReply = vi.fn(async () => {});
+    const runner = createFollowupRunner({
+      opts: { onBlockReply },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    const queued = createQueuedRun({
+      prompt:
+        "[media attached: 2 files]\n[media attached 1/2: /tmp/voice.ogg (audio/ogg)]\n[media attached 2/2: /tmp/photo.jpg (image/jpeg)]\nsome text",
+      mediaContext: {
+        Body: "some text",
+        MediaPaths: ["/tmp/voice.ogg", "/tmp/photo.jpg"],
+        MediaTypes: ["audio/ogg", "image/jpeg"],
+      },
+    });
+    await runner(queued);
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    // Audio attachment line should be stripped
+    expect(agentCall?.prompt).not.toContain("voice.ogg");
+    // Image attachment line should also be stripped (all media-attached lines are
+    // removed and replaced by the new buildInboundMediaNote output)
+    // The transcript should be present
+    expect(agentCall?.prompt).toContain("voice transcript");
+  });
+});
diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts
index 2fd21607095..1e65380a020 100644
--- a/src/auto-reply/reply/followup-runner.ts
+++ b/src/auto-reply/reply/followup-runner.ts
@@ -13,10 +13,13 @@ import type { SessionEntry } from "../../config/sessions.js";
 import type { TypingMode } from "../../config/types.js";
 import { logVerbose } from "../../globals.js";
 import { registerAgentRunContext } from "../../infra/agent-events.js";
+import { applyMediaUnderstanding } from "../../media-understanding/apply.js";
+import { formatMediaUnderstandingBody } from "../../media-understanding/format.js";
 import { defaultRuntime } from "../../runtime.js";
 import { isInternalMessageChannel } from "../../utils/message-channel.js";
 import { stripHeartbeatToken } from "../heartbeat.js";
-import type { OriginatingChannelType } from "../templating.js";
+import { buildInboundMediaNote } from "../media-note.js";
+import type { MsgContext, OriginatingChannelType } from "../templating.js";
 import { isSilentReplyText, SILENT_REPLY_TOKEN } from "../tokens.js";
 import type { GetReplyOptions, ReplyPayload } from "../types.js";
 import { resolveRunAuthProfile } from "./agent-runner-utils.js";
@@ -157,6 +160,66 @@ export function createFollowupRunner(params: {
       let bootstrapPromptWarningSignaturesSeen = resolveBootstrapWarningSignaturesSeen(
         activeSessionEntry?.systemPromptReport,
       );
+
+      // Apply media understanding for followup-queued messages when it was
+      // not applied (or failed) in the primary path.  This ensures voice
+      // notes that arrived while the agent was mid-turn still get transcribed.
+      if (queued.mediaContext && !queued.mediaContext.MediaUnderstanding?.length) {
+        const hasMedia = Boolean(
+          queued.mediaContext.MediaPath?.trim() ||
+          (Array.isArray(queued.mediaContext.MediaPaths) &&
+            queued.mediaContext.MediaPaths.length > 0),
+        );
+        if (hasMedia) {
+          try {
+            const mediaCtx = { ...queued.mediaContext } as MsgContext;
+            const muResult = await applyMediaUnderstanding({
+              ctx: mediaCtx,
+              cfg: queued.run.config,
+              agentDir: queued.run.agentDir,
+              activeModel: {
+                provider: queued.run.provider,
+                model: queued.run.model,
+              },
+            });
+            if (muResult.outputs.length > 0) {
+              // Rebuild the prompt with media understanding results baked in,
+              // matching the primary path's formatting.
+              const newMediaNote = buildInboundMediaNote(mediaCtx);
+              const transcriptBody = formatMediaUnderstandingBody({
+                body: undefined,
+                outputs: muResult.outputs,
+              });
+
+              // Strip existing [media attached ...] lines from the prompt so
+              // they can be replaced by the updated media note (which excludes
+              // successfully-understood attachments like transcribed audio).
+              const stripped = queued.prompt
+                .replace(/\[media attached: \d+ files\]\n?/g, "")
+                .replace(/\[media attached[^\]]*\]\n?/g, "");
+
+              const parts: string[] = [];
+              if (newMediaNote) {
+                parts.push(newMediaNote);
+              }
+              if (transcriptBody) {
+                parts.push(transcriptBody);
+              }
+              parts.push(stripped.trim());
+              queued.prompt = parts.filter(Boolean).join("\n\n");
+
+              logVerbose(
+                `followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage})`,
+              );
+            }
+          } catch (err) {
+            logVerbose(
+              `followup: media understanding failed, proceeding with raw content: ${err instanceof Error ? err.message : String(err)}`,
+            );
+          }
+        }
+      }
+
       try {
         const fallbackResult = await runWithModelFallback({
           cfg: queued.run.config,
diff --git a/src/auto-reply/reply/get-reply-run.ts b/src/auto-reply/reply/get-reply-run.ts
index c8451fd88f6..fe87e3919d0 100644
--- a/src/auto-reply/reply/get-reply-run.ts
+++ b/src/auto-reply/reply/get-reply-run.ts
@@ -387,7 +387,7 @@ export async function runPreparedReply(
   const mediaReplyHint = mediaNote
     ? "To send an image back, prefer the message tool (media/path/filePath). If you must inline, use MEDIA:https://example.com/image.jpg (spaces ok, quote if needed) or a safe relative path like MEDIA:./image.jpg. Avoid absolute paths (MEDIA:/...) and ~ paths — they are blocked for security. Keep caption in the text body."
     : undefined;
-  let prefixedCommandBody = mediaNote
+  const prefixedCommandBody = mediaNote
     ? [mediaNote, mediaReplyHint, prefixedBody ?? ""].filter(Boolean).join("\n").trim()
     : prefixedBody;
   if (!resolvedThinkLevel) {
@@ -472,11 +472,43 @@ export async function runPreparedReply(
     isNewSession,
   });
   const authProfileIdSource = sessionEntry?.authProfileOverrideSource;
+  // Snapshot media-related context for deferred media understanding in the
+  // followup runner.  When MediaUnderstanding is already populated the runner
+  // knows transcription already succeeded and skips re-application.
+  const hasMediaAttachments = Boolean(
+    ctx.MediaPath?.trim() || (Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0),
+  );
+  const mediaContext = hasMediaAttachments
+    ? {
+        Body: ctx.Body,
+        CommandBody: ctx.CommandBody,
+        RawBody: ctx.RawBody,
+        MediaPath: ctx.MediaPath,
+        MediaUrl: ctx.MediaUrl,
+        MediaType: ctx.MediaType,
+        MediaDir: ctx.MediaDir,
+        MediaPaths: ctx.MediaPaths ? [...ctx.MediaPaths] : undefined,
+        MediaUrls: ctx.MediaUrls ? [...ctx.MediaUrls] : undefined,
+        MediaTypes: ctx.MediaTypes ? [...ctx.MediaTypes] : undefined,
+        MediaRemoteHost: ctx.MediaRemoteHost,
+        Transcript: ctx.Transcript,
+        MediaUnderstanding: ctx.MediaUnderstanding ? [...ctx.MediaUnderstanding] : undefined,
+        MediaUnderstandingDecisions: ctx.MediaUnderstandingDecisions
+          ? [...ctx.MediaUnderstandingDecisions]
+          : undefined,
+        OriginatingChannel: ctx.OriginatingChannel,
+        OriginatingTo: ctx.OriginatingTo,
+        AccountId: ctx.AccountId,
+        MessageThreadId: ctx.MessageThreadId,
+      }
+    : undefined;
+
   const followupRun = {
     prompt: queuedBody,
     messageId: sessionCtx.MessageSidFull ?? sessionCtx.MessageSid,
     summaryLine: baseBodyTrimmedRaw,
     enqueuedAt: Date.now(),
+    mediaContext,
     // Originating channel for reply routing.
     originatingChannel: ctx.OriginatingChannel,
     originatingTo: ctx.OriginatingTo,
diff --git a/src/auto-reply/reply/queue.ts b/src/auto-reply/reply/queue.ts
index b097b6c5193..5c3a56f64af 100644
--- a/src/auto-reply/reply/queue.ts
+++ b/src/auto-reply/reply/queue.ts
@@ -1,6 +1,6 @@
-export { extractQueueDirective } from "./queue/directive.js";
-export { clearSessionQueues } from "./queue/cleanup.js";
 export type { ClearSessionQueueResult } from "./queue/cleanup.js";
+export { clearSessionQueues } from "./queue/cleanup.js";
+export { extractQueueDirective } from "./queue/directive.js";
 export { scheduleFollowupDrain } from "./queue/drain.js";
 export {
   enqueueFollowupRun,
@@ -10,6 +10,7 @@ export {
 export { resolveQueueSettings } from "./queue/settings.js";
 export { clearFollowupQueue } from "./queue/state.js";
 export type {
+  FollowupMediaContext,
   FollowupRun,
   QueueDedupeMode,
   QueueDropPolicy,
diff --git a/src/auto-reply/reply/queue/types.ts b/src/auto-reply/reply/queue/types.ts
index 507f77d499d..637ceb0a149 100644
--- a/src/auto-reply/reply/queue/types.ts
+++ b/src/auto-reply/reply/queue/types.ts
@@ -2,6 +2,10 @@ import type { ExecToolDefaults } from "../../../agents/bash-tools.js";
 import type { SkillSnapshot } from "../../../agents/skills.js";
 import type { OpenClawConfig } from "../../../config/config.js";
 import type { SessionEntry } from "../../../config/sessions.js";
+import type {
+  MediaUnderstandingDecision,
+  MediaUnderstandingOutput,
+} from "../../../media-understanding/types.js";
 import type { InputProvenance } from "../../../sessions/input-provenance.js";
 import type { OriginatingChannelType } from "../../templating.js";
 import type { ElevatedLevel, ReasoningLevel, ThinkLevel, VerboseLevel } from "../directives.js";
@@ -19,12 +23,45 @@ export type QueueSettings = {
 
 export type QueueDedupeMode = "message-id" | "prompt" | "none";
 
+/**
+ * Snapshot of media-related context fields carried on a FollowupRun so that
+ * the followup runner can apply media understanding (e.g. voice-note
+ * transcription) when it was not applied — or failed — in the primary path.
+ */
+export type FollowupMediaContext = {
+  Body?: string;
+  CommandBody?: string;
+  RawBody?: string;
+  MediaPath?: string;
+  MediaUrl?: string;
+  MediaType?: string;
+  MediaDir?: string;
+  MediaPaths?: string[];
+  MediaUrls?: string[];
+  MediaTypes?: string[];
+  MediaRemoteHost?: string;
+  Transcript?: string;
+  MediaUnderstanding?: MediaUnderstandingOutput[];
+  MediaUnderstandingDecisions?: MediaUnderstandingDecision[];
+  OriginatingChannel?: OriginatingChannelType;
+  OriginatingTo?: string;
+  AccountId?: string;
+  MessageThreadId?: string | number;
+};
+
 export type FollowupRun = {
   prompt: string;
   /** Provider message ID, when available (for deduplication). */
   messageId?: string;
   summaryLine?: string;
   enqueuedAt: number;
+  /**
+   * Media context snapshot from the original inbound message.
+   * When present and MediaUnderstanding is empty, the followup runner will
+   * attempt to apply media understanding (audio transcription, etc.) before
+   * passing the prompt to the agent.
+   */
+  mediaContext?: FollowupMediaContext;
   /**
    * Originating channel for reply routing.
    * When set, replies should be routed back to this provider

From 6edb3b7e345573df8fe92485bd6d45e604de29b9 Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sat, 14 Mar 2026 15:29:55 -0400
Subject: [PATCH 02/17] fix queued media-understanding prompt rebuild

---
 src/auto-reply/reply/followup-runner.test.ts | 170 ++++++++++++++++++-
 src/auto-reply/reply/followup-runner.ts      | 116 ++++++++++---
 2 files changed, 260 insertions(+), 26 deletions(-)

diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts
index f0a060af4ac..71451526400 100644
--- a/src/auto-reply/reply/followup-runner.test.ts
+++ b/src/auto-reply/reply/followup-runner.test.ts
@@ -67,6 +67,9 @@ beforeEach(() => {
 const baseQueuedRun = (messageProvider = "whatsapp"): FollowupRun =>
   createMockFollowupRun({ run: { messageProvider } });
 
+const MEDIA_REPLY_HINT =
+  "To send an image back, prefer the message tool (media/path/filePath). If you must inline, use MEDIA:https://example.com/image.jpg (spaces ok, quote if needed) or a safe relative path like MEDIA:./image.jpg. Avoid absolute paths (MEDIA:/...) and ~ paths — they are blocked for security. Keep caption in the text body.";
+
 function createQueuedRun(
   overrides: Partial<Omit<FollowupRun, "run">> & {
     run?: Partial<FollowupRun["run"]>;
@@ -741,7 +744,7 @@ describe("createFollowupRunner media understanding", () => {
   it("applies audio transcription when mediaContext has untranscribed audio", async () => {
     const transcriptText = "Hello, this is a voice note.";
     // The real applyMediaUnderstanding mutates the ctx; the mock must do the same
-    // so buildInboundMediaNote sees MediaUnderstanding and suppresses the audio line.
+    // so buildInboundMediaNote and queued prompt rebuilding see the transcribed body.
     applyMediaUnderstandingMock.mockImplementationOnce(
       async (params: { ctx: Record<string, unknown> }) => {
         params.ctx.MediaUnderstanding = [
@@ -753,6 +756,7 @@ describe("createFollowupRunner media understanding", () => {
           },
         ];
         params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nUser text:\nsome text\nTranscript:\n${transcriptText}`;
         return {
           outputs: [
             {
@@ -929,6 +933,7 @@ describe("createFollowupRunner media understanding", () => {
           },
         ];
         params.ctx.Transcript = "voice transcript";
+        params.ctx.Body = "[Audio]\nUser text:\nsome text\nTranscript:\nvoice transcript";
         return {
           outputs: [
             {
@@ -980,4 +985,167 @@ describe("createFollowupRunner media understanding", () => {
     // The transcript should be present
     expect(agentCall?.prompt).toContain("voice transcript");
   });
+
+  it("strips queued media lines when attachment paths or URLs contain a literal closing bracket", async () => {
+    const transcriptText = "Bracket-safe transcript";
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nUser text:\nsome text\nTranscript:\n${transcriptText}`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "ok" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt:
+          "[media attached: /tmp/voice[0].ogg (audio/ogg) | https://cdn.example.com/files[0].ogg?sig=abc]123]\n" +
+          MEDIA_REPLY_HINT +
+          "\n" +
+          "some text",
+        mediaContext: {
+          Body: "some text",
+          MediaPaths: ["/tmp/voice[0].ogg"],
+          MediaUrls: ["https://cdn.example.com/files[0].ogg?sig=abc]123"],
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain(transcriptText);
+    expect(agentCall?.prompt).not.toContain("/tmp/voice[0].ogg");
+    expect(agentCall?.prompt).not.toContain("https://cdn.example.com/files[0].ogg?sig=abc]123");
+    expect(agentCall?.prompt).not.toContain(MEDIA_REPLY_HINT);
+  });
+
+  it("preserves file-only media understanding when outputs are empty", async () => {
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body =
+          '<file name="report.pdf" mime="application/pdf">\nQuarterly report body\n</file>';
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\n[User sent media without caption]`,
+        mediaContext: {
+          Body: "",
+          MediaPaths: ["/tmp/report.pdf"],
+          MediaTypes: ["application/pdf"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("[media attached: /tmp/report.pdf (application/pdf)]");
+    expect(agentCall?.prompt).toContain(MEDIA_REPLY_HINT);
+    expect(agentCall?.prompt).toContain('<file name="report.pdf" mime="application/pdf">');
+    expect(agentCall?.prompt).toContain("Quarterly report body");
+    expect(agentCall?.prompt).not.toContain("[User sent media without caption]");
+  });
+
+  it("replaces the queued body when inline directives were already stripped from the prompt", async () => {
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body =
+          '/think high summarize this\n\n<file name="report.pdf" mime="application/pdf">\nreport\n</file>';
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nsummarize this`,
+        mediaContext: {
+          Body: "/think high summarize this",
+          MediaPaths: ["/tmp/report.pdf"],
+          MediaTypes: ["application/pdf"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("summarize this");
+    expect(agentCall?.prompt).toContain('<file name="report.pdf" mime="application/pdf">');
+    expect(agentCall?.prompt).not.toContain("summarize this\n\n/think high summarize this");
+    expect(agentCall?.prompt).not.toContain("/think high summarize this");
+  });
 });
diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts
index 1e65380a020..ab2169fc09e 100644
--- a/src/auto-reply/reply/followup-runner.ts
+++ b/src/auto-reply/reply/followup-runner.ts
@@ -14,7 +14,6 @@ import type { TypingMode } from "../../config/types.js";
 import { logVerbose } from "../../globals.js";
 import { registerAgentRunContext } from "../../infra/agent-events.js";
 import { applyMediaUnderstanding } from "../../media-understanding/apply.js";
-import { formatMediaUnderstandingBody } from "../../media-understanding/format.js";
 import { defaultRuntime } from "../../runtime.js";
 import { isInternalMessageChannel } from "../../utils/message-channel.js";
 import { stripHeartbeatToken } from "../heartbeat.js";
@@ -23,6 +22,7 @@ import type { MsgContext, OriginatingChannelType } from "../templating.js";
 import { isSilentReplyText, SILENT_REPLY_TOKEN } from "../tokens.js";
 import type { GetReplyOptions, ReplyPayload } from "../types.js";
 import { resolveRunAuthProfile } from "./agent-runner-utils.js";
+import { parseInlineDirectives } from "./directive-handling.js";
 import {
   resolveOriginAccountId,
   resolveOriginMessageProvider,
@@ -41,6 +41,86 @@ import { incrementRunCompactionCount, persistRunSessionUsage } from "./session-r
 import { createTypingSignaler } from "./typing-mode.js";
 import type { TypingController } from "./typing.js";
 
+const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]";
+const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool";
+
+function stripLeadingMediaAttachedLines(prompt: string): string {
+  const lines = prompt.split("\n");
+  let index = 0;
+  while (index < lines.length) {
+    const trimmed = lines[index]?.trim() ?? "";
+    if (!trimmed.startsWith("[media attached") || !trimmed.endsWith("]")) {
+      break;
+    }
+    index += 1;
+  }
+  return lines.slice(index).join("\n").trim();
+}
+
+function stripLeadingMediaReplyHint(prompt: string): string {
+  const lines = prompt.split("\n");
+  if ((lines[0] ?? "").startsWith(MEDIA_REPLY_HINT_PREFIX)) {
+    return lines.slice(1).join("\n").trim();
+  }
+  return prompt.trim();
+}
+
+function replaceLastOccurrence(
+  value: string,
+  search: string,
+  replacement: string,
+): string | undefined {
+  if (!search) {
+    return undefined;
+  }
+  const index = value.lastIndexOf(search);
+  if (index < 0) {
+    return undefined;
+  }
+  return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`;
+}
+
+function stripInlineDirectives(text: string | undefined): string {
+  return parseInlineDirectives(text ?? "").cleaned.trim();
+}
+
+function rebuildQueuedPromptWithMediaUnderstanding(params: {
+  prompt: string;
+  originalBody?: string;
+  updatedBody?: string;
+  mediaNote?: string;
+}): string {
+  let stripped = stripLeadingMediaAttachedLines(params.prompt);
+  if (!params.mediaNote) {
+    stripped = stripLeadingMediaReplyHint(stripped);
+  }
+
+  const updatedBody = stripInlineDirectives(params.updatedBody);
+  if (!updatedBody) {
+    return [params.mediaNote?.trim(), stripped].filter(Boolean).join("\n").trim();
+  }
+
+  const replacementTargets = [
+    params.originalBody?.trim(),
+    stripInlineDirectives(params.originalBody),
+    MEDIA_ONLY_PLACEHOLDER,
+  ].filter(
+    (value, index, list): value is string => Boolean(value) && list.indexOf(value) === index,
+  );
+
+  let rebuilt = stripped;
+  for (const target of replacementTargets) {
+    const replaced = replaceLastOccurrence(rebuilt, target, updatedBody);
+    if (replaced !== undefined) {
+      rebuilt = replaced;
+      return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim();
+    }
+  }
+
+  rebuilt = [rebuilt, updatedBody].filter(Boolean).join("\n\n");
+  return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim();
+}
+
 export function createFollowupRunner(params: {
   opts?: GetReplyOptions;
   typing: TypingController;
@@ -173,6 +253,7 @@ export function createFollowupRunner(params: {
         if (hasMedia) {
           try {
             const mediaCtx = { ...queued.mediaContext } as MsgContext;
+            const originalBody = mediaCtx.Body;
             const muResult = await applyMediaUnderstanding({
               ctx: mediaCtx,
               cfg: queued.run.config,
@@ -182,34 +263,19 @@ export function createFollowupRunner(params: {
                 model: queued.run.model,
               },
             });
-            if (muResult.outputs.length > 0) {
-              // Rebuild the prompt with media understanding results baked in,
-              // matching the primary path's formatting.
+            if (muResult.outputs.length > 0 || muResult.appliedFile) {
+              // Rebuild the queued prompt from the mutated media context so the
+              // deferred path matches the primary path's prompt shape.
               const newMediaNote = buildInboundMediaNote(mediaCtx);
-              const transcriptBody = formatMediaUnderstandingBody({
-                body: undefined,
-                outputs: muResult.outputs,
+              queued.prompt = rebuildQueuedPromptWithMediaUnderstanding({
+                prompt: queued.prompt,
+                originalBody,
+                updatedBody: mediaCtx.Body,
+                mediaNote: newMediaNote,
               });
 
-              // Strip existing [media attached ...] lines from the prompt so
-              // they can be replaced by the updated media note (which excludes
-              // successfully-understood attachments like transcribed audio).
-              const stripped = queued.prompt
-                .replace(/\[media attached: \d+ files\]\n?/g, "")
-                .replace(/\[media attached[^\]]*\]\n?/g, "");
-
-              const parts: string[] = [];
-              if (newMediaNote) {
-                parts.push(newMediaNote);
-              }
-              if (transcriptBody) {
-                parts.push(transcriptBody);
-              }
-              parts.push(stripped.trim());
-              queued.prompt = parts.filter(Boolean).join("\n\n");
-
               logVerbose(
-                `followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage})`,
+                `followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage}, video=${muResult.appliedVideo}, file=${muResult.appliedFile})`,
               );
             }
           } catch (err) {

From be3eec46e23c394d9405f80121c79fd2212f4af5 Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sat, 14 Mar 2026 15:45:17 -0400
Subject: [PATCH 03/17] fix: rebuild queued followup media prompts

---
 src/auto-reply/reply/followup-runner.test.ts | 180 +++++++++++++++++++
 src/auto-reply/reply/followup-runner.ts      |  21 ++-
 2 files changed, 196 insertions(+), 5 deletions(-)

diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts
index 71451526400..334bf9b6a13 100644
--- a/src/auto-reply/reply/followup-runner.test.ts
+++ b/src/auto-reply/reply/followup-runner.test.ts
@@ -818,6 +818,138 @@ describe("createFollowupRunner media understanding", () => {
     expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "Got it!" }));
   });
 
+  it("strips the full media line when attachment paths or URLs contain brackets", async () => {
+    const transcriptText = "Bracket-safe transcript";
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "done" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt:
+          "[media attached: /tmp/voice[0].ogg (audio/ogg) | https://cdn.example.com/files[0].ogg]\nsome text",
+        mediaContext: {
+          Body: "some text",
+          CommandBody: "some text",
+          RawBody: "some text",
+          MediaPaths: ["/tmp/voice[0].ogg"],
+          MediaUrls: ["https://cdn.example.com/files[0].ogg"],
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain(transcriptText);
+    expect(agentCall?.prompt).not.toContain("[media attached:");
+    expect(agentCall?.prompt).not.toContain("files[0].ogg]");
+  });
+
+  it("only strips leading synthetic media lines and preserves literal user text later in the prompt", async () => {
+    const transcriptText = "Transcript with literal token";
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = "I literally typed [media attached: keep me] in this message.";
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "done" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt:
+          "[media attached: /tmp/voice.ogg (audio/ogg)]\nI literally typed [media attached: keep me] in this message.",
+        mediaContext: {
+          Body: "I literally typed [media attached: keep me] in this message.",
+          CommandBody: "I literally typed [media attached: keep me] in this message.",
+          RawBody: "I literally typed [media attached: keep me] in this message.",
+          MediaPaths: ["/tmp/voice.ogg"],
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain(
+      "I literally typed [media attached: keep me] in this message.",
+    );
+    expect(agentCall?.prompt).not.toContain("[media attached: /tmp/voice.ogg (audio/ogg)]");
+  });
+
   it("skips media understanding when MediaUnderstanding is already populated", async () => {
     runEmbeddedPiAgentMock.mockResolvedValueOnce({
       payloads: [{ text: "reply" }],
@@ -920,6 +1052,54 @@ describe("createFollowupRunner media understanding", () => {
     expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "fallback reply" }));
   });
 
+  it("rebuilds the prompt when file extraction succeeds without media outputs", async () => {
+    const fileBlock = '<file name="notes.txt" mime="text/plain">\nline one\n</file>';
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body = `some text\n\n${fileBlock}`;
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "file processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: "[media attached: /tmp/notes.txt (text/plain)]\nsome text",
+        mediaContext: {
+          Body: "some text",
+          CommandBody: "some text",
+          RawBody: "some text",
+          MediaPaths: ["/tmp/notes.txt"],
+          MediaTypes: ["text/plain"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("[media attached: /tmp/notes.txt (text/plain)]");
+    expect(agentCall?.prompt).toContain(fileBlock);
+    expect(agentCall?.prompt?.match(/<file\b/g)).toHaveLength(1);
+  });
+
   it("preserves non-audio media lines when only audio is transcribed", async () => {
     applyMediaUnderstandingMock.mockImplementationOnce(
       async (params: { ctx: Record<string, unknown> }) => {
diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts
index ab2169fc09e..94047155a61 100644
--- a/src/auto-reply/reply/followup-runner.ts
+++ b/src/auto-reply/reply/followup-runner.ts
@@ -43,13 +43,16 @@ import type { TypingController } from "./typing.js";
 
 const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]";
 const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool";
+const LEADING_MEDIA_ATTACHED_LINE_RE =
+  /^(?:\[media attached: \d+ files\]|\[media attached(?: \d+\/\d+)?: [^\r\n]*\])$/;
+const FILE_BLOCK_RE = /<file\b/i;
 
 function stripLeadingMediaAttachedLines(prompt: string): string {
   const lines = prompt.split("\n");
   let index = 0;
   while (index < lines.length) {
     const trimmed = lines[index]?.trim() ?? "";
-    if (!trimmed.startsWith("[media attached") || !trimmed.endsWith("]")) {
+    if (!LEADING_MEDIA_ATTACHED_LINE_RE.test(trimmed)) {
       break;
     }
     index += 1;
@@ -252,8 +255,14 @@ export function createFollowupRunner(params: {
         );
         if (hasMedia) {
           try {
-            const mediaCtx = { ...queued.mediaContext } as MsgContext;
-            const originalBody = mediaCtx.Body;
+            const mediaCtx = {
+              ...queued.mediaContext,
+              Body:
+                queued.mediaContext.CommandBody ??
+                queued.mediaContext.RawBody ??
+                queued.mediaContext.Body,
+            } as MsgContext;
+            const originalBody = queued.mediaContext.Body;
             const muResult = await applyMediaUnderstanding({
               ctx: mediaCtx,
               cfg: queued.run.config,
@@ -263,7 +272,10 @@ export function createFollowupRunner(params: {
                 model: queued.run.model,
               },
             });
-            if (muResult.outputs.length > 0 || muResult.appliedFile) {
+            const shouldRebuildPrompt =
+              muResult.outputs.length > 0 ||
+              (muResult.appliedFile && !FILE_BLOCK_RE.test(queued.prompt));
+            if (shouldRebuildPrompt) {
               // Rebuild the queued prompt from the mutated media context so the
               // deferred path matches the primary path's prompt shape.
               const newMediaNote = buildInboundMediaNote(mediaCtx);
@@ -273,7 +285,6 @@ export function createFollowupRunner(params: {
                 updatedBody: mediaCtx.Body,
                 mediaNote: newMediaNote,
               });
-
               logVerbose(
                 `followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage}, video=${muResult.appliedVideo}, file=${muResult.appliedFile})`,
               );

From 67e90527e15ebb47ab00ad3e0283e99d6a94585a Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sat, 14 Mar 2026 16:56:36 -0400
Subject: [PATCH 04/17] fix: narrow FILE_BLOCK_RE, align originalBody, check
 body not prompt

---
 src/auto-reply/reply/followup-runner.test.ts | 109 +++++++++++++++++++
 src/auto-reply/reply/followup-runner.ts      |  21 ++--
 2 files changed, 123 insertions(+), 7 deletions(-)

diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts
index 334bf9b6a13..a0c5306380d 100644
--- a/src/auto-reply/reply/followup-runner.test.ts
+++ b/src/auto-reply/reply/followup-runner.test.ts
@@ -1328,4 +1328,113 @@ describe("createFollowupRunner media understanding", () => {
     expect(agentCall?.prompt).not.toContain("summarize this\n\n/think high summarize this");
     expect(agentCall?.prompt).not.toContain("/think high summarize this");
   });
+
+  it("does not false-positive on user text containing literal '<file' when extracting files", async () => {
+    const fileBlock = '<file name="data.csv" mime="text/csv">\ncol1,col2\n1,2\n</file>';
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body = `check my <file upload please\n\n${fileBlock}`;
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "got it" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    // User message contains literal "<file" text but that should NOT prevent
+    // file extraction results from being embedded in the prompt.
+    await runner(
+      createQueuedRun({
+        prompt:
+          "[media attached: /tmp/data.csv (text/csv)]\ncheck my <file upload please",
+        mediaContext: {
+          Body: "check my <file upload please",
+          CommandBody: "check my <file upload please",
+          RawBody: "check my <file upload please",
+          MediaPaths: ["/tmp/data.csv"],
+          MediaTypes: ["text/csv"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    // The file extraction result should be present in the prompt
+    expect(agentCall?.prompt).toContain(fileBlock);
+    expect(agentCall?.prompt).toContain("check my <file upload please");
+  });
+
+  it("uses resolved body (CommandBody) as originalBody for accurate prompt replacement", async () => {
+    const fileBlock = '<file name="report.pdf" mime="application/pdf">\nreport content\n</file>';
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        // applyMediaUnderstanding mutates the resolved body (which is CommandBody)
+        params.ctx.Body = `summarize this\n\n${fileBlock}`;
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    // Body has directive prefix; CommandBody has the cleaned version.
+    // The prompt was built from CommandBody, so originalBody should match CommandBody
+    // for accurate replacement.
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nsummarize this`,
+        mediaContext: {
+          Body: "/think high summarize this",
+          CommandBody: "summarize this",
+          RawBody: "/think high summarize this",
+          MediaPaths: ["/tmp/report.pdf"],
+          MediaTypes: ["application/pdf"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    // File block should be present (extraction succeeded)
+    expect(agentCall?.prompt).toContain(fileBlock);
+    // The body text should appear once, not duplicated
+    expect(agentCall?.prompt).toContain("summarize this");
+    // Should NOT contain the directive prefix
+    expect(agentCall?.prompt).not.toContain("/think high");
+    // The body should not be duplicated (would happen if originalBody didn't match)
+    const matches = agentCall?.prompt?.match(/summarize this/g);
+    expect(matches?.length).toBe(1);
+  });
 });
diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts
index 94047155a61..39728c71def 100644
--- a/src/auto-reply/reply/followup-runner.ts
+++ b/src/auto-reply/reply/followup-runner.ts
@@ -45,7 +45,7 @@ const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]";
 const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool";
 const LEADING_MEDIA_ATTACHED_LINE_RE =
   /^(?:\[media attached: \d+ files\]|\[media attached(?: \d+\/\d+)?: [^\r\n]*\])$/;
-const FILE_BLOCK_RE = /<file\b/i;
+const FILE_BLOCK_RE = /<file\s+name="/i;
 
 function stripLeadingMediaAttachedLines(prompt: string): string {
   const lines = prompt.split("\n");
@@ -255,14 +255,21 @@ export function createFollowupRunner(params: {
         );
         if (hasMedia) {
           try {
+            const resolvedOriginalBody =
+              queued.mediaContext.CommandBody ??
+              queued.mediaContext.RawBody ??
+              queued.mediaContext.Body;
             const mediaCtx = {
               ...queued.mediaContext,
-              Body:
-                queued.mediaContext.CommandBody ??
-                queued.mediaContext.RawBody ??
-                queued.mediaContext.Body,
+              Body: resolvedOriginalBody,
             } as MsgContext;
-            const originalBody = queued.mediaContext.Body;
+            const originalBody = resolvedOriginalBody;
+            // Capture whether the resolved body already contains a file block
+            // BEFORE applyMediaUnderstanding mutates it — this detects prior
+            // extraction so we avoid double-inserting.  Checking the body
+            // (not the full queued.prompt) avoids false positives from user
+            // messages that happen to contain literal "<file path=" text.
+            const bodyAlreadyHasFileBlock = FILE_BLOCK_RE.test(resolvedOriginalBody ?? "");
             const muResult = await applyMediaUnderstanding({
               ctx: mediaCtx,
               cfg: queued.run.config,
@@ -274,7 +281,7 @@ export function createFollowupRunner(params: {
             });
             const shouldRebuildPrompt =
               muResult.outputs.length > 0 ||
-              (muResult.appliedFile && !FILE_BLOCK_RE.test(queued.prompt));
+              (muResult.appliedFile && !bodyAlreadyHasFileBlock);
             if (shouldRebuildPrompt) {
               // Rebuild the queued prompt from the mutated media context so the
               // deferred path matches the primary path's prompt shape.

From 5e0330db6ceb28128bbc17785a1ee5adeeccbcbb Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sat, 14 Mar 2026 17:53:40 -0400
Subject: [PATCH 05/17] Auto-reply: preserve deferred media understanding
 output

---
 src/auto-reply/reply/followup-runner.test.ts  | 306 +++++++++++++++++-
 src/auto-reply/reply/followup-runner.ts       |  38 ++-
 .../reply/get-reply-run.media-only.test.ts    |  74 +++++
 src/auto-reply/reply/get-reply-run.ts         |  14 +-
 4 files changed, 424 insertions(+), 8 deletions(-)

diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts
index a0c5306380d..d2812ff61b6 100644
--- a/src/auto-reply/reply/followup-runner.test.ts
+++ b/src/auto-reply/reply/followup-runner.test.ts
@@ -818,6 +818,69 @@ describe("createFollowupRunner media understanding", () => {
     expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "Got it!" }));
   });
 
+  it("applies media understanding for URL-only attachments", async () => {
+    const transcriptText = "URL-only transcript";
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nUser text:\nsome text\nTranscript:\n${transcriptText}`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "Got it!" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: "[media attached: https://cdn.example.com/voice.ogg (audio/ogg)]\nsome text",
+        mediaContext: {
+          Body: "some text",
+          MediaUrl: "https://cdn.example.com/voice.ogg",
+          MediaUrls: ["https://cdn.example.com/voice.ogg"],
+          MediaType: "audio/ogg",
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+    );
+
+    expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1);
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain(transcriptText);
+  });
+
   it("strips the full media line when attachment paths or URLs contain brackets", async () => {
     const transcriptText = "Bracket-safe transcript";
     applyMediaUnderstandingMock.mockImplementationOnce(
@@ -1329,6 +1392,98 @@ describe("createFollowupRunner media understanding", () => {
     expect(agentCall?.prompt).not.toContain("/think high summarize this");
   });
 
+  it("preserves directive-like tokens inside extracted media content", async () => {
+    const fileBlock =
+      '<file name="notes.txt" mime="text/plain">\n/model claude-opus should stay\n/queue followup should stay\n</file>';
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body = `/think high summarize this\n\n${fileBlock}`;
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached: /tmp/notes.txt]\n${MEDIA_REPLY_HINT}\nsummarize this`,
+        mediaContext: {
+          Body: "/think high summarize this",
+          MediaPaths: ["/tmp/notes.txt"],
+          MediaTypes: ["text/plain"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("summarize this");
+    expect(agentCall?.prompt).not.toContain("/think high summarize this");
+    expect(agentCall?.prompt).toContain("/model claude-opus should stay");
+    expect(agentCall?.prompt).toContain("/queue followup should stay");
+  });
+
+  it("rebuilds the prompt when image understanding mutates the body without outputs", async () => {
+    const description = "[Image]\nDescription:\na mountain at sunset";
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body = description;
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: true,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: "[media attached: /tmp/photo.jpg (image/jpeg)]\nsome text",
+        mediaContext: {
+          Body: "some text",
+          MediaPaths: ["/tmp/photo.jpg"],
+          MediaTypes: ["image/jpeg"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("a mountain at sunset");
+  });
+
   it("does not false-positive on user text containing literal '<file' when extracting files", async () => {
     const fileBlock = '<file name="data.csv" mime="text/csv">\ncol1,col2\n1,2\n</file>';
     applyMediaUnderstandingMock.mockImplementationOnce(
@@ -1360,8 +1515,7 @@ describe("createFollowupRunner media understanding", () => {
     // file extraction results from being embedded in the prompt.
     await runner(
       createQueuedRun({
-        prompt:
-          "[media attached: /tmp/data.csv (text/csv)]\ncheck my <file upload please",
+        prompt: "[media attached: /tmp/data.csv (text/csv)]\ncheck my <file upload please",
         mediaContext: {
           Body: "check my <file upload please",
           CommandBody: "check my <file upload please",
@@ -1380,6 +1534,154 @@ describe("createFollowupRunner media understanding", () => {
     expect(agentCall?.prompt).toContain("check my <file upload please");
   });
 
+  it("preserves directive-like text that appears inside extracted file content", async () => {
+    const fileBlock =
+      '<file name="notes.txt" mime="text/plain">\nRun `/think high` literally in the shell example.\n</file>';
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body = `summarize this\n\n${fileBlock}`;
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached: /tmp/notes.txt]\n${MEDIA_REPLY_HINT}\nsummarize this`,
+        mediaContext: {
+          Body: "/think high summarize this",
+          CommandBody: "summarize this",
+          RawBody: "/think high summarize this",
+          MediaPaths: ["/tmp/notes.txt"],
+          MediaTypes: ["text/plain"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("summarize this");
+    expect(agentCall?.prompt).toContain("Run `/think high` literally in the shell example.");
+  });
+
+  it("rebuilds the prompt when image understanding mutates the body without outputs", async () => {
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body = "some text\n\n[Image summary]\nA whiteboard with action items.";
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: true,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: "[media attached: /tmp/board.jpg (image/jpeg)]\nsome text",
+        mediaContext: {
+          Body: "some text",
+          CommandBody: "some text",
+          RawBody: "some text",
+          MediaPaths: ["/tmp/board.jpg"],
+          MediaTypes: ["image/jpeg"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("[Image summary]");
+    expect(agentCall?.prompt).toContain("A whiteboard with action items.");
+  });
+
+  it("applies media understanding for URL-only deferred attachments", async () => {
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body = "[Audio]\nTranscript:\nremote transcript";
+        params.ctx.Transcript = "remote transcript";
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: "remote transcript",
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: "[User sent media without caption]",
+        mediaContext: {
+          Body: "",
+          MediaUrl: "https://cdn.example.com/audio.ogg",
+          MediaUrls: ["https://cdn.example.com/audio.ogg"],
+          MediaType: "audio/ogg",
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+    );
+
+    expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1);
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("remote transcript");
+  });
+
   it("uses resolved body (CommandBody) as originalBody for accurate prompt replacement", async () => {
     const fileBlock = '<file name="report.pdf" mime="application/pdf">\nreport content\n</file>';
     applyMediaUnderstandingMock.mockImplementationOnce(
diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts
index 39728c71def..64b8a935b2a 100644
--- a/src/auto-reply/reply/followup-runner.ts
+++ b/src/auto-reply/reply/followup-runner.ts
@@ -43,8 +43,7 @@ import type { TypingController } from "./typing.js";
 
 const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]";
 const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool";
-const LEADING_MEDIA_ATTACHED_LINE_RE =
-  /^(?:\[media attached: \d+ files\]|\[media attached(?: \d+\/\d+)?: [^\r\n]*\])$/;
+const LEADING_MEDIA_ATTACHED_LINE_RE = /^\[media attached(?: \d+\/\d+)?: [^\r\n]*\]$/;
 const FILE_BLOCK_RE = /<file\s+name="/i;
 
 function stripLeadingMediaAttachedLines(prompt: string): string {
@@ -87,6 +86,28 @@ function stripInlineDirectives(text: string | undefined): string {
   return parseInlineDirectives(text ?? "").cleaned.trim();
 }
 
+function normalizeUpdatedBody(params: { originalBody?: string; updatedBody?: string }): string {
+  const updatedBody = params.updatedBody?.trim();
+  if (!updatedBody) {
+    return "";
+  }
+  const originalBody = params.originalBody?.trim();
+  if (!originalBody) {
+    return updatedBody;
+  }
+
+  const cleanedOriginalBody = stripInlineDirectives(originalBody);
+  if (!cleanedOriginalBody) {
+    return updatedBody;
+  }
+  if (updatedBody === originalBody) {
+    return cleanedOriginalBody;
+  }
+  return (
+    replaceLastOccurrence(updatedBody, originalBody, cleanedOriginalBody) ?? updatedBody
+  ).trim();
+}
+
 function rebuildQueuedPromptWithMediaUnderstanding(params: {
   prompt: string;
   originalBody?: string;
@@ -98,7 +119,10 @@ function rebuildQueuedPromptWithMediaUnderstanding(params: {
     stripped = stripLeadingMediaReplyHint(stripped);
   }
 
-  const updatedBody = stripInlineDirectives(params.updatedBody);
+  const updatedBody = normalizeUpdatedBody({
+    originalBody: params.originalBody,
+    updatedBody: params.updatedBody,
+  });
   if (!updatedBody) {
     return [params.mediaNote?.trim(), stripped].filter(Boolean).join("\n").trim();
   }
@@ -250,8 +274,11 @@ export function createFollowupRunner(params: {
       if (queued.mediaContext && !queued.mediaContext.MediaUnderstanding?.length) {
         const hasMedia = Boolean(
           queued.mediaContext.MediaPath?.trim() ||
+          queued.mediaContext.MediaUrl?.trim() ||
           (Array.isArray(queued.mediaContext.MediaPaths) &&
-            queued.mediaContext.MediaPaths.length > 0),
+            queued.mediaContext.MediaPaths.length > 0) ||
+          (Array.isArray(queued.mediaContext.MediaUrls) &&
+            queued.mediaContext.MediaUrls.length > 0),
         );
         if (hasMedia) {
           try {
@@ -281,6 +308,9 @@ export function createFollowupRunner(params: {
             });
             const shouldRebuildPrompt =
               muResult.outputs.length > 0 ||
+              muResult.appliedAudio ||
+              muResult.appliedImage ||
+              muResult.appliedVideo ||
               (muResult.appliedFile && !bodyAlreadyHasFileBlock);
             if (shouldRebuildPrompt) {
               // Rebuild the queued prompt from the mutated media context so the
diff --git a/src/auto-reply/reply/get-reply-run.media-only.test.ts b/src/auto-reply/reply/get-reply-run.media-only.test.ts
index 829b3937009..f519da10082 100644
--- a/src/auto-reply/reply/get-reply-run.media-only.test.ts
+++ b/src/auto-reply/reply/get-reply-run.media-only.test.ts
@@ -172,6 +172,45 @@ describe("runPreparedReply media-only handling", () => {
     expect(call?.followupRun.prompt).toContain("[User sent media without caption]");
   });
 
+  it("snapshots URL-only attachments into followup mediaContext", async () => {
+    await runPreparedReply(
+      baseParams({
+        ctx: {
+          Body: "check this attachment",
+          RawBody: "check this attachment",
+          CommandBody: "check this attachment",
+          ThreadHistoryBody: "Earlier message in this thread",
+          OriginatingChannel: "slack",
+          OriginatingTo: "C123",
+          ChatType: "group",
+          MediaUrl: "https://cdn.example.com/input.png",
+          MediaUrls: ["https://cdn.example.com/input.png"],
+          MediaType: "image/png",
+          MediaTypes: ["image/png"],
+        },
+        sessionCtx: {
+          Body: "check this attachment",
+          BodyStripped: "check this attachment",
+          ThreadHistoryBody: "Earlier message in this thread",
+          Provider: "slack",
+          ChatType: "group",
+          OriginatingChannel: "slack",
+          OriginatingTo: "C123",
+        },
+      }),
+    );
+
+    const call = vi.mocked(runReplyAgent).mock.calls[0]?.[0];
+    expect(call?.followupRun.mediaContext).toEqual(
+      expect.objectContaining({
+        MediaUrl: "https://cdn.example.com/input.png",
+        MediaUrls: ["https://cdn.example.com/input.png"],
+        MediaType: "image/png",
+        MediaTypes: ["image/png"],
+      }),
+    );
+  });
+
   it("keeps thread history context on follow-up turns", async () => {
     const result = await runPreparedReply(
       baseParams({
@@ -186,6 +225,41 @@ describe("runPreparedReply media-only handling", () => {
     expect(call?.followupRun.prompt).toContain("Earlier message in this thread");
   });
 
+  it("snapshots mediaContext for URL-only deferred attachments", async () => {
+    await runPreparedReply(
+      baseParams({
+        ctx: {
+          Body: "",
+          RawBody: "",
+          CommandBody: "",
+          MediaUrl: "https://cdn.example.com/audio.ogg",
+          MediaUrls: ["https://cdn.example.com/audio.ogg"],
+          MediaType: "audio/ogg",
+          MediaTypes: ["audio/ogg"],
+          ThreadHistoryBody: "Earlier message in this thread",
+          OriginatingChannel: "slack",
+          OriginatingTo: "C123",
+          ChatType: "group",
+        },
+        sessionCtx: {
+          Body: "",
+          BodyStripped: "",
+          ThreadHistoryBody: "Earlier message in this thread",
+          Provider: "slack",
+          ChatType: "group",
+          OriginatingChannel: "slack",
+          OriginatingTo: "C123",
+        },
+      }),
+    );
+
+    const call = vi.mocked(runReplyAgent).mock.calls[0]?.[0];
+    expect(call?.followupRun.mediaContext?.MediaUrl).toBe("https://cdn.example.com/audio.ogg");
+    expect(call?.followupRun.mediaContext?.MediaUrls).toEqual([
+      "https://cdn.example.com/audio.ogg",
+    ]);
+  });
+
   it("returns the empty-body reply when there is no text and no media", async () => {
     const result = await runPreparedReply(
       baseParams({
diff --git a/src/auto-reply/reply/get-reply-run.ts b/src/auto-reply/reply/get-reply-run.ts
index fe87e3919d0..b4b693b8119 100644
--- a/src/auto-reply/reply/get-reply-run.ts
+++ b/src/auto-reply/reply/get-reply-run.ts
@@ -310,7 +310,14 @@ export async function runPreparedReply(
     : [inboundUserContext, baseBodyFinal].filter(Boolean).join("\n\n");
   const baseBodyTrimmed = baseBodyForPrompt.trim();
   const hasMediaAttachment = Boolean(
-    sessionCtx.MediaPath || (sessionCtx.MediaPaths && sessionCtx.MediaPaths.length > 0),
+    sessionCtx.MediaPath ||
+    sessionCtx.MediaUrl ||
+    (sessionCtx.MediaPaths && sessionCtx.MediaPaths.length > 0) ||
+    (sessionCtx.MediaUrls && sessionCtx.MediaUrls.length > 0) ||
+    ctx.MediaPath?.trim() ||
+    ctx.MediaUrl?.trim() ||
+    (Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0) ||
+    (Array.isArray(ctx.MediaUrls) && ctx.MediaUrls.length > 0),
   );
   if (!baseBodyTrimmed && !hasMediaAttachment) {
     await typing.onReplyStart();
@@ -476,7 +483,10 @@ export async function runPreparedReply(
   // followup runner.  When MediaUnderstanding is already populated the runner
   // knows transcription already succeeded and skips re-application.
   const hasMediaAttachments = Boolean(
-    ctx.MediaPath?.trim() || (Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0),
+    ctx.MediaPath?.trim() ||
+    ctx.MediaUrl?.trim() ||
+    (Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0) ||
+    (Array.isArray(ctx.MediaUrls) && ctx.MediaUrls.length > 0),
   );
   const mediaContext = hasMediaAttachments
     ? {

From 7973b2cc5b2d12593082e6ada14863b9eac70b23 Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sat, 14 Mar 2026 21:54:35 -0400
Subject: [PATCH 06/17] Reply: preserve deferred queued media context

---
 src/auto-reply/reply/followup-media.ts       | 241 +++++++++++++++++++
 src/auto-reply/reply/followup-runner.test.ts | 225 ++++++++++++++++-
 src/auto-reply/reply/followup-runner.ts      | 180 +-------------
 src/auto-reply/reply/get-reply-run.ts        |   2 +
 src/auto-reply/reply/queue/drain.ts          |  71 +++++-
 src/auto-reply/reply/queue/enqueue.ts        |  37 ++-
 src/auto-reply/reply/queue/state.ts          |   3 +
 src/auto-reply/reply/queue/types.ts          |   3 +
 8 files changed, 572 insertions(+), 190 deletions(-)
 create mode 100644 src/auto-reply/reply/followup-media.ts

diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts
new file mode 100644
index 00000000000..5a014e63f9b
--- /dev/null
+++ b/src/auto-reply/reply/followup-media.ts
@@ -0,0 +1,241 @@
+import { logVerbose } from "../../globals.js";
+import { applyMediaUnderstanding } from "../../media-understanding/apply.js";
+import {
+  normalizeAttachments,
+  resolveAttachmentKind,
+} from "../../media-understanding/attachments.js";
+import { buildInboundMediaNote } from "../media-note.js";
+import type { MsgContext } from "../templating.js";
+import { parseInlineDirectives } from "./directive-handling.js";
+import type { FollowupMediaContext, FollowupRun } from "./queue/types.js";
+
+const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]";
+const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool";
+const LEADING_MEDIA_ATTACHED_LINE_RE = /^\[media attached(?: \d+\/\d+)?: [^\r\n]*\]$/;
+const FILE_BLOCK_RE = /<file\s+name="/i;
+
+function stripLeadingMediaAttachedLines(prompt: string): string {
+  const lines = prompt.split("\n");
+  let index = 0;
+  while (index < lines.length) {
+    const trimmed = lines[index]?.trim() ?? "";
+    if (!LEADING_MEDIA_ATTACHED_LINE_RE.test(trimmed)) {
+      break;
+    }
+    index += 1;
+  }
+  return lines.slice(index).join("\n").trim();
+}
+
+function stripLeadingMediaReplyHint(prompt: string): string {
+  const lines = prompt.split("\n");
+  if ((lines[0] ?? "").startsWith(MEDIA_REPLY_HINT_PREFIX)) {
+    return lines.slice(1).join("\n").trim();
+  }
+  return prompt.trim();
+}
+
+function replaceLastOccurrence(
+  value: string,
+  search: string,
+  replacement: string,
+): string | undefined {
+  if (!search) {
+    return undefined;
+  }
+  const index = value.lastIndexOf(search);
+  if (index < 0) {
+    return undefined;
+  }
+  return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`;
+}
+
+function stripInlineDirectives(text: string | undefined): string {
+  return parseInlineDirectives(text ?? "").cleaned.trim();
+}
+
+function normalizeUpdatedBody(params: { originalBody?: string; updatedBody?: string }): string {
+  const updatedBody = params.updatedBody?.trim();
+  if (!updatedBody) {
+    return "";
+  }
+  const originalBody = params.originalBody?.trim();
+  if (!originalBody) {
+    return updatedBody;
+  }
+
+  const cleanedOriginalBody = stripInlineDirectives(originalBody);
+  if (!cleanedOriginalBody) {
+    return updatedBody;
+  }
+  if (updatedBody === originalBody) {
+    return cleanedOriginalBody;
+  }
+  return (
+    replaceLastOccurrence(updatedBody, originalBody, cleanedOriginalBody) ?? updatedBody
+  ).trim();
+}
+
+function rebuildQueuedPromptWithMediaUnderstanding(params: {
+  prompt: string;
+  originalBody?: string;
+  updatedBody?: string;
+  mediaNote?: string;
+}): string {
+  let stripped = stripLeadingMediaAttachedLines(params.prompt);
+  if (!params.mediaNote) {
+    stripped = stripLeadingMediaReplyHint(stripped);
+  }
+
+  const updatedBody = normalizeUpdatedBody({
+    originalBody: params.originalBody,
+    updatedBody: params.updatedBody,
+  });
+  if (!updatedBody) {
+    return [params.mediaNote?.trim(), stripped].filter(Boolean).join("\n").trim();
+  }
+
+  const replacementTargets = [
+    params.originalBody?.trim(),
+    stripInlineDirectives(params.originalBody),
+    MEDIA_ONLY_PLACEHOLDER,
+  ].filter(
+    (value, index, list): value is string => Boolean(value) && list.indexOf(value) === index,
+  );
+
+  let rebuilt = stripped;
+  for (const target of replacementTargets) {
+    const replaced = replaceLastOccurrence(rebuilt, target, updatedBody);
+    if (replaced !== undefined) {
+      rebuilt = replaced;
+      return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim();
+    }
+  }
+
+  rebuilt = [rebuilt, updatedBody].filter(Boolean).join("\n\n");
+  return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim();
+}
+
+function hasMediaAttachments(mediaContext: FollowupMediaContext): boolean {
+  return Boolean(
+    mediaContext.MediaPath?.trim() ||
+    mediaContext.MediaUrl?.trim() ||
+    (Array.isArray(mediaContext.MediaPaths) && mediaContext.MediaPaths.length > 0) ||
+    (Array.isArray(mediaContext.MediaUrls) && mediaContext.MediaUrls.length > 0),
+  );
+}
+
+function hasOnlyFileLikeAttachments(mediaContext: FollowupMediaContext): boolean {
+  const attachments = normalizeAttachments(mediaContext as MsgContext);
+  return (
+    attachments.length > 0 &&
+    attachments.every((attachment) => {
+      const kind = resolveAttachmentKind(attachment);
+      return kind !== "audio" && kind !== "image" && kind !== "video";
+    })
+  );
+}
+
+function snapshotUpdatedMediaContext(params: {
+  original: FollowupMediaContext;
+  mediaCtx: MsgContext;
+  updatedBody?: string;
+}): FollowupMediaContext {
+  return {
+    ...params.original,
+    Body: params.updatedBody ?? params.original.Body,
+    Transcript:
+      typeof params.mediaCtx.Transcript === "string"
+        ? params.mediaCtx.Transcript
+        : params.original.Transcript,
+    MediaUnderstanding: Array.isArray(params.mediaCtx.MediaUnderstanding)
+      ? [...params.mediaCtx.MediaUnderstanding]
+      : params.original.MediaUnderstanding,
+    MediaUnderstandingDecisions: Array.isArray(params.mediaCtx.MediaUnderstandingDecisions)
+      ? [...params.mediaCtx.MediaUnderstandingDecisions]
+      : params.original.MediaUnderstandingDecisions,
+    DeferredMediaApplied: true,
+  };
+}
+
+export async function applyDeferredMediaUnderstandingToQueuedRun(
+  queued: FollowupRun,
+  params: { logLabel?: string } = {},
+): Promise<void> {
+  const mediaContext = queued.mediaContext;
+  if (!mediaContext || mediaContext.DeferredMediaApplied) {
+    return;
+  }
+  if (mediaContext.MediaUnderstanding?.length) {
+    mediaContext.DeferredMediaApplied = true;
+    return;
+  }
+  if (!hasMediaAttachments(mediaContext)) {
+    mediaContext.DeferredMediaApplied = true;
+    return;
+  }
+
+  const resolvedOriginalBody =
+    mediaContext.CommandBody ?? mediaContext.RawBody ?? mediaContext.Body;
+  const bodyAlreadyHasFileBlock =
+    FILE_BLOCK_RE.test(resolvedOriginalBody ?? "") || FILE_BLOCK_RE.test(mediaContext.Body ?? "");
+
+  if (bodyAlreadyHasFileBlock && hasOnlyFileLikeAttachments(mediaContext)) {
+    mediaContext.DeferredMediaApplied = true;
+    return;
+  }
+
+  try {
+    const mediaCtx = {
+      ...mediaContext,
+      Body: resolvedOriginalBody,
+      Provider:
+        mediaContext.Provider ??
+        queued.run.messageProvider ??
+        (typeof mediaContext.OriginatingChannel === "string"
+          ? mediaContext.OriginatingChannel
+          : undefined),
+      Surface: mediaContext.Surface,
+    } as MsgContext;
+
+    const muResult = await applyMediaUnderstanding({
+      ctx: mediaCtx,
+      cfg: queued.run.config,
+      agentDir: queued.run.agentDir,
+      activeModel: {
+        provider: queued.run.provider,
+        model: queued.run.model,
+      },
+    });
+
+    const shouldRebuildPrompt =
+      muResult.outputs.length > 0 ||
+      muResult.appliedAudio ||
+      muResult.appliedImage ||
+      muResult.appliedVideo ||
+      (muResult.appliedFile && !bodyAlreadyHasFileBlock);
+
+    if (shouldRebuildPrompt) {
+      const newMediaNote = buildInboundMediaNote(mediaCtx);
+      queued.prompt = rebuildQueuedPromptWithMediaUnderstanding({
+        prompt: queued.prompt,
+        originalBody: resolvedOriginalBody,
+        updatedBody: mediaCtx.Body,
+        mediaNote: newMediaNote,
+      });
+      logVerbose(
+        `${params.logLabel ?? "followup"}: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage}, video=${muResult.appliedVideo}, file=${muResult.appliedFile})`,
+      );
+    }
+
+    queued.mediaContext = snapshotUpdatedMediaContext({
+      original: mediaContext,
+      mediaCtx,
+      updatedBody: shouldRebuildPrompt ? mediaCtx.Body : undefined,
+    });
+  } catch (err) {
+    logVerbose(
+      `${params.logLabel ?? "followup"}: media understanding failed, proceeding with raw content: ${err instanceof Error ? err.message : String(err)}`,
+    );
+  }
+}
diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts
index d2812ff61b6..c3e75e6b856 100644
--- a/src/auto-reply/reply/followup-runner.test.ts
+++ b/src/auto-reply/reply/followup-runner.test.ts
@@ -2,7 +2,8 @@ import fs from "node:fs/promises";
 import { tmpdir } from "node:os";
 import path from "node:path";
 import { beforeEach, describe, expect, it, vi } from "vitest";
-import { loadSessionStore, type SessionEntry, saveSessionStore } from "../../config/sessions.js";
+import { loadSessionStore, saveSessionStore, type SessionEntry } from "../../config/sessions.js";
+import { buildCollectPrompt } from "../../utils/queue-helpers.js";
 import type { FollowupRun } from "./queue.js";
 import * as sessionRunAccounting from "./session-run-accounting.js";
 import { createMockFollowupRun, createMockTypingController } from "./test-helpers.js";
@@ -35,6 +36,10 @@ vi.mock("./route-reply.js", async (importOriginal) => {
 });
 
 import { createFollowupRunner } from "./followup-runner.js";
+import {
+  applyDeferredMediaToQueuedRuns,
+  buildMediaAwareQueueSummaryPrompt,
+} from "./queue/drain.js";
 
 const ROUTABLE_TEST_CHANNELS = new Set([
   "telegram",
@@ -818,6 +823,69 @@ describe("createFollowupRunner media understanding", () => {
     expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "Got it!" }));
   });
 
+  it("propagates the queued message provider into deferred media context", async () => {
+    const transcriptText = "Provider-aware transcript";
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        expect(params.ctx.Provider).toBe("telegram");
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "done" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: "[User sent media without caption]",
+        run: { messageProvider: "telegram" },
+        mediaContext: {
+          Body: "",
+          MediaPaths: ["/tmp/voice.ogg"],
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+    );
+
+    expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1);
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain(transcriptText);
+  });
+
   it("applies media understanding for URL-only attachments", async () => {
     const transcriptText = "URL-only transcript";
     applyMediaUnderstandingMock.mockImplementationOnce(
@@ -1739,4 +1807,159 @@ describe("createFollowupRunner media understanding", () => {
     const matches = agentCall?.prompt?.match(/summarize this/g);
     expect(matches?.length).toBe(1);
   });
+
+  it("does not re-apply file extraction when the stored media body already has a file block", async () => {
+    const fileBlock = '<file name="report.pdf" mime="application/pdf">\nreport content\n</file>';
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nsummarize this\n\n${fileBlock}`,
+        mediaContext: {
+          Body: `summarize this\n\n${fileBlock}`,
+          CommandBody: "summarize this",
+          RawBody: "summarize this",
+          MediaPaths: ["/tmp/report.pdf"],
+          MediaTypes: ["application/pdf"],
+        },
+      }),
+    );
+
+    expect(applyMediaUnderstandingMock).not.toHaveBeenCalled();
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt?.match(/<file\s+name="report\.pdf"/g)).toHaveLength(1);
+  });
+});
+
+describe("followup queue drain deferred media understanding", () => {
+  it("preprocesses collect batches before synthesizing the followup prompt", async () => {
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: "collect transcript",
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = "collect transcript";
+        params.ctx.Body = "[Audio]\nTranscript:\ncollect transcript";
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: "collect transcript",
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    const items: FollowupRun[] = [
+      createQueuedRun({
+        prompt: "[media attached: /tmp/voice.ogg (audio/ogg)]\nsome text",
+        summaryLine: "some text",
+        originatingChannel: "telegram",
+        originatingTo: "chat:1",
+        run: { messageProvider: "telegram" },
+        mediaContext: {
+          Body: "some text",
+          MediaPaths: ["/tmp/voice.ogg"],
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+      createQueuedRun({
+        prompt: "second text",
+        summaryLine: "second text",
+        originatingChannel: "telegram",
+        originatingTo: "chat:1",
+        run: { messageProvider: "telegram" },
+      }),
+    ];
+
+    await applyDeferredMediaToQueuedRuns(items);
+
+    const prompt = buildCollectPrompt({
+      title: "[Queued messages while agent was busy]",
+      items,
+      renderItem: (item, idx) => `---\nQueued #${idx + 1}\n${item.prompt}`.trim(),
+    });
+
+    expect(prompt).toContain("collect transcript");
+    expect(prompt).toContain("Queued #2\nsecond text");
+    expect(prompt).not.toContain("[media attached: /tmp/voice.ogg");
+  });
+
+  it("preprocesses dropped media items before building overflow summaries", async () => {
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: "overflow transcript",
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = "overflow transcript";
+        params.ctx.Body = "[Audio]\nTranscript:\noverflow transcript";
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: "overflow transcript",
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    const summaryPrompt = await buildMediaAwareQueueSummaryPrompt({
+      dropPolicy: "summarize",
+      droppedCount: 1,
+      summaryLines: ["[media attached: /tmp/voice.ogg (audio/ogg)]"],
+      summaryItems: [
+        createQueuedRun({
+          prompt: "[media attached: /tmp/voice.ogg (audio/ogg)]",
+          summaryLine: "",
+          run: { messageProvider: "telegram" },
+          mediaContext: {
+            Body: "",
+            MediaPaths: ["/tmp/voice.ogg"],
+            MediaTypes: ["audio/ogg"],
+          },
+        }),
+      ],
+      noun: "message",
+    });
+
+    expect(summaryPrompt).toContain("[Queue overflow] Dropped 1 message due to cap.");
+    expect(summaryPrompt).toContain("overflow transcript");
+    expect(summaryPrompt).not.toContain("[media attached: /tmp/voice.ogg");
+  });
 });
diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts
index 64b8a935b2a..9a80d4f78ad 100644
--- a/src/auto-reply/reply/followup-runner.ts
+++ b/src/auto-reply/reply/followup-runner.ts
@@ -13,16 +13,14 @@ import type { SessionEntry } from "../../config/sessions.js";
 import type { TypingMode } from "../../config/types.js";
 import { logVerbose } from "../../globals.js";
 import { registerAgentRunContext } from "../../infra/agent-events.js";
-import { applyMediaUnderstanding } from "../../media-understanding/apply.js";
 import { defaultRuntime } from "../../runtime.js";
 import { isInternalMessageChannel } from "../../utils/message-channel.js";
 import { stripHeartbeatToken } from "../heartbeat.js";
-import { buildInboundMediaNote } from "../media-note.js";
-import type { MsgContext, OriginatingChannelType } from "../templating.js";
+import type { OriginatingChannelType } from "../templating.js";
 import { isSilentReplyText, SILENT_REPLY_TOKEN } from "../tokens.js";
 import type { GetReplyOptions, ReplyPayload } from "../types.js";
 import { resolveRunAuthProfile } from "./agent-runner-utils.js";
-import { parseInlineDirectives } from "./directive-handling.js";
+import { applyDeferredMediaUnderstandingToQueuedRun } from "./followup-media.js";
 import {
   resolveOriginAccountId,
   resolveOriginMessageProvider,
@@ -41,113 +39,6 @@ import { incrementRunCompactionCount, persistRunSessionUsage } from "./session-r
 import { createTypingSignaler } from "./typing-mode.js";
 import type { TypingController } from "./typing.js";
 
-const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]";
-const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool";
-const LEADING_MEDIA_ATTACHED_LINE_RE = /^\[media attached(?: \d+\/\d+)?: [^\r\n]*\]$/;
-const FILE_BLOCK_RE = /<file\s+name="/i;
-
-function stripLeadingMediaAttachedLines(prompt: string): string {
-  const lines = prompt.split("\n");
-  let index = 0;
-  while (index < lines.length) {
-    const trimmed = lines[index]?.trim() ?? "";
-    if (!LEADING_MEDIA_ATTACHED_LINE_RE.test(trimmed)) {
-      break;
-    }
-    index += 1;
-  }
-  return lines.slice(index).join("\n").trim();
-}
-
-function stripLeadingMediaReplyHint(prompt: string): string {
-  const lines = prompt.split("\n");
-  if ((lines[0] ?? "").startsWith(MEDIA_REPLY_HINT_PREFIX)) {
-    return lines.slice(1).join("\n").trim();
-  }
-  return prompt.trim();
-}
-
-function replaceLastOccurrence(
-  value: string,
-  search: string,
-  replacement: string,
-): string | undefined {
-  if (!search) {
-    return undefined;
-  }
-  const index = value.lastIndexOf(search);
-  if (index < 0) {
-    return undefined;
-  }
-  return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`;
-}
-
-function stripInlineDirectives(text: string | undefined): string {
-  return parseInlineDirectives(text ?? "").cleaned.trim();
-}
-
-function normalizeUpdatedBody(params: { originalBody?: string; updatedBody?: string }): string {
-  const updatedBody = params.updatedBody?.trim();
-  if (!updatedBody) {
-    return "";
-  }
-  const originalBody = params.originalBody?.trim();
-  if (!originalBody) {
-    return updatedBody;
-  }
-
-  const cleanedOriginalBody = stripInlineDirectives(originalBody);
-  if (!cleanedOriginalBody) {
-    return updatedBody;
-  }
-  if (updatedBody === originalBody) {
-    return cleanedOriginalBody;
-  }
-  return (
-    replaceLastOccurrence(updatedBody, originalBody, cleanedOriginalBody) ?? updatedBody
-  ).trim();
-}
-
-function rebuildQueuedPromptWithMediaUnderstanding(params: {
-  prompt: string;
-  originalBody?: string;
-  updatedBody?: string;
-  mediaNote?: string;
-}): string {
-  let stripped = stripLeadingMediaAttachedLines(params.prompt);
-  if (!params.mediaNote) {
-    stripped = stripLeadingMediaReplyHint(stripped);
-  }
-
-  const updatedBody = normalizeUpdatedBody({
-    originalBody: params.originalBody,
-    updatedBody: params.updatedBody,
-  });
-  if (!updatedBody) {
-    return [params.mediaNote?.trim(), stripped].filter(Boolean).join("\n").trim();
-  }
-
-  const replacementTargets = [
-    params.originalBody?.trim(),
-    stripInlineDirectives(params.originalBody),
-    MEDIA_ONLY_PLACEHOLDER,
-  ].filter(
-    (value, index, list): value is string => Boolean(value) && list.indexOf(value) === index,
-  );
-
-  let rebuilt = stripped;
-  for (const target of replacementTargets) {
-    const replaced = replaceLastOccurrence(rebuilt, target, updatedBody);
-    if (replaced !== undefined) {
-      rebuilt = replaced;
-      return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim();
-    }
-  }
-
-  rebuilt = [rebuilt, updatedBody].filter(Boolean).join("\n\n");
-  return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim();
-}
-
 export function createFollowupRunner(params: {
   opts?: GetReplyOptions;
   typing: TypingController;
@@ -267,72 +158,7 @@ export function createFollowupRunner(params: {
       let bootstrapPromptWarningSignaturesSeen = resolveBootstrapWarningSignaturesSeen(
         activeSessionEntry?.systemPromptReport,
       );
-
-      // Apply media understanding for followup-queued messages when it was
-      // not applied (or failed) in the primary path.  This ensures voice
-      // notes that arrived while the agent was mid-turn still get transcribed.
-      if (queued.mediaContext && !queued.mediaContext.MediaUnderstanding?.length) {
-        const hasMedia = Boolean(
-          queued.mediaContext.MediaPath?.trim() ||
-          queued.mediaContext.MediaUrl?.trim() ||
-          (Array.isArray(queued.mediaContext.MediaPaths) &&
-            queued.mediaContext.MediaPaths.length > 0) ||
-          (Array.isArray(queued.mediaContext.MediaUrls) &&
-            queued.mediaContext.MediaUrls.length > 0),
-        );
-        if (hasMedia) {
-          try {
-            const resolvedOriginalBody =
-              queued.mediaContext.CommandBody ??
-              queued.mediaContext.RawBody ??
-              queued.mediaContext.Body;
-            const mediaCtx = {
-              ...queued.mediaContext,
-              Body: resolvedOriginalBody,
-            } as MsgContext;
-            const originalBody = resolvedOriginalBody;
-            // Capture whether the resolved body already contains a file block
-            // BEFORE applyMediaUnderstanding mutates it — this detects prior
-            // extraction so we avoid double-inserting.  Checking the body
-            // (not the full queued.prompt) avoids false positives from user
-            // messages that happen to contain literal "<file path=" text.
-            const bodyAlreadyHasFileBlock = FILE_BLOCK_RE.test(resolvedOriginalBody ?? "");
-            const muResult = await applyMediaUnderstanding({
-              ctx: mediaCtx,
-              cfg: queued.run.config,
-              agentDir: queued.run.agentDir,
-              activeModel: {
-                provider: queued.run.provider,
-                model: queued.run.model,
-              },
-            });
-            const shouldRebuildPrompt =
-              muResult.outputs.length > 0 ||
-              muResult.appliedAudio ||
-              muResult.appliedImage ||
-              muResult.appliedVideo ||
-              (muResult.appliedFile && !bodyAlreadyHasFileBlock);
-            if (shouldRebuildPrompt) {
-              // Rebuild the queued prompt from the mutated media context so the
-              // deferred path matches the primary path's prompt shape.
-              const newMediaNote = buildInboundMediaNote(mediaCtx);
-              queued.prompt = rebuildQueuedPromptWithMediaUnderstanding({
-                prompt: queued.prompt,
-                originalBody,
-                updatedBody: mediaCtx.Body,
-                mediaNote: newMediaNote,
-              });
-              logVerbose(
-                `followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage}, video=${muResult.appliedVideo}, file=${muResult.appliedFile})`,
-              );
-            }
-          } catch (err) {
-            logVerbose(
-              `followup: media understanding failed, proceeding with raw content: ${err instanceof Error ? err.message : String(err)}`,
-            );
-          }
-        }
-      }
+      await applyDeferredMediaUnderstandingToQueuedRun(queued, { logLabel: "followup" });
 
       try {
         const fallbackResult = await runWithModelFallback({
diff --git a/src/auto-reply/reply/get-reply-run.ts b/src/auto-reply/reply/get-reply-run.ts
index b4b693b8119..252cffbb364 100644
--- a/src/auto-reply/reply/get-reply-run.ts
+++ b/src/auto-reply/reply/get-reply-run.ts
@@ -493,6 +493,8 @@ export async function runPreparedReply(
         Body: ctx.Body,
         CommandBody: ctx.CommandBody,
         RawBody: ctx.RawBody,
+        Provider: ctx.Provider ?? sessionCtx.Provider,
+        Surface: ctx.Surface ?? sessionCtx.Surface,
         MediaPath: ctx.MediaPath,
         MediaUrl: ctx.MediaUrl,
         MediaType: ctx.MediaType,
diff --git a/src/auto-reply/reply/queue/drain.ts b/src/auto-reply/reply/queue/drain.ts
index 1e2fb33e4e0..68c660fe2b8 100644
--- a/src/auto-reply/reply/queue/drain.ts
+++ b/src/auto-reply/reply/queue/drain.ts
@@ -3,15 +3,17 @@ import { resolveGlobalMap } from "../../../shared/global-singleton.js";
 import {
   buildCollectPrompt,
   beginQueueDrain,
+  buildQueueSummaryLine,
+  buildQueueSummaryPrompt,
   clearQueueSummaryState,
   drainCollectQueueStep,
   drainNextQueueItem,
   hasCrossChannelItems,
-  previewQueueSummaryPrompt,
   waitForQueueDebounce,
 } from "../../../utils/queue-helpers.js";
+import { applyDeferredMediaUnderstandingToQueuedRun } from "../followup-media.js";
 import { isRoutableChannel } from "../route-reply.js";
-import { FOLLOWUP_QUEUES } from "./state.js";
+import { FOLLOWUP_QUEUES, type FollowupQueueState } from "./state.js";
 import type { FollowupRun } from "./types.js";
 
 // Persists the most recent runFollowup callback per queue key so that
@@ -68,6 +70,50 @@ function resolveCrossChannelKey(item: FollowupRun): { cross?: true; key?: string
   };
 }
 
+function clearFollowupQueueSummaryState(queue: FollowupQueueState): void {
+  clearQueueSummaryState(queue);
+  queue.summaryItems = [];
+}
+
+export async function applyDeferredMediaToQueuedRuns(items: FollowupRun[]): Promise<void> {
+  for (const item of items) {
+    await applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" });
+  }
+}
+
+async function resolveSummaryLines(items: FollowupRun[]): Promise<string[]> {
+  const summaryLines: string[] = [];
+  for (const item of items) {
+    await applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" });
+    summaryLines.push(buildQueueSummaryLine(item.summaryLine?.trim() || item.prompt.trim()));
+  }
+  return summaryLines;
+}
+
+export async function buildMediaAwareQueueSummaryPrompt(params: {
+  dropPolicy: FollowupQueueState["dropPolicy"];
+  droppedCount: number;
+  summaryLines: string[];
+  summaryItems: FollowupRun[];
+  noun: string;
+}): Promise<string | undefined> {
+  if (params.dropPolicy !== "summarize" || params.droppedCount <= 0) {
+    return undefined;
+  }
+  const summaryLines =
+    params.summaryItems.length > 0
+      ? await resolveSummaryLines(params.summaryItems)
+      : params.summaryLines;
+  return buildQueueSummaryPrompt({
+    state: {
+      dropPolicy: params.dropPolicy,
+      droppedCount: params.droppedCount,
+      summaryLines: [...summaryLines],
+    },
+    noun: params.noun,
+  });
+}
+
 export function scheduleFollowupDrain(
   key: string,
   runFollowup: (run: FollowupRun) => Promise<void>,
@@ -107,7 +153,14 @@ export function scheduleFollowupDrain(
           }
 
           const items = queue.items.slice();
-          const summary = previewQueueSummaryPrompt({ state: queue, noun: "message" });
+          await applyDeferredMediaToQueuedRuns(items);
+          const summary = await buildMediaAwareQueueSummaryPrompt({
+            dropPolicy: queue.dropPolicy,
+            droppedCount: queue.droppedCount,
+            summaryLines: queue.summaryLines,
+            summaryItems: queue.summaryItems,
+            noun: "message",
+          });
           const run = items.at(-1)?.run ?? queue.lastRun;
           if (!run) {
             break;
@@ -129,12 +182,18 @@ export function scheduleFollowupDrain(
           });
           queue.items.splice(0, items.length);
           if (summary) {
-            clearQueueSummaryState(queue);
+            clearFollowupQueueSummaryState(queue);
           }
           continue;
         }
 
-        const summaryPrompt = previewQueueSummaryPrompt({ state: queue, noun: "message" });
+        const summaryPrompt = await buildMediaAwareQueueSummaryPrompt({
+          dropPolicy: queue.dropPolicy,
+          droppedCount: queue.droppedCount,
+          summaryLines: queue.summaryLines,
+          summaryItems: queue.summaryItems,
+          noun: "message",
+        });
         if (summaryPrompt) {
           const run = queue.lastRun;
           if (!run) {
@@ -155,7 +214,7 @@ export function scheduleFollowupDrain(
           ) {
             break;
           }
-          clearQueueSummaryState(queue);
+          clearFollowupQueueSummaryState(queue);
           continue;
         }
 
diff --git a/src/auto-reply/reply/queue/enqueue.ts b/src/auto-reply/reply/queue/enqueue.ts
index 11da0db98fc..e58cc5ffac5 100644
--- a/src/auto-reply/reply/queue/enqueue.ts
+++ b/src/auto-reply/reply/queue/enqueue.ts
@@ -1,8 +1,8 @@
 import { createDedupeCache } from "../../../infra/dedupe.js";
 import { resolveGlobalSingleton } from "../../../shared/global-singleton.js";
-import { applyQueueDropPolicy, shouldSkipQueueItem } from "../../../utils/queue-helpers.js";
+import { buildQueueSummaryLine, shouldSkipQueueItem } from "../../../utils/queue-helpers.js";
 import { kickFollowupDrainIfIdle } from "./drain.js";
-import { getExistingFollowupQueue, getFollowupQueue } from "./state.js";
+import { getExistingFollowupQueue, getFollowupQueue, type FollowupQueueState } from "./state.js";
 import type { FollowupRun, QueueDedupeMode, QueueSettings } from "./types.js";
 
 /**
@@ -57,6 +57,34 @@ function isRunAlreadyQueued(
   return items.some((item) => item.prompt === run.prompt && hasSameRouting(item));
 }
 
+function applyFollowupQueueDropPolicy(queue: FollowupQueueState): boolean {
+  const cap = queue.cap;
+  if (cap <= 0 || queue.items.length < cap) {
+    return true;
+  }
+  if (queue.dropPolicy === "new") {
+    return false;
+  }
+
+  const dropCount = queue.items.length - cap + 1;
+  const dropped = queue.items.splice(0, dropCount);
+  if (queue.dropPolicy === "summarize") {
+    for (const item of dropped) {
+      queue.droppedCount += 1;
+      queue.summaryItems.push(item);
+      queue.summaryLines.push(
+        buildQueueSummaryLine(item.summaryLine?.trim() || item.prompt.trim()),
+      );
+    }
+    const limit = Math.max(0, cap);
+    while (queue.summaryLines.length > limit) {
+      queue.summaryLines.shift();
+      queue.summaryItems.shift();
+    }
+  }
+  return true;
+}
+
 export function enqueueFollowupRun(
   key: string,
   run: FollowupRun,
@@ -83,10 +111,7 @@ export function enqueueFollowupRun(
   queue.lastEnqueuedAt = Date.now();
   queue.lastRun = run.run;
 
-  const shouldEnqueue = applyQueueDropPolicy({
-    queue,
-    summarize: (item) => item.summaryLine?.trim() || item.prompt.trim(),
-  });
+  const shouldEnqueue = applyFollowupQueueDropPolicy(queue);
   if (!shouldEnqueue) {
     return false;
   }
diff --git a/src/auto-reply/reply/queue/state.ts b/src/auto-reply/reply/queue/state.ts
index 44208e727dd..94021dd0c4c 100644
--- a/src/auto-reply/reply/queue/state.ts
+++ b/src/auto-reply/reply/queue/state.ts
@@ -4,6 +4,7 @@ import type { FollowupRun, QueueDropPolicy, QueueMode, QueueSettings } from "./t
 
 export type FollowupQueueState = {
   items: FollowupRun[];
+  summaryItems: FollowupRun[];
   draining: boolean;
   lastEnqueuedAt: number;
   mode: QueueMode;
@@ -47,6 +48,7 @@ export function getFollowupQueue(key: string, settings: QueueSettings): Followup
 
   const created: FollowupQueueState = {
     items: [],
+    summaryItems: [],
     draining: false,
     lastEnqueuedAt: 0,
     mode: settings.mode,
@@ -78,6 +80,7 @@ export function clearFollowupQueue(key: string): number {
   }
   const cleared = queue.items.length + queue.droppedCount;
   queue.items.length = 0;
+  queue.summaryItems.length = 0;
   queue.droppedCount = 0;
   queue.summaryLines = [];
   queue.lastRun = undefined;
diff --git a/src/auto-reply/reply/queue/types.ts b/src/auto-reply/reply/queue/types.ts
index 637ceb0a149..291059a28d7 100644
--- a/src/auto-reply/reply/queue/types.ts
+++ b/src/auto-reply/reply/queue/types.ts
@@ -32,6 +32,8 @@ export type FollowupMediaContext = {
   Body?: string;
   CommandBody?: string;
   RawBody?: string;
+  Provider?: string;
+  Surface?: string;
   MediaPath?: string;
   MediaUrl?: string;
   MediaType?: string;
@@ -47,6 +49,7 @@ export type FollowupMediaContext = {
   OriginatingTo?: string;
   AccountId?: string;
   MessageThreadId?: string | number;
+  DeferredMediaApplied?: boolean;
 };
 
 export type FollowupRun = {

From f1e023c3de3412cc76622ee737c9215a7beb54cc Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sat, 14 Mar 2026 23:03:20 -0400
Subject: [PATCH 07/17] fix: set DeferredMediaApplied on error and strip old
 file blocks on rebuild

---
 src/auto-reply/reply/followup-media.ts       |  15 +++
 src/auto-reply/reply/followup-runner.test.ts | 111 +++++++++++++++++++
 2 files changed, 126 insertions(+)

diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts
index 5a014e63f9b..5340d0df99a 100644
--- a/src/auto-reply/reply/followup-media.ts
+++ b/src/auto-reply/reply/followup-media.ts
@@ -13,6 +13,11 @@ const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]";
 const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool";
 const LEADING_MEDIA_ATTACHED_LINE_RE = /^\[media attached(?: \d+\/\d+)?: [^\r\n]*\]$/;
 const FILE_BLOCK_RE = /<file\s+name="/i;
+const FILE_BLOCK_FULL_RE = /<file\s+name="[^"]*"[^>]*>[\s\S]*?<\/file>\n?/gi;
+
+function stripExistingFileBlocks(text: string): string {
+  return text.replace(FILE_BLOCK_FULL_RE, "").trim();
+}
 
 function stripLeadingMediaAttachedLines(prompt: string): string {
   const lines = prompt.split("\n");
@@ -87,6 +92,15 @@ function rebuildQueuedPromptWithMediaUnderstanding(params: {
     stripped = stripLeadingMediaReplyHint(stripped);
   }
 
+  // Strip pre-existing file blocks from the prompt when the updated body
+  // contains new file blocks.  Mixed messages (audio + PDF) can arrive with
+  // file extraction already applied in the primary path; without this strip
+  // the old block stays in the prompt while the updated body adds a new one,
+  // duplicating potentially large file payloads.
+  if (params.updatedBody && FILE_BLOCK_RE.test(params.updatedBody)) {
+    stripped = stripExistingFileBlocks(stripped);
+  }
+
   const updatedBody = normalizeUpdatedBody({
     originalBody: params.originalBody,
     updatedBody: params.updatedBody,
@@ -234,6 +248,7 @@ export async function applyDeferredMediaUnderstandingToQueuedRun(
       updatedBody: shouldRebuildPrompt ? mediaCtx.Body : undefined,
     });
   } catch (err) {
+    mediaContext.DeferredMediaApplied = true;
     logVerbose(
       `${params.logLabel ?? "followup"}: media understanding failed, proceeding with raw content: ${err instanceof Error ? err.message : String(err)}`,
     );
diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts
index c3e75e6b856..cd1951aa748 100644
--- a/src/auto-reply/reply/followup-runner.test.ts
+++ b/src/auto-reply/reply/followup-runner.test.ts
@@ -1808,6 +1808,117 @@ describe("createFollowupRunner media understanding", () => {
     expect(matches?.length).toBe(1);
   });
 
+  it("does not duplicate file blocks for mixed audio+file messages re-processed in followup", async () => {
+    const existingFileBlock =
+      '<file name="report.pdf" mime="application/pdf">\nold extracted content\n</file>';
+    const newFileBlock =
+      '<file name="report.pdf" mime="application/pdf">\nnew extracted content\n</file>';
+    const transcriptText = "Mixed message transcript";
+
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}\n\nanalyze this\n\n${newFileBlock}`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    // Simulate a mixed message where the primary path already extracted the
+    // PDF (file block is in the prompt) but audio transcription failed.
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached 1/2: /tmp/voice.ogg]\n[media attached 2/2: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nanalyze this\n\n${existingFileBlock}`,
+        mediaContext: {
+          Body: `analyze this\n\n${existingFileBlock}`,
+          CommandBody: "analyze this",
+          RawBody: "analyze this",
+          MediaPaths: ["/tmp/voice.ogg", "/tmp/report.pdf"],
+          MediaTypes: ["audio/ogg", "application/pdf"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    // Should contain the transcript
+    expect(agentCall?.prompt).toContain(transcriptText);
+    // Should have exactly one file block (the new one), not two
+    expect(agentCall?.prompt?.match(/<file\s+name="report\.pdf"/g)).toHaveLength(1);
+    expect(agentCall?.prompt).toContain("new extracted content");
+    expect(agentCall?.prompt).not.toContain("old extracted content");
+  });
+
+  it("sets DeferredMediaApplied when media understanding throws", async () => {
+    applyMediaUnderstandingMock.mockRejectedValueOnce(
+      new Error("transcription service unavailable"),
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "fallback reply" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    const queued = createQueuedRun({
+      prompt: "[media attached: /tmp/voice.ogg (audio/ogg)]\nsome text",
+      mediaContext: {
+        Body: "some text",
+        MediaPaths: ["/tmp/voice.ogg"],
+        MediaTypes: ["audio/ogg"],
+      },
+    });
+
+    await runner(queued);
+
+    // DeferredMediaApplied should be set so re-runs don't retry
+    expect(queued.mediaContext?.DeferredMediaApplied).toBe(true);
+
+    // The agent should still be called with the raw prompt
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("some text");
+  });
+
   it("does not re-apply file extraction when the stored media body already has a file block", async () => {
     const fileBlock = '<file name="report.pdf" mime="application/pdf">\nreport content\n</file>';
     runEmbeddedPiAgentMock.mockResolvedValueOnce({

From ad0a1bdc5e7bd3ecf47bd41d652bd522f978a6ac Mon Sep 17 00:00:00 2001
From: Joseph Krug <joeykrug@gmail.com>
Date: Sat, 14 Mar 2026 23:26:27 -0400
Subject: [PATCH 08/17] Update src/auto-reply/reply/followup-runner.test.ts

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
---
 src/auto-reply/reply/followup-runner.test.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts
index cd1951aa748..16f2b8eec90 100644
--- a/src/auto-reply/reply/followup-runner.test.ts
+++ b/src/auto-reply/reply/followup-runner.test.ts
@@ -1650,7 +1650,7 @@ describe("createFollowupRunner media understanding", () => {
     expect(agentCall?.prompt).toContain("Run `/think high` literally in the shell example.");
   });
 
-  it("rebuilds the prompt when image understanding mutates the body without outputs", async () => {
+  it("rebuilds the prompt when image understanding mutates the body alongside existing text", async () => {
     applyMediaUnderstandingMock.mockImplementationOnce(
       async (params: { ctx: Record<string, unknown> }) => {
         params.ctx.Body = "some text\n\n[Image summary]\nA whiteboard with action items.";

From f890bc75dc4b752015aa39ca259043a0ac14cc63 Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sat, 14 Mar 2026 23:51:22 -0400
Subject: [PATCH 09/17] fix: address remaining review feedback on followup
 media

- Scope file-block stripping to body region only, preserving file blocks
  in quoted/replied text and thread history above the body
- Gate file-extraction skip on mutation evidence (Body differs from
  resolved original) instead of raw '<file name=' text matching to avoid
  false-positives on user messages containing literal XML
- Document collect-mode scope limitation in applyDeferredMediaUnderstandingToQueuedRun
- Rename duplicate test description to distinguish body-alongside-text case
- Prefer updated prompt over original summaryLine in overflow summaries
  so captioned voice note transcripts are surfaced
---
 src/auto-reply/reply/followup-media.ts | 26 ++++++++++++++++++++------
 src/auto-reply/reply/queue/drain.ts    |  4 +++-
 2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts
index 5340d0df99a..61d3b9f9cfb 100644
--- a/src/auto-reply/reply/followup-media.ts
+++ b/src/auto-reply/reply/followup-media.ts
@@ -92,13 +92,21 @@ function rebuildQueuedPromptWithMediaUnderstanding(params: {
     stripped = stripLeadingMediaReplyHint(stripped);
   }
 
-  // Strip pre-existing file blocks from the prompt when the updated body
+  // Strip pre-existing file blocks from the body region when the updated body
   // contains new file blocks.  Mixed messages (audio + PDF) can arrive with
   // file extraction already applied in the primary path; without this strip
   // the old block stays in the prompt while the updated body adds a new one,
   // duplicating potentially large file payloads.
+  // Scope stripping to the body segment so quoted/replied text and thread
+  // history above the body retain any legitimate <file> blocks.
   if (params.updatedBody && FILE_BLOCK_RE.test(params.updatedBody)) {
-    stripped = stripExistingFileBlocks(stripped);
+    const bodyTarget = params.originalBody?.trim();
+    const bodyIdx = bodyTarget ? stripped.lastIndexOf(bodyTarget) : -1;
+    if (bodyIdx >= 0) {
+      stripped = stripped.slice(0, bodyIdx) + stripExistingFileBlocks(stripped.slice(bodyIdx));
+    } else {
+      stripped = stripExistingFileBlocks(stripped);
+    }
   }
 
   const updatedBody = normalizeUpdatedBody({
@@ -176,6 +184,9 @@ export async function applyDeferredMediaUnderstandingToQueuedRun(
   queued: FollowupRun,
   params: { logLabel?: string } = {},
 ): Promise<void> {
+  // NOTE: collect-mode and overflow-summary queue drains create synthetic
+  // followup runs without mediaContext — those paths are not covered here
+  // and rely on their own prompt-building logic in queue/drain.ts.
   const mediaContext = queued.mediaContext;
   if (!mediaContext || mediaContext.DeferredMediaApplied) {
     return;
@@ -191,10 +202,13 @@ export async function applyDeferredMediaUnderstandingToQueuedRun(
 
   const resolvedOriginalBody =
     mediaContext.CommandBody ?? mediaContext.RawBody ?? mediaContext.Body;
-  const bodyAlreadyHasFileBlock =
-    FILE_BLOCK_RE.test(resolvedOriginalBody ?? "") || FILE_BLOCK_RE.test(mediaContext.Body ?? "");
+  // Detect file extraction from the primary path by checking whether Body was
+  // mutated AND contains file blocks.  This avoids false-positives when a user
+  // message literally contains "<file name=" text.
+  const fileExtractionAlreadyApplied =
+    mediaContext.Body !== resolvedOriginalBody && FILE_BLOCK_RE.test(mediaContext.Body ?? "");
 
-  if (bodyAlreadyHasFileBlock && hasOnlyFileLikeAttachments(mediaContext)) {
+  if (fileExtractionAlreadyApplied && hasOnlyFileLikeAttachments(mediaContext)) {
     mediaContext.DeferredMediaApplied = true;
     return;
   }
@@ -227,7 +241,7 @@ export async function applyDeferredMediaUnderstandingToQueuedRun(
       muResult.appliedAudio ||
       muResult.appliedImage ||
       muResult.appliedVideo ||
-      (muResult.appliedFile && !bodyAlreadyHasFileBlock);
+      (muResult.appliedFile && !fileExtractionAlreadyApplied);
 
     if (shouldRebuildPrompt) {
       const newMediaNote = buildInboundMediaNote(mediaCtx);
diff --git a/src/auto-reply/reply/queue/drain.ts b/src/auto-reply/reply/queue/drain.ts
index 68c660fe2b8..471c94200ba 100644
--- a/src/auto-reply/reply/queue/drain.ts
+++ b/src/auto-reply/reply/queue/drain.ts
@@ -85,7 +85,9 @@ async function resolveSummaryLines(items: FollowupRun[]): Promise<string[]> {
   const summaryLines: string[] = [];
   for (const item of items) {
     await applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" });
-    summaryLines.push(buildQueueSummaryLine(item.summaryLine?.trim() || item.prompt.trim()));
+    // After deferred media, prefer the updated prompt (which includes transcripts)
+    // over the original summaryLine (which may just be the caption text).
+    summaryLines.push(buildQueueSummaryLine(item.prompt.trim() || item.summaryLine?.trim() || ""));
   }
   return summaryLines;
 }

From 0ca772166f6c23b089d93a673960f9e1cf861154 Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sun, 15 Mar 2026 00:40:12 -0400
Subject: [PATCH 10/17] Auto-reply: fix followup media prompt rebuild

---
 src/auto-reply/reply/followup-media.ts       |  55 +++++--
 src/auto-reply/reply/followup-runner.test.ts | 143 +++++++++++++++++++
 2 files changed, 183 insertions(+), 15 deletions(-)

diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts
index 61d3b9f9cfb..d862a77875b 100644
--- a/src/auto-reply/reply/followup-media.ts
+++ b/src/auto-reply/reply/followup-media.ts
@@ -55,6 +55,30 @@ function replaceLastOccurrence(
   return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`;
 }
 
+function findFirstOccurrenceBeforeFileBlocks(value: string, search: string): number {
+  if (!search) {
+    return -1;
+  }
+  const fileBlockIndex = value.search(FILE_BLOCK_RE);
+  const bodyRegion = fileBlockIndex >= 0 ? value.slice(0, fileBlockIndex) : value;
+  return bodyRegion.indexOf(search);
+}
+
+function replaceFirstOccurrenceBeforeFileBlocks(
+  value: string,
+  search: string,
+  replacement: string,
+): string | undefined {
+  if (!search) {
+    return undefined;
+  }
+  const index = findFirstOccurrenceBeforeFileBlocks(value, search);
+  if (index < 0) {
+    return undefined;
+  }
+  return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`;
+}
+
 function stripInlineDirectives(text: string | undefined): string {
   return parseInlineDirectives(text ?? "").cleaned.trim();
 }
@@ -92,20 +116,29 @@ function rebuildQueuedPromptWithMediaUnderstanding(params: {
     stripped = stripLeadingMediaReplyHint(stripped);
   }
 
+  const replacementTargets = [
+    params.originalBody?.trim(),
+    stripInlineDirectives(params.originalBody),
+    MEDIA_ONLY_PLACEHOLDER,
+  ].filter(
+    (value, index, list): value is string => Boolean(value) && list.indexOf(value) === index,
+  );
+
   // Strip pre-existing file blocks from the body region when the updated body
   // contains new file blocks.  Mixed messages (audio + PDF) can arrive with
   // file extraction already applied in the primary path; without this strip
   // the old block stays in the prompt while the updated body adds a new one,
   // duplicating potentially large file payloads.
-  // Scope stripping to the body segment so quoted/replied text and thread
-  // history above the body retain any legitimate <file> blocks.
+  // Scope stripping to the confirmed body segment so quoted/replied text,
+  // thread history above the body, and prompts whose original body no longer
+  // appears all retain any legitimate <file> blocks.
   if (params.updatedBody && FILE_BLOCK_RE.test(params.updatedBody)) {
-    const bodyTarget = params.originalBody?.trim();
-    const bodyIdx = bodyTarget ? stripped.lastIndexOf(bodyTarget) : -1;
+    const bodyIdx =
+      replacementTargets
+        .map((target) => findFirstOccurrenceBeforeFileBlocks(stripped, target))
+        .find((index) => index >= 0) ?? -1;
     if (bodyIdx >= 0) {
       stripped = stripped.slice(0, bodyIdx) + stripExistingFileBlocks(stripped.slice(bodyIdx));
-    } else {
-      stripped = stripExistingFileBlocks(stripped);
     }
   }
 
@@ -117,17 +150,9 @@ function rebuildQueuedPromptWithMediaUnderstanding(params: {
     return [params.mediaNote?.trim(), stripped].filter(Boolean).join("\n").trim();
   }
 
-  const replacementTargets = [
-    params.originalBody?.trim(),
-    stripInlineDirectives(params.originalBody),
-    MEDIA_ONLY_PLACEHOLDER,
-  ].filter(
-    (value, index, list): value is string => Boolean(value) && list.indexOf(value) === index,
-  );
-
   let rebuilt = stripped;
   for (const target of replacementTargets) {
-    const replaced = replaceLastOccurrence(rebuilt, target, updatedBody);
+    const replaced = replaceFirstOccurrenceBeforeFileBlocks(rebuilt, target, updatedBody);
     if (replaced !== undefined) {
       rebuilt = replaced;
       return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim();
diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts
index 16f2b8eec90..d85223b1afe 100644
--- a/src/auto-reply/reply/followup-runner.test.ts
+++ b/src/auto-reply/reply/followup-runner.test.ts
@@ -1882,6 +1882,149 @@ describe("createFollowupRunner media understanding", () => {
     expect(agentCall?.prompt).not.toContain("old extracted content");
   });
 
+  it("preserves unrelated file blocks when the original body is absent from the prompt", async () => {
+    const quotedFileBlock =
+      '<file name="thread.pdf" mime="application/pdf">\nquoted thread attachment\n</file>';
+    const existingFileBlock =
+      '<file name="report.pdf" mime="application/pdf">\nold extracted content\n</file>';
+    const newFileBlock =
+      '<file name="report.pdf" mime="application/pdf">\nnew extracted content\n</file>';
+    const transcriptText = "Transcript from deferred audio";
+
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}\n\nsummarize this\n\n${newFileBlock}`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached 1/2: /tmp/voice.ogg]\n[media attached 2/2: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nQuoted thread above\n\n${quotedFileBlock}`,
+        mediaContext: {
+          Body: `summarize this\n\n${existingFileBlock}`,
+          CommandBody: "summarize this",
+          RawBody: "summarize this",
+          MediaPaths: ["/tmp/voice.ogg", "/tmp/report.pdf"],
+          MediaTypes: ["audio/ogg", "application/pdf"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("Quoted thread above");
+    expect(agentCall?.prompt).toContain(quotedFileBlock);
+    expect(agentCall?.prompt).toContain(newFileBlock);
+    expect(agentCall?.prompt?.match(/<file\s+name="/g)).toHaveLength(2);
+  });
+
+  it("replaces the visible body before file blocks instead of matching file content", async () => {
+    const existingFileBlock =
+      '<file name="report.pdf" mime="application/pdf">\nsummary notes:\nsummarize this\n</file>';
+    const transcriptText = "Transcript from deferred audio";
+
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}\n\nsummarize this`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached 1/2: /tmp/voice.ogg]\n[media attached 2/2: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nsummarize this\n\n${existingFileBlock}`,
+        mediaContext: {
+          Body: `summarize this\n\n${existingFileBlock}`,
+          CommandBody: "summarize this",
+          RawBody: "summarize this",
+          MediaPaths: ["/tmp/voice.ogg", "/tmp/report.pdf"],
+          MediaTypes: ["audio/ogg", "application/pdf"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    const transcriptBlock = `[Audio]\nTranscript:\n${transcriptText}\n\nsummarize this`;
+    expect(agentCall?.prompt).toContain(existingFileBlock);
+    expect(agentCall?.prompt).toContain(transcriptBlock);
+    expect(agentCall?.prompt?.indexOf(transcriptBlock)).toBeGreaterThan(-1);
+    expect(agentCall?.prompt?.indexOf(transcriptBlock)).toBeLessThan(
+      agentCall?.prompt?.indexOf(existingFileBlock) ?? -1,
+    );
+  });
+
   it("sets DeferredMediaApplied when media understanding throws", async () => {
     applyMediaUnderstandingMock.mockRejectedValueOnce(
       new Error("transcription service unavailable"),

From dc8cdfce5c7b23051291b6d5bd56989db0faa92a Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sun, 15 Mar 2026 12:12:12 -0400
Subject: [PATCH 11/17] fix: address Codex and Greptile review comments on
 #46454

Replace regex-based file extraction detection (FILE_BLOCK_RE.test) with a
DeferredFileBlocksExtracted mutation marker on FollowupMediaContext.  The old
approach scanned user body text for '<file name=' patterns, which could
false-positive on literal user messages.  The new approach compares Body
against RawBody (never mutated by the primary path) to detect file extraction,
then stores a boolean marker for subsequent checks.

Fixes 2 & 4 (linked):
- Added DeferredFileBlocksExtracted marker to FollowupMediaContext type
- Mutation detection uses RawBody (not CommandBody) as reference to avoid
  false-positives when /think directives differ between Body and CommandBody
- snapshotUpdatedMediaContext propagates the marker via appliedFile

Fixes 1, 3, 5, 6 were already addressed by prior commits on this branch.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/auto-reply/reply/followup-media.ts | 33 ++++++++++++++++++++------
 src/auto-reply/reply/queue/types.ts    |  7 ++++++
 2 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts
index d862a77875b..d9202554526 100644
--- a/src/auto-reply/reply/followup-media.ts
+++ b/src/auto-reply/reply/followup-media.ts
@@ -183,10 +183,18 @@ function hasOnlyFileLikeAttachments(mediaContext: FollowupMediaContext): boolean
   );
 }
 
+function hasAnyFileAttachments(mediaContext: FollowupMediaContext): boolean {
+  return normalizeAttachments(mediaContext as MsgContext).some((attachment) => {
+    const kind = resolveAttachmentKind(attachment);
+    return kind !== "audio" && kind !== "image" && kind !== "video";
+  });
+}
+
 function snapshotUpdatedMediaContext(params: {
   original: FollowupMediaContext;
   mediaCtx: MsgContext;
   updatedBody?: string;
+  appliedFile?: boolean;
 }): FollowupMediaContext {
   return {
     ...params.original,
@@ -202,6 +210,8 @@ function snapshotUpdatedMediaContext(params: {
       ? [...params.mediaCtx.MediaUnderstandingDecisions]
       : params.original.MediaUnderstandingDecisions,
     DeferredMediaApplied: true,
+    DeferredFileBlocksExtracted:
+      params.original.DeferredFileBlocksExtracted || params.appliedFile || undefined,
   };
 }
 
@@ -227,13 +237,21 @@ export async function applyDeferredMediaUnderstandingToQueuedRun(
 
   const resolvedOriginalBody =
     mediaContext.CommandBody ?? mediaContext.RawBody ?? mediaContext.Body;
-  // Detect file extraction from the primary path by checking whether Body was
-  // mutated AND contains file blocks.  This avoids false-positives when a user
-  // message literally contains "<file name=" text.
-  const fileExtractionAlreadyApplied =
-    mediaContext.Body !== resolvedOriginalBody && FILE_BLOCK_RE.test(mediaContext.Body ?? "");
+  // Detect file extraction from the primary path via body mutation instead of
+  // scanning for literal '<file name=' patterns (which false-positives on user
+  // text).  Compare Body against RawBody (never mutated by the primary path's
+  // media/file processing) rather than CommandBody (which differs from Body
+  // when inline directives like /think were stripped).
+  const referenceBody = mediaContext.RawBody ?? mediaContext.Body;
+  if (
+    !mediaContext.DeferredFileBlocksExtracted &&
+    mediaContext.Body !== referenceBody &&
+    hasAnyFileAttachments(mediaContext)
+  ) {
+    mediaContext.DeferredFileBlocksExtracted = true;
+  }
 
-  if (fileExtractionAlreadyApplied && hasOnlyFileLikeAttachments(mediaContext)) {
+  if (mediaContext.DeferredFileBlocksExtracted && hasOnlyFileLikeAttachments(mediaContext)) {
     mediaContext.DeferredMediaApplied = true;
     return;
   }
@@ -266,7 +284,7 @@ export async function applyDeferredMediaUnderstandingToQueuedRun(
       muResult.appliedAudio ||
       muResult.appliedImage ||
       muResult.appliedVideo ||
-      (muResult.appliedFile && !fileExtractionAlreadyApplied);
+      (muResult.appliedFile && !mediaContext.DeferredFileBlocksExtracted);
 
     if (shouldRebuildPrompt) {
       const newMediaNote = buildInboundMediaNote(mediaCtx);
@@ -285,6 +303,7 @@ export async function applyDeferredMediaUnderstandingToQueuedRun(
       original: mediaContext,
       mediaCtx,
       updatedBody: shouldRebuildPrompt ? mediaCtx.Body : undefined,
+      appliedFile: muResult.appliedFile,
     });
   } catch (err) {
     mediaContext.DeferredMediaApplied = true;
diff --git a/src/auto-reply/reply/queue/types.ts b/src/auto-reply/reply/queue/types.ts
index 291059a28d7..e1e9e20e5c8 100644
--- a/src/auto-reply/reply/queue/types.ts
+++ b/src/auto-reply/reply/queue/types.ts
@@ -50,6 +50,13 @@ export type FollowupMediaContext = {
   AccountId?: string;
   MessageThreadId?: string | number;
   DeferredMediaApplied?: boolean;
+  /**
+   * Set when file extraction has already been applied to Body (either in the
+   * primary path or by a previous deferred-media run).  Checked instead of
+   * scanning body text for `<file` patterns to avoid false-positives on user
+   * messages that contain literal XML-like text.
+   */
+  DeferredFileBlocksExtracted?: boolean;
 };
 
 export type FollowupRun = {

From 8f6d5a278f0bfa455e0bc9d381b82a72d3fae73e Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sun, 15 Mar 2026 15:51:14 -0400
Subject: [PATCH 12/17] fix: use file-block-safe replacement in
 normalizeUpdatedBody and trailing fallback (#46454)

---
 src/auto-reply/reply/followup-media.test.ts | 107 ++++++++++++++++++++
 src/auto-reply/reply/followup-media.ts      |  44 +++++++-
 2 files changed, 150 insertions(+), 1 deletion(-)
 create mode 100644 src/auto-reply/reply/followup-media.test.ts

diff --git a/src/auto-reply/reply/followup-media.test.ts b/src/auto-reply/reply/followup-media.test.ts
new file mode 100644
index 00000000000..d8ba453f801
--- /dev/null
+++ b/src/auto-reply/reply/followup-media.test.ts
@@ -0,0 +1,107 @@
+import { describe, expect, it } from "vitest";
+import {
+  _findLastOccurrenceBeforeFileBlocks as findLastOccurrenceBeforeFileBlocks,
+  _normalizeUpdatedBody as normalizeUpdatedBody,
+  _rebuildQueuedPromptWithMediaUnderstanding as rebuildQueuedPromptWithMediaUnderstanding,
+} from "./followup-media.js";
+
+const FILE_BLOCK = '<file name="doc.pdf" type="application/pdf">\nPDF content\n</file>';
+
+describe("findLastOccurrenceBeforeFileBlocks", () => {
+  it("returns -1 for empty search", () => {
+    expect(findLastOccurrenceBeforeFileBlocks("hello", "")).toBe(-1);
+  });
+
+  it("finds last occurrence in body region before file blocks", () => {
+    const value = `hello world hello\n${FILE_BLOCK}`;
+    // "hello" appears at 0 and 12 — both before the file block
+    expect(findLastOccurrenceBeforeFileBlocks(value, "hello")).toBe(12);
+  });
+
+  it("does not match inside file block content", () => {
+    const value = `some text\n${FILE_BLOCK}\nPDF content`;
+    // "PDF content" appears in the file block and after it, but the body region
+    // (before <file) is just "some text\n" — no match there.
+    expect(findLastOccurrenceBeforeFileBlocks(value, "PDF content")).toBe(-1);
+  });
+
+  it("uses lastIndexOf in fallback when search itself contains file blocks", () => {
+    // When the search string contains a <file> block, it can't appear in the
+    // body-only region, so the fallback searches the full value.
+    const bodyWithFile = `caption\n${FILE_BLOCK}`;
+    const value = `previous\n${bodyWithFile}\nlater\n${bodyWithFile}`;
+    // Should find the *last* (trailing) occurrence
+    const expected = value.lastIndexOf(bodyWithFile);
+    expect(findLastOccurrenceBeforeFileBlocks(value, bodyWithFile)).toBe(expected);
+    expect(expected).toBeGreaterThan(value.indexOf(bodyWithFile));
+  });
+
+  it("returns index when no file blocks exist in value", () => {
+    expect(findLastOccurrenceBeforeFileBlocks("abc abc", "abc")).toBe(4);
+  });
+});
+
+describe("normalizeUpdatedBody", () => {
+  it("returns empty string when updatedBody is empty", () => {
+    expect(normalizeUpdatedBody({ originalBody: "foo", updatedBody: "" })).toBe("");
+  });
+
+  it("returns updatedBody when originalBody is empty", () => {
+    expect(normalizeUpdatedBody({ updatedBody: "hello" })).toBe("hello");
+  });
+
+  it("strips directives when updatedBody equals originalBody", () => {
+    const body = "/think high tell me a joke";
+    const result = normalizeUpdatedBody({ originalBody: body, updatedBody: body });
+    expect(result).toBe("tell me a joke");
+  });
+
+  it("does not corrupt file block content during directive cleanup", () => {
+    const originalBody = "/think high tell me about this file";
+    // updatedBody has the original body plus a file block appended by media processing
+    const updatedBody = `${originalBody}\n${FILE_BLOCK}`;
+    const result = normalizeUpdatedBody({ originalBody, updatedBody });
+    // The directive should be stripped from the body portion, file block preserved
+    expect(result).toContain("tell me about this file");
+    expect(result).toContain(FILE_BLOCK);
+    expect(result).not.toContain("/think");
+  });
+
+  it("replaces in body region, not inside file blocks", () => {
+    const originalBody = "PDF content";
+    const updatedBody = `PDF content\n<file name="doc.pdf" type="application/pdf">\nPDF content\n</file>`;
+    // The replacement should target the body region "PDF content" before the
+    // file block, not the "PDF content" inside the <file> block.
+    const result = normalizeUpdatedBody({ originalBody, updatedBody });
+    // With no directives to strip, original === cleaned, updatedBody !== originalBody
+    // because updatedBody has the file block appended.  The replacement targets the
+    // body-region occurrence.
+    expect(result).toContain('<file name="doc.pdf"');
+    expect(result).toContain("PDF content\n</file>");
+  });
+});
+
+describe("rebuildQueuedPromptWithMediaUnderstanding", () => {
+  it("replaces original body with updated body in prompt", () => {
+    const result = rebuildQueuedPromptWithMediaUnderstanding({
+      prompt: "thread context\nhello world",
+      originalBody: "hello world",
+      updatedBody: 'hello world\n<file name="a.pdf">data</file>',
+    });
+    expect(result).toContain('<file name="a.pdf">data</file>');
+    expect(result).toContain("thread context");
+  });
+
+  it("preserves file blocks in thread history when body is replaced", () => {
+    const prompt = `history\n<file name="old.pdf">old</file>\nhello world`;
+    const result = rebuildQueuedPromptWithMediaUnderstanding({
+      prompt,
+      originalBody: "hello world",
+      updatedBody: "hello world transcribed",
+    });
+    // The old file block from history should be preserved since updatedBody
+    // has no file blocks of its own.
+    expect(result).toContain('<file name="old.pdf">old</file>');
+    expect(result).toContain("hello world transcribed");
+  });
+});
diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts
index d9202554526..f40ec91f109 100644
--- a/src/auto-reply/reply/followup-media.ts
+++ b/src/auto-reply/reply/followup-media.ts
@@ -64,6 +64,40 @@ function findFirstOccurrenceBeforeFileBlocks(value: string, search: string): num
   return bodyRegion.indexOf(search);
 }
 
+function findLastOccurrenceBeforeFileBlocks(value: string, search: string): number {
+  if (!search) {
+    return -1;
+  }
+  const fileBlockIndex = value.search(FILE_BLOCK_RE);
+  const bodyRegion = fileBlockIndex >= 0 ? value.slice(0, fileBlockIndex) : value;
+  const index = bodyRegion.lastIndexOf(search);
+  if (index >= 0) {
+    return index;
+  }
+  // Fallback: search string itself contains file blocks — it can't appear in the
+  // body-only region.  Search the full value with lastIndexOf to pick the trailing
+  // (most recent) occurrence when thread/history has identical <file> bodies.
+  if (FILE_BLOCK_RE.test(search)) {
+    return value.lastIndexOf(search);
+  }
+  return -1;
+}
+
+function replaceLastOccurrenceBeforeFileBlocks(
+  value: string,
+  search: string,
+  replacement: string,
+): string | undefined {
+  if (!search) {
+    return undefined;
+  }
+  const index = findLastOccurrenceBeforeFileBlocks(value, search);
+  if (index < 0) {
+    return undefined;
+  }
+  return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`;
+}
+
 function replaceFirstOccurrenceBeforeFileBlocks(
   value: string,
   search: string,
@@ -101,7 +135,8 @@ function normalizeUpdatedBody(params: { originalBody?: string; updatedBody?: str
     return cleanedOriginalBody;
   }
   return (
-    replaceLastOccurrence(updatedBody, originalBody, cleanedOriginalBody) ?? updatedBody
+    replaceLastOccurrenceBeforeFileBlocks(updatedBody, originalBody, cleanedOriginalBody) ??
+    updatedBody
   ).trim();
 }
 
@@ -215,6 +250,13 @@ function snapshotUpdatedMediaContext(params: {
   };
 }
 
+// Exported for unit testing — these are pure string helpers with no side effects.
+export {
+  findLastOccurrenceBeforeFileBlocks as _findLastOccurrenceBeforeFileBlocks,
+  normalizeUpdatedBody as _normalizeUpdatedBody,
+  rebuildQueuedPromptWithMediaUnderstanding as _rebuildQueuedPromptWithMediaUnderstanding,
+};
+
 export async function applyDeferredMediaUnderstandingToQueuedRun(
   queued: FollowupRun,
   params: { logLabel?: string } = {},

From eb59b9c19d426296ef05f36a6b3c8ec10ca135a0 Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sun, 15 Mar 2026 17:34:56 -0400
Subject: [PATCH 13/17] fix: trailing body match and RawBody-missing extraction
 detection (#46454)

---
 src/auto-reply/reply/followup-media.ts       | 171 ++++++++++++-------
 src/auto-reply/reply/followup-runner.test.ts | 158 +++++++++++++++++
 src/auto-reply/reply/queue/drain.ts          |   9 +-
 3 files changed, 272 insertions(+), 66 deletions(-)

diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts
index f40ec91f109..425bdd601ea 100644
--- a/src/auto-reply/reply/followup-media.ts
+++ b/src/auto-reply/reply/followup-media.ts
@@ -1,3 +1,4 @@
+import path from "node:path";
 import { logVerbose } from "../../globals.js";
 import { applyMediaUnderstanding } from "../../media-understanding/apply.js";
 import {
@@ -40,30 +41,6 @@ function stripLeadingMediaReplyHint(prompt: string): string {
   return prompt.trim();
 }
 
-function replaceLastOccurrence(
-  value: string,
-  search: string,
-  replacement: string,
-): string | undefined {
-  if (!search) {
-    return undefined;
-  }
-  const index = value.lastIndexOf(search);
-  if (index < 0) {
-    return undefined;
-  }
-  return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`;
-}
-
-function findFirstOccurrenceBeforeFileBlocks(value: string, search: string): number {
-  if (!search) {
-    return -1;
-  }
-  const fileBlockIndex = value.search(FILE_BLOCK_RE);
-  const bodyRegion = fileBlockIndex >= 0 ? value.slice(0, fileBlockIndex) : value;
-  return bodyRegion.indexOf(search);
-}
-
 function findLastOccurrenceBeforeFileBlocks(value: string, search: string): number {
   if (!search) {
     return -1;
@@ -98,21 +75,81 @@ function replaceLastOccurrenceBeforeFileBlocks(
   return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`;
 }
 
-function replaceFirstOccurrenceBeforeFileBlocks(
+function findTrailingReplacementTargetBeforeFileBlocks(
+  value: string,
+  targets: string[],
+): { index: number; target: string } | undefined {
+  let bestMatch: { index: number; target: string } | undefined;
+  for (const target of targets) {
+    const index = findLastOccurrenceBeforeFileBlocks(value, target);
+    if (index < 0) {
+      continue;
+    }
+    if (!bestMatch || index > bestMatch.index) {
+      bestMatch = { index, target };
+    }
+  }
+  return bestMatch;
+}
+
+function replaceOccurrenceAtIndex(
   value: string,
   search: string,
   replacement: string,
-): string | undefined {
-  if (!search) {
-    return undefined;
-  }
-  const index = findFirstOccurrenceBeforeFileBlocks(value, search);
-  if (index < 0) {
-    return undefined;
-  }
+  index: number,
+): string {
   return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`;
 }
 
+function decodeXmlAttr(value: string): string {
+  return value
+    .replace(/&quot;/g, '"')
+    .replace(/&apos;/g, "'")
+    .replace(/&lt;/g, "<")
+    .replace(/&gt;/g, ">")
+    .replace(/&amp;/g, "&");
+}
+
+function extractAttachmentFileName(value?: string): string | undefined {
+  const trimmed = value?.trim();
+  if (!trimmed) {
+    return undefined;
+  }
+  if (/^[a-zA-Z][a-zA-Z\d+.-]*:/.test(trimmed)) {
+    try {
+      const pathname = new URL(trimmed).pathname;
+      const basename = path.posix.basename(pathname);
+      return basename || undefined;
+    } catch {
+      // Fall back to path-style parsing below.
+    }
+  }
+  const normalized = trimmed.replace(/\\/g, "/");
+  const basename = path.posix.basename(normalized);
+  return basename || undefined;
+}
+
+function bodyContainsMatchingFileBlock(mediaContext: FollowupMediaContext): boolean {
+  const body = mediaContext.Body?.trim();
+  if (!body || !FILE_BLOCK_RE.test(body)) {
+    return false;
+  }
+  const bodyFileNames = new Set<string>();
+  for (const match of body.matchAll(/<file\s+name="([^"]*)"[^>]*>/gi)) {
+    const fileName = match[1]?.trim();
+    if (fileName) {
+      bodyFileNames.add(decodeXmlAttr(fileName));
+    }
+  }
+  if (bodyFileNames.size === 0) {
+    return false;
+  }
+  return normalizeAttachments(mediaContext as MsgContext).some((attachment) => {
+    const fileName = extractAttachmentFileName(attachment.path ?? attachment.url);
+    return Boolean(fileName && bodyFileNames.has(fileName));
+  });
+}
+
 function stripInlineDirectives(text: string | undefined): string {
   return parseInlineDirectives(text ?? "").cleaned.trim();
 }
@@ -168,12 +205,14 @@ function rebuildQueuedPromptWithMediaUnderstanding(params: {
   // thread history above the body, and prompts whose original body no longer
   // appears all retain any legitimate <file> blocks.
   if (params.updatedBody && FILE_BLOCK_RE.test(params.updatedBody)) {
-    const bodyIdx =
-      replacementTargets
-        .map((target) => findFirstOccurrenceBeforeFileBlocks(stripped, target))
-        .find((index) => index >= 0) ?? -1;
-    if (bodyIdx >= 0) {
-      stripped = stripped.slice(0, bodyIdx) + stripExistingFileBlocks(stripped.slice(bodyIdx));
+    const trailingMatch = findTrailingReplacementTargetBeforeFileBlocks(
+      stripped,
+      replacementTargets,
+    );
+    if (trailingMatch) {
+      stripped =
+        stripped.slice(0, trailingMatch.index) +
+        stripExistingFileBlocks(stripped.slice(trailingMatch.index));
     }
   }
 
@@ -186,12 +225,15 @@ function rebuildQueuedPromptWithMediaUnderstanding(params: {
   }
 
   let rebuilt = stripped;
-  for (const target of replacementTargets) {
-    const replaced = replaceFirstOccurrenceBeforeFileBlocks(rebuilt, target, updatedBody);
-    if (replaced !== undefined) {
-      rebuilt = replaced;
-      return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim();
-    }
+  const trailingMatch = findTrailingReplacementTargetBeforeFileBlocks(rebuilt, replacementTargets);
+  if (trailingMatch) {
+    rebuilt = replaceOccurrenceAtIndex(
+      rebuilt,
+      trailingMatch.target,
+      updatedBody,
+      trailingMatch.index,
+    );
+    return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim();
   }
 
   rebuilt = [rebuilt, updatedBody].filter(Boolean).join("\n\n");
@@ -268,29 +310,29 @@ export async function applyDeferredMediaUnderstandingToQueuedRun(
   if (!mediaContext || mediaContext.DeferredMediaApplied) {
     return;
   }
-  if (mediaContext.MediaUnderstanding?.length) {
-    mediaContext.DeferredMediaApplied = true;
-    return;
-  }
   if (!hasMediaAttachments(mediaContext)) {
     mediaContext.DeferredMediaApplied = true;
     return;
   }
-
-  const resolvedOriginalBody =
-    mediaContext.CommandBody ?? mediaContext.RawBody ?? mediaContext.Body;
-  // Detect file extraction from the primary path via body mutation instead of
-  // scanning for literal '<file name=' patterns (which false-positives on user
-  // text).  Compare Body against RawBody (never mutated by the primary path's
-  // media/file processing) rather than CommandBody (which differs from Body
-  // when inline directives like /think were stripped).
   const referenceBody = mediaContext.RawBody ?? mediaContext.Body;
-  if (
-    !mediaContext.DeferredFileBlocksExtracted &&
-    mediaContext.Body !== referenceBody &&
-    hasAnyFileAttachments(mediaContext)
-  ) {
-    mediaContext.DeferredFileBlocksExtracted = true;
+  // Prefer RawBody-vs-Body comparison when RawBody exists. If RawBody is
+  // missing, fall back to explicit file-extraction signals instead of
+  // re-running extraction just because the clean pre-extraction body is gone.
+  if (!mediaContext.DeferredFileBlocksExtracted && hasAnyFileAttachments(mediaContext)) {
+    const rawBodyMissing = typeof mediaContext.RawBody !== "string";
+    if (mediaContext.Body !== referenceBody) {
+      mediaContext.DeferredFileBlocksExtracted = true;
+    } else if (
+      rawBodyMissing &&
+      (Boolean(mediaContext.MediaUnderstanding?.length) ||
+        bodyContainsMatchingFileBlock(mediaContext))
+    ) {
+      mediaContext.DeferredFileBlocksExtracted = true;
+    }
+  }
+  if (mediaContext.MediaUnderstanding?.length) {
+    mediaContext.DeferredMediaApplied = true;
+    return;
   }
 
   if (mediaContext.DeferredFileBlocksExtracted && hasOnlyFileLikeAttachments(mediaContext)) {
@@ -298,6 +340,9 @@ export async function applyDeferredMediaUnderstandingToQueuedRun(
     return;
   }
 
+  const resolvedOriginalBody =
+    mediaContext.CommandBody ?? mediaContext.RawBody ?? mediaContext.Body;
+
   try {
     const mediaCtx = {
       ...mediaContext,
diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts
index d85223b1afe..0db30881b7e 100644
--- a/src/auto-reply/reply/followup-runner.test.ts
+++ b/src/auto-reply/reply/followup-runner.test.ts
@@ -2095,6 +2095,114 @@ describe("createFollowupRunner media understanding", () => {
     };
     expect(agentCall?.prompt?.match(/<file\s+name="report\.pdf"/g)).toHaveLength(1);
   });
+
+  it("does not re-apply file extraction when RawBody is missing but Body already has a matching file block", async () => {
+    const fileBlock = '<file name="report.pdf" mime="application/pdf">\nreport content\n</file>';
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nsummarize this\n\n${fileBlock}`,
+        mediaContext: {
+          Body: `summarize this\n\n${fileBlock}`,
+          CommandBody: "summarize this",
+          MediaPaths: ["/tmp/report.pdf"],
+          MediaTypes: ["application/pdf"],
+        },
+      }),
+    );
+
+    expect(applyMediaUnderstandingMock).not.toHaveBeenCalled();
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain(fileBlock);
+  });
+
+  it("replaces the trailing repeated body segment instead of the first matching thread text", async () => {
+    const existingFileBlock =
+      '<file name="report.pdf" mime="application/pdf">\nold extracted content\n</file>';
+    const newFileBlock =
+      '<file name="report.pdf" mime="application/pdf">\nnew extracted content\n</file>';
+    const transcriptText = "Deferred transcript";
+
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}\n\nsummarize this\n\n${newFileBlock}`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt:
+          `[media attached 1/2: /tmp/voice.ogg]\n[media attached 2/2: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nThread history: summarize this\n\n` +
+          `summarize this\n\n${existingFileBlock}`,
+        mediaContext: {
+          Body: `summarize this\n\n${existingFileBlock}`,
+          CommandBody: "summarize this",
+          RawBody: "summarize this",
+          MediaPaths: ["/tmp/voice.ogg", "/tmp/report.pdf"],
+          MediaTypes: ["audio/ogg", "application/pdf"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("Thread history: summarize this");
+    expect(agentCall?.prompt).toContain(transcriptText);
+    expect(agentCall?.prompt).toContain(newFileBlock);
+    expect(agentCall?.prompt).not.toContain("old extracted content");
+    expect(agentCall?.prompt?.indexOf(newFileBlock)).toBeGreaterThan(
+      agentCall?.prompt?.lastIndexOf("summarize this") ?? -1,
+    );
+  });
 });
 
 describe("followup queue drain deferred media understanding", () => {
@@ -2163,6 +2271,56 @@ describe("followup queue drain deferred media understanding", () => {
     expect(prompt).not.toContain("[media attached: /tmp/voice.ogg");
   });
 
+  it("preprocesses queued runs in parallel", async () => {
+    const resolvers: Array<() => void> = [];
+    const done = () => ({
+      outputs: [],
+      decisions: [],
+      appliedImage: false,
+      appliedAudio: false,
+      appliedVideo: false,
+      appliedFile: false,
+    });
+    applyMediaUnderstandingMock.mockImplementation(
+      async () =>
+        await new Promise((resolve) => {
+          resolvers.push(() => resolve(done()));
+        }),
+    );
+
+    const items: FollowupRun[] = [
+      createQueuedRun({
+        prompt: "[media attached: /tmp/voice-1.ogg (audio/ogg)]\nfirst text",
+        summaryLine: "first text",
+        run: { messageProvider: "telegram" },
+        mediaContext: {
+          Body: "first text",
+          MediaPaths: ["/tmp/voice-1.ogg"],
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+      createQueuedRun({
+        prompt: "[media attached: /tmp/voice-2.ogg (audio/ogg)]\nsecond text",
+        summaryLine: "second text",
+        run: { messageProvider: "telegram" },
+        mediaContext: {
+          Body: "second text",
+          MediaPaths: ["/tmp/voice-2.ogg"],
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+    ];
+
+    const pending = applyDeferredMediaToQueuedRuns(items);
+
+    expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(2);
+
+    for (const resolve of resolvers) {
+      resolve();
+    }
+    await pending;
+  });
+
   it("preprocesses dropped media items before building overflow summaries", async () => {
     applyMediaUnderstandingMock.mockImplementationOnce(
       async (params: { ctx: Record<string, unknown> }) => {
diff --git a/src/auto-reply/reply/queue/drain.ts b/src/auto-reply/reply/queue/drain.ts
index 471c94200ba..5d53df34189 100644
--- a/src/auto-reply/reply/queue/drain.ts
+++ b/src/auto-reply/reply/queue/drain.ts
@@ -76,9 +76,12 @@ function clearFollowupQueueSummaryState(queue: FollowupQueueState): void {
 }
 
 export async function applyDeferredMediaToQueuedRuns(items: FollowupRun[]): Promise<void> {
-  for (const item of items) {
-    await applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" });
-  }
+  await Promise.allSettled(
+    items.map(
+      async (item) =>
+        await applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" }),
+    ),
+  );
 }
 
 async function resolveSummaryLines(items: FollowupRun[]): Promise<string[]> {

From 5ec82dd0e988227f77e79a53bd279255491e5d16 Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sun, 15 Mar 2026 18:00:45 -0400
Subject: [PATCH 14/17] fix: parallelize deferred media calls and search full
 prompt outside file blocks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Parallelize deferred media understanding calls in resolveSummaryLines and
applyDeferredMediaToQueuedRuns using Promise.allSettled so media API calls
run concurrently while summary line order stays sequential.

Replace findFirstOccurrenceBeforeFileBlocks (which truncated at the first
<file> tag) with findLastOccurrenceOutsideFileBlocks that searches the full
prompt via lastIndexOf and skips matches inside <file>…</file> blocks. This
fixes body replacement when thread/history context has extracted file blocks
before the current queued message body.

Add regression test for body appearing after thread-history file blocks.
---
 src/auto-reply/reply/followup-media.test.ts  | 40 +++++++----
 src/auto-reply/reply/followup-media.ts       | 58 +++++++++++-----
 src/auto-reply/reply/followup-runner.test.ts | 72 ++++++++++++++++++++
 src/auto-reply/reply/queue/drain.ts          | 21 +++---
 4 files changed, 152 insertions(+), 39 deletions(-)

diff --git a/src/auto-reply/reply/followup-media.test.ts b/src/auto-reply/reply/followup-media.test.ts
index d8ba453f801..77996f85606 100644
--- a/src/auto-reply/reply/followup-media.test.ts
+++ b/src/auto-reply/reply/followup-media.test.ts
@@ -1,43 +1,57 @@
 import { describe, expect, it } from "vitest";
 import {
-  _findLastOccurrenceBeforeFileBlocks as findLastOccurrenceBeforeFileBlocks,
+  _findLastOccurrenceOutsideFileBlocks as findLastOccurrenceOutsideFileBlocks,
   _normalizeUpdatedBody as normalizeUpdatedBody,
   _rebuildQueuedPromptWithMediaUnderstanding as rebuildQueuedPromptWithMediaUnderstanding,
 } from "./followup-media.js";
 
 const FILE_BLOCK = '<file name="doc.pdf" type="application/pdf">\nPDF content\n</file>';
 
-describe("findLastOccurrenceBeforeFileBlocks", () => {
+describe("findLastOccurrenceOutsideFileBlocks", () => {
   it("returns -1 for empty search", () => {
-    expect(findLastOccurrenceBeforeFileBlocks("hello", "")).toBe(-1);
+    expect(findLastOccurrenceOutsideFileBlocks("hello", "")).toBe(-1);
   });
 
   it("finds last occurrence in body region before file blocks", () => {
     const value = `hello world hello\n${FILE_BLOCK}`;
     // "hello" appears at 0 and 12 — both before the file block
-    expect(findLastOccurrenceBeforeFileBlocks(value, "hello")).toBe(12);
+    expect(findLastOccurrenceOutsideFileBlocks(value, "hello")).toBe(12);
   });
 
-  it("does not match inside file block content", () => {
+  it("skips matches inside file block content", () => {
+    // "PDF content" appears only inside the file block — no valid match outside.
+    const value = `some text\n${FILE_BLOCK}`;
+    expect(findLastOccurrenceOutsideFileBlocks(value, "PDF content")).toBe(-1);
+  });
+
+  it("finds trailing occurrence outside file block even when also inside one", () => {
     const value = `some text\n${FILE_BLOCK}\nPDF content`;
-    // "PDF content" appears in the file block and after it, but the body region
-    // (before <file) is just "some text\n" — no match there.
-    expect(findLastOccurrenceBeforeFileBlocks(value, "PDF content")).toBe(-1);
+    // "PDF content" appears inside the file block AND after it — the function
+    // should return the trailing occurrence that is outside the block.
+    const expected = value.lastIndexOf("PDF content");
+    expect(findLastOccurrenceOutsideFileBlocks(value, "PDF content")).toBe(expected);
   });
 
-  it("uses lastIndexOf in fallback when search itself contains file blocks", () => {
-    // When the search string contains a <file> block, it can't appear in the
-    // body-only region, so the fallback searches the full value.
+  it("finds occurrence when search itself contains file blocks", () => {
     const bodyWithFile = `caption\n${FILE_BLOCK}`;
     const value = `previous\n${bodyWithFile}\nlater\n${bodyWithFile}`;
     // Should find the *last* (trailing) occurrence
     const expected = value.lastIndexOf(bodyWithFile);
-    expect(findLastOccurrenceBeforeFileBlocks(value, bodyWithFile)).toBe(expected);
+    expect(findLastOccurrenceOutsideFileBlocks(value, bodyWithFile)).toBe(expected);
     expect(expected).toBeGreaterThan(value.indexOf(bodyWithFile));
   });
 
   it("returns index when no file blocks exist in value", () => {
-    expect(findLastOccurrenceBeforeFileBlocks("abc abc", "abc")).toBe(4);
+    expect(findLastOccurrenceOutsideFileBlocks("abc abc", "abc")).toBe(4);
+  });
+
+  it("finds body text after thread-history file blocks", () => {
+    const value = `Thread history\n${FILE_BLOCK}\n\ncheck this out`;
+    // The body "check this out" appears after a file block from thread history.
+    // The old truncation approach would miss this; the new approach finds it.
+    expect(findLastOccurrenceOutsideFileBlocks(value, "check this out")).toBe(
+      value.lastIndexOf("check this out"),
+    );
   });
 });
 
diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts
index 425bdd601ea..f0d5d951683 100644
--- a/src/auto-reply/reply/followup-media.ts
+++ b/src/auto-reply/reply/followup-media.ts
@@ -41,26 +41,48 @@ function stripLeadingMediaReplyHint(prompt: string): string {
   return prompt.trim();
 }
 
-function findLastOccurrenceBeforeFileBlocks(value: string, search: string): number {
+/** Collect the [start, end) ranges of every `<file …>…</file>` block in `value`. */
+function collectFileBlockRanges(value: string): Array<[number, number]> {
+  const ranges: Array<[number, number]> = [];
+  const re = new RegExp(FILE_BLOCK_FULL_RE.source, FILE_BLOCK_FULL_RE.flags);
+  let m: RegExpExecArray | null;
+  while ((m = re.exec(value)) !== null) {
+    ranges.push([m.index, m.index + m[0].length]);
+  }
+  return ranges;
+}
+
+function isInsideFileBlock(
+  position: number,
+  length: number,
+  ranges: Array<[number, number]>,
+): boolean {
+  for (const [start, end] of ranges) {
+    if (position >= start && position + length <= end) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/**
+ * Find the last occurrence of `search` in `value` that is NOT inside a
+ * `<file …>…</file>` block.  Searches the full string with lastIndexOf,
+ * then walks backward past any matches that fall inside file blocks.
+ */
+function findLastOccurrenceOutsideFileBlocks(value: string, search: string): number {
   if (!search) {
     return -1;
   }
-  const fileBlockIndex = value.search(FILE_BLOCK_RE);
-  const bodyRegion = fileBlockIndex >= 0 ? value.slice(0, fileBlockIndex) : value;
-  const index = bodyRegion.lastIndexOf(search);
-  if (index >= 0) {
-    return index;
+  const ranges = collectFileBlockRanges(value);
+  let pos = value.lastIndexOf(search);
+  while (pos >= 0 && isInsideFileBlock(pos, search.length, ranges)) {
+    pos = value.lastIndexOf(search, pos - 1);
   }
-  // Fallback: search string itself contains file blocks — it can't appear in the
-  // body-only region.  Search the full value with lastIndexOf to pick the trailing
-  // (most recent) occurrence when thread/history has identical <file> bodies.
-  if (FILE_BLOCK_RE.test(search)) {
-    return value.lastIndexOf(search);
-  }
-  return -1;
+  return pos;
 }
 
-function replaceLastOccurrenceBeforeFileBlocks(
+function replaceLastOccurrenceOutsideFileBlocks(
   value: string,
   search: string,
   replacement: string,
@@ -68,7 +90,7 @@ function replaceLastOccurrenceBeforeFileBlocks(
   if (!search) {
     return undefined;
   }
-  const index = findLastOccurrenceBeforeFileBlocks(value, search);
+  const index = findLastOccurrenceOutsideFileBlocks(value, search);
   if (index < 0) {
     return undefined;
   }
@@ -81,7 +103,7 @@ function findTrailingReplacementTargetBeforeFileBlocks(
 ): { index: number; target: string } | undefined {
   let bestMatch: { index: number; target: string } | undefined;
   for (const target of targets) {
-    const index = findLastOccurrenceBeforeFileBlocks(value, target);
+    const index = findLastOccurrenceOutsideFileBlocks(value, target);
     if (index < 0) {
       continue;
     }
@@ -172,7 +194,7 @@ function normalizeUpdatedBody(params: { originalBody?: string; updatedBody?: str
     return cleanedOriginalBody;
   }
   return (
-    replaceLastOccurrenceBeforeFileBlocks(updatedBody, originalBody, cleanedOriginalBody) ??
+    replaceLastOccurrenceOutsideFileBlocks(updatedBody, originalBody, cleanedOriginalBody) ??
     updatedBody
   ).trim();
 }
@@ -294,7 +316,7 @@ function snapshotUpdatedMediaContext(params: {
 
 // Exported for unit testing — these are pure string helpers with no side effects.
 export {
-  findLastOccurrenceBeforeFileBlocks as _findLastOccurrenceBeforeFileBlocks,
+  findLastOccurrenceOutsideFileBlocks as _findLastOccurrenceOutsideFileBlocks,
   normalizeUpdatedBody as _normalizeUpdatedBody,
   rebuildQueuedPromptWithMediaUnderstanding as _rebuildQueuedPromptWithMediaUnderstanding,
 };
diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts
index 0db30881b7e..2c94f03fb8d 100644
--- a/src/auto-reply/reply/followup-runner.test.ts
+++ b/src/auto-reply/reply/followup-runner.test.ts
@@ -2025,6 +2025,78 @@ describe("createFollowupRunner media understanding", () => {
     );
   });
 
+  it("finds the body after thread-history file blocks when body appears after the first <file> tag", async () => {
+    const threadFileBlock =
+      '<file name="thread.pdf" mime="application/pdf">\nolder thread attachment\n</file>';
+    const transcriptText = "Transcript from deferred voice note";
+
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}\n\ncheck this out`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    // The prompt has thread history with a file block BEFORE the current
+    // queued body text.  The old truncation approach would miss the body
+    // because it only searched before the first <file> tag.
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached: /tmp/voice.ogg]\n${MEDIA_REPLY_HINT}\nThread history\n\n${threadFileBlock}\n\ncheck this out`,
+        mediaContext: {
+          Body: "check this out",
+          RawBody: "check this out",
+          MediaPaths: ["/tmp/voice.ogg"],
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    const transcriptBlock = `[Audio]\nTranscript:\n${transcriptText}\n\ncheck this out`;
+    // The body should be replaced with the transcript block
+    expect(agentCall?.prompt).toContain(transcriptBlock);
+    // Thread history and its file block should be preserved
+    expect(agentCall?.prompt).toContain("Thread history");
+    expect(agentCall?.prompt).toContain(threadFileBlock);
+  });
+
   it("sets DeferredMediaApplied when media understanding throws", async () => {
     applyMediaUnderstandingMock.mockRejectedValueOnce(
       new Error("transcription service unavailable"),
diff --git a/src/auto-reply/reply/queue/drain.ts b/src/auto-reply/reply/queue/drain.ts
index 5d53df34189..41c859d2ec3 100644
--- a/src/auto-reply/reply/queue/drain.ts
+++ b/src/auto-reply/reply/queue/drain.ts
@@ -85,14 +85,19 @@ export async function applyDeferredMediaToQueuedRuns(items: FollowupRun[]): Prom
 }
 
 async function resolveSummaryLines(items: FollowupRun[]): Promise<string[]> {
-  const summaryLines: string[] = [];
-  for (const item of items) {
-    await applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" });
-    // After deferred media, prefer the updated prompt (which includes transcripts)
-    // over the original summaryLine (which may just be the caption text).
-    summaryLines.push(buildQueueSummaryLine(item.prompt.trim() || item.summaryLine?.trim() || ""));
-  }
-  return summaryLines;
+  // Parallelize the media understanding API calls upfront (same pattern as
+  // applyDeferredMediaToQueuedRuns), then build summary lines sequentially
+  // so line order matches the original item order.
+  await Promise.allSettled(
+    items.map((item) =>
+      applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" }),
+    ),
+  );
+  // After deferred media, prefer the updated prompt (which includes transcripts)
+  // over the original summaryLine (which may just be the caption text).
+  return items.map((item) =>
+    buildQueueSummaryLine(item.prompt.trim() || item.summaryLine?.trim() || ""),
+  );
 }
 
 export async function buildMediaAwareQueueSummaryPrompt(params: {

From e8fd176cf491f2e03c42bd02a190c5910f9db637 Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sun, 15 Mar 2026 20:36:20 -0400
Subject: [PATCH 15/17] fix: match file blocks by presence not basename
 (#46454)

---
 src/auto-reply/reply/followup-media.ts       | 63 ++++----------------
 src/auto-reply/reply/followup-runner.test.ts | 35 +++++++++++
 2 files changed, 45 insertions(+), 53 deletions(-)

diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts
index f0d5d951683..dca7b4b414b 100644
--- a/src/auto-reply/reply/followup-media.ts
+++ b/src/auto-reply/reply/followup-media.ts
@@ -1,4 +1,3 @@
-import path from "node:path";
 import { logVerbose } from "../../globals.js";
 import { applyMediaUnderstanding } from "../../media-understanding/apply.js";
 import {
@@ -14,6 +13,7 @@ const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]";
 const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool";
 const LEADING_MEDIA_ATTACHED_LINE_RE = /^\[media attached(?: \d+\/\d+)?: [^\r\n]*\]$/;
 const FILE_BLOCK_RE = /<file\s+name="/i;
+const FILE_BLOCK_BODY_RE = /<file\s+name="[^"]*"[^>]*>[\s\S]*?<\/file>/i;
 const FILE_BLOCK_FULL_RE = /<file\s+name="[^"]*"[^>]*>[\s\S]*?<\/file>\n?/gi;
 
 function stripExistingFileBlocks(text: string): string {
@@ -123,59 +123,14 @@ function replaceOccurrenceAtIndex(
   return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`;
 }
 
-function decodeXmlAttr(value: string): string {
-  return value
-    .replace(/&quot;/g, '"')
-    .replace(/&apos;/g, "'")
-    .replace(/&lt;/g, "<")
-    .replace(/&gt;/g, ">")
-    .replace(/&amp;/g, "&");
-}
-
-function extractAttachmentFileName(value?: string): string | undefined {
-  const trimmed = value?.trim();
-  if (!trimmed) {
-    return undefined;
-  }
-  if (/^[a-zA-Z][a-zA-Z\d+.-]*:/.test(trimmed)) {
-    try {
-      const pathname = new URL(trimmed).pathname;
-      const basename = path.posix.basename(pathname);
-      return basename || undefined;
-    } catch {
-      // Fall back to path-style parsing below.
-    }
-  }
-  const normalized = trimmed.replace(/\\/g, "/");
-  const basename = path.posix.basename(normalized);
-  return basename || undefined;
-}
-
-function bodyContainsMatchingFileBlock(mediaContext: FollowupMediaContext): boolean {
-  const body = mediaContext.Body?.trim();
-  if (!body || !FILE_BLOCK_RE.test(body)) {
-    return false;
-  }
-  const bodyFileNames = new Set<string>();
-  for (const match of body.matchAll(/<file\s+name="([^"]*)"[^>]*>/gi)) {
-    const fileName = match[1]?.trim();
-    if (fileName) {
-      bodyFileNames.add(decodeXmlAttr(fileName));
-    }
-  }
-  if (bodyFileNames.size === 0) {
-    return false;
-  }
-  return normalizeAttachments(mediaContext as MsgContext).some((attachment) => {
-    const fileName = extractAttachmentFileName(attachment.path ?? attachment.url);
-    return Boolean(fileName && bodyFileNames.has(fileName));
-  });
-}
-
 function stripInlineDirectives(text: string | undefined): string {
   return parseInlineDirectives(text ?? "").cleaned.trim();
 }
 
+function bodyContainsExtractedFileBlock(text: string | undefined): boolean {
+  return FILE_BLOCK_BODY_RE.test(text ?? "");
+}
+
 function normalizeUpdatedBody(params: { originalBody?: string; updatedBody?: string }): string {
   const updatedBody = params.updatedBody?.trim();
   if (!updatedBody) {
@@ -336,10 +291,12 @@ export async function applyDeferredMediaUnderstandingToQueuedRun(
     mediaContext.DeferredMediaApplied = true;
     return;
   }
+
   const referenceBody = mediaContext.RawBody ?? mediaContext.Body;
   // Prefer RawBody-vs-Body comparison when RawBody exists. If RawBody is
-  // missing, fall back to explicit file-extraction signals instead of
-  // re-running extraction just because the clean pre-extraction body is gone.
+  // missing, any real <file>...</file> block plus file-like attachments means
+  // extraction already ran, even if the stored name came from Content-Disposition
+  // instead of the attachment path/url basename.
   if (!mediaContext.DeferredFileBlocksExtracted && hasAnyFileAttachments(mediaContext)) {
     const rawBodyMissing = typeof mediaContext.RawBody !== "string";
     if (mediaContext.Body !== referenceBody) {
@@ -347,7 +304,7 @@ export async function applyDeferredMediaUnderstandingToQueuedRun(
     } else if (
       rawBodyMissing &&
       (Boolean(mediaContext.MediaUnderstanding?.length) ||
-        bodyContainsMatchingFileBlock(mediaContext))
+        bodyContainsExtractedFileBlock(mediaContext.Body))
     ) {
       mediaContext.DeferredFileBlocksExtracted = true;
     }
diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts
index 2c94f03fb8d..0c8ac63d4d7 100644
--- a/src/auto-reply/reply/followup-runner.test.ts
+++ b/src/auto-reply/reply/followup-runner.test.ts
@@ -2201,6 +2201,41 @@ describe("createFollowupRunner media understanding", () => {
     expect(agentCall?.prompt).toContain(fileBlock);
   });
 
+  it("treats any stored file block as already extracted even when the filename differs from the attachment basename", async () => {
+    const fileBlock =
+      '<file name="statement-from-mail.pdf" mime="application/pdf">\nreport content\n</file>';
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    const queued = createQueuedRun({
+      prompt: `[media attached: /tmp/upload-8472.bin]\n${MEDIA_REPLY_HINT}\nsummarize this\n\n${fileBlock}`,
+      mediaContext: {
+        Body: `summarize this\n\n${fileBlock}`,
+        CommandBody: "summarize this",
+        MediaPaths: ["/tmp/upload-8472.bin"],
+        MediaTypes: ["application/pdf"],
+      },
+    });
+
+    await runner(queued);
+
+    expect(applyMediaUnderstandingMock).not.toHaveBeenCalled();
+    expect(queued.mediaContext?.DeferredFileBlocksExtracted).toBe(true);
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt?.match(/<file\s+name="statement-from-mail\.pdf"/g)).toHaveLength(1);
+  });
+
   it("replaces the trailing repeated body segment instead of the first matching thread text", async () => {
     const existingFileBlock =
       '<file name="report.pdf" mime="application/pdf">\nold extracted content\n</file>';

From 1b6c43baa47f9e09a670bc055b73ee32b65bc693 Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sun, 15 Mar 2026 21:54:48 -0400
Subject: [PATCH 16/17] fix: stop inferring file extraction from Body/RawBody
 mismatch (#46454)

---
 src/auto-reply/reply/followup-media.ts       | 28 +++++-------
 src/auto-reply/reply/followup-runner.test.ts | 48 ++++++++++++++++++++
 src/auto-reply/reply/queue/types.ts          |  6 +--
 3 files changed, 62 insertions(+), 20 deletions(-)

diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts
index dca7b4b414b..19b2c061c8c 100644
--- a/src/auto-reply/reply/followup-media.ts
+++ b/src/auto-reply/reply/followup-media.ts
@@ -292,27 +292,21 @@ export async function applyDeferredMediaUnderstandingToQueuedRun(
     return;
   }
 
-  const referenceBody = mediaContext.RawBody ?? mediaContext.Body;
-  // Prefer RawBody-vs-Body comparison when RawBody exists. If RawBody is
-  // missing, any real <file>...</file> block plus file-like attachments means
-  // extraction already ran, even if the stored name came from Content-Disposition
-  // instead of the attachment path/url basename.
-  if (!mediaContext.DeferredFileBlocksExtracted && hasAnyFileAttachments(mediaContext)) {
-    const rawBodyMissing = typeof mediaContext.RawBody !== "string";
-    if (mediaContext.Body !== referenceBody) {
-      mediaContext.DeferredFileBlocksExtracted = true;
-    } else if (
-      rawBodyMissing &&
-      (Boolean(mediaContext.MediaUnderstanding?.length) ||
-        bodyContainsExtractedFileBlock(mediaContext.Body))
-    ) {
-      mediaContext.DeferredFileBlocksExtracted = true;
-    }
-  }
   if (mediaContext.MediaUnderstanding?.length) {
     mediaContext.DeferredMediaApplied = true;
     return;
   }
+  // Treat followup file extraction as already applied only when we have explicit
+  // evidence: the queue snapshot already flagged it or Body already contains a
+  // real extracted <file>...</file> block. Body/RawBody mismatches are not
+  // reliable because some channels wrap Body with envelope metadata.
+  if (
+    !mediaContext.DeferredFileBlocksExtracted &&
+    hasAnyFileAttachments(mediaContext) &&
+    bodyContainsExtractedFileBlock(mediaContext.Body)
+  ) {
+    mediaContext.DeferredFileBlocksExtracted = true;
+  }
 
   if (mediaContext.DeferredFileBlocksExtracted && hasOnlyFileLikeAttachments(mediaContext)) {
     mediaContext.DeferredMediaApplied = true;
diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts
index 0c8ac63d4d7..f766aa57321 100644
--- a/src/auto-reply/reply/followup-runner.test.ts
+++ b/src/auto-reply/reply/followup-runner.test.ts
@@ -1231,6 +1231,54 @@ describe("createFollowupRunner media understanding", () => {
     expect(agentCall?.prompt?.match(/<file\b/g)).toHaveLength(1);
   });
 
+  it("does not infer file extraction from wrapped Body/RawBody mismatches", async () => {
+    const fileBlock = '<file name="report.pdf" mime="application/pdf">\nreport content\n</file>';
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body = `summarize this\n\n${fileBlock}`;
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "file processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: "[media attached: /tmp/report.pdf]\nLine: Alice\nsummarize this",
+        mediaContext: {
+          Body: "Line: Alice\nsummarize this",
+          RawBody: "summarize this",
+          MediaPaths: ["/tmp/report.pdf"],
+          MediaTypes: ["application/pdf"],
+        },
+      }),
+    );
+
+    expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1);
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("Line: Alice");
+    expect(agentCall?.prompt).toContain(fileBlock);
+    expect(agentCall?.prompt?.match(/<file\s+name="report\.pdf"/g)).toHaveLength(1);
+  });
+
   it("preserves non-audio media lines when only audio is transcribed", async () => {
     applyMediaUnderstandingMock.mockImplementationOnce(
       async (params: { ctx: Record<string, unknown> }) => {
diff --git a/src/auto-reply/reply/queue/types.ts b/src/auto-reply/reply/queue/types.ts
index e1e9e20e5c8..1c2f5e0551a 100644
--- a/src/auto-reply/reply/queue/types.ts
+++ b/src/auto-reply/reply/queue/types.ts
@@ -52,9 +52,9 @@ export type FollowupMediaContext = {
   DeferredMediaApplied?: boolean;
   /**
    * Set when file extraction has already been applied to Body (either in the
-   * primary path or by a previous deferred-media run).  Checked instead of
-   * scanning body text for `<file` patterns to avoid false-positives on user
-   * messages that contain literal XML-like text.
+   * primary path or by a previous deferred-media run). This avoids re-running
+   * file extraction when Body already contains real extracted `<file>...</file>`
+   * blocks.
    */
   DeferredFileBlocksExtracted?: boolean;
 };

From fddce3db72f5e22b8d3a1d2f7a12daf15b3e045c Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sat, 21 Mar 2026 00:28:15 -0400
Subject: [PATCH 17/17] fix: remove redundant async wrapper in drain.ts

---
 src/auto-reply/reply/queue/drain.ts | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/auto-reply/reply/queue/drain.ts b/src/auto-reply/reply/queue/drain.ts
index 41c859d2ec3..27491713848 100644
--- a/src/auto-reply/reply/queue/drain.ts
+++ b/src/auto-reply/reply/queue/drain.ts
@@ -77,9 +77,8 @@ function clearFollowupQueueSummaryState(queue: FollowupQueueState): void {
 
 export async function applyDeferredMediaToQueuedRuns(items: FollowupRun[]): Promise<void> {
   await Promise.allSettled(
-    items.map(
-      async (item) =>
-        await applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" }),
+    items.map((item) =>
+      applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" }),
     ),
   );
 }