diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts
index a0c5306380d..d2812ff61b6 100644
--- a/src/auto-reply/reply/followup-runner.test.ts
+++ b/src/auto-reply/reply/followup-runner.test.ts
@@ -818,6 +818,69 @@ describe("createFollowupRunner media understanding", () => {
     expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "Got it!" }));
   });
 
+  it("applies media understanding for URL-only attachments", async () => { // queued media has URLs but no local paths
+    const transcriptText = "URL-only transcript";
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => { // stub: writes transcription results onto ctx
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nUser text:\nsome text\nTranscript:\n${transcriptText}`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "Got it!" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) }, // no-op reply sink
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: "[media attached: https://cdn.example.com/voice.ogg (audio/ogg)]\nsome text",
+        mediaContext: { // only MediaUrl/MediaUrls are set; MediaPath/MediaPaths are absent
+          Body: "some text",
+          MediaUrl: "https://cdn.example.com/voice.ogg",
+          MediaUrls: ["https://cdn.example.com/voice.ogg"],
+          MediaType: "audio/ogg",
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+    );
+
+    expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1); // URL-only media must still reach the stub
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain(transcriptText); // transcript must end up in the agent prompt
+  });
+
   it("strips the full media line when attachment paths or URLs contain brackets", async () => {
     const transcriptText = "Bracket-safe transcript";
     applyMediaUnderstandingMock.mockImplementationOnce(
@@ -1329,6 +1392,98 @@ describe("createFollowupRunner media understanding", () => {
     expect(agentCall?.prompt).not.toContain("/think high summarize this");
   });
 
+  it("preserves directive-like tokens inside extracted media content", async () => {
+    const fileBlock = // extracted file text whose lines merely look like slash directives
+      '<file name="notes.txt" mime="text/plain">\n/model claude-opus should stay\n/queue followup should stay\n</file>';
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body = `/think high summarize this\n\n${fileBlock}`; // stub appends the file block after the original body
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: true, // no outputs are returned; only this flag reports the extraction
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) }, // no-op reply sink
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached: /tmp/notes.txt]\n${MEDIA_REPLY_HINT}\nsummarize this`,
+        mediaContext: {
+          Body: "/think high summarize this", // original body still carries the "/think high" prefix
+          MediaPaths: ["/tmp/notes.txt"],
+          MediaTypes: ["text/plain"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("summarize this");
+    expect(agentCall?.prompt).not.toContain("/think high summarize this"); // prefix on the user text is removed
+    expect(agentCall?.prompt).toContain("/model claude-opus should stay"); // file content is kept verbatim
+    expect(agentCall?.prompt).toContain("/queue followup should stay");
+  });
+
+  it("rebuilds the prompt when image understanding mutates the body without outputs", async () => {
+    const description = "[Image]\nDescription:\na mountain at sunset"; // body text the stub installs
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body = description; // stub replaces the whole body with the description
+        return {
+          outputs: [], // nothing reported here, so the rebuild cannot key off outputs
+          decisions: [],
+          appliedImage: true,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) }, // no-op reply sink
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: "[media attached: /tmp/photo.jpg (image/jpeg)]\nsome text",
+        mediaContext: {
+          Body: "some text",
+          MediaPaths: ["/tmp/photo.jpg"],
+          MediaTypes: ["image/jpeg"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("a mountain at sunset"); // mutated body must reach the agent prompt
+  });
+
   it("does not false-positive on user text containing literal '<file' when extracting files", async () => {
     const fileBlock = '<file name="data.csv" mime="text/csv">\ncol1,col2\n1,2\n</file>';
     applyMediaUnderstandingMock.mockImplementationOnce(
@@ -1360,8 +1515,7 @@ describe("createFollowupRunner media understanding", () => {
     // file extraction results from being embedded in the prompt.
     await runner(
       createQueuedRun({
-        prompt:
-          "[media attached: /tmp/data.csv (text/csv)]\ncheck my <file upload please",
+        prompt: "[media attached: /tmp/data.csv (text/csv)]\ncheck my <file upload please",
         mediaContext: {
           Body: "check my <file upload please",
           CommandBody: "check my <file upload please",
@@ -1380,6 +1534,154 @@ describe("createFollowupRunner media understanding", () => {
     expect(agentCall?.prompt).toContain("check my <file upload please");
   });
 
+  it("preserves directive-like text that appears inside extracted file content", async () => {
+    const fileBlock = // file text that mentions "/think high" as literal content
+      '<file name="notes.txt" mime="text/plain">\nRun `/think high` literally in the shell example.\n</file>';
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body = `summarize this\n\n${fileBlock}`; // stub body: user text without the prefix, then the file block
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: true, // no outputs are returned; only this flag reports the extraction
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) }, // no-op reply sink
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached: /tmp/notes.txt]\n${MEDIA_REPLY_HINT}\nsummarize this`,
+        mediaContext: {
+          Body: "/think high summarize this", // Body and RawBody keep the "/think high" prefix
+          CommandBody: "summarize this", // CommandBody is supplied without it
+          RawBody: "/think high summarize this",
+          MediaPaths: ["/tmp/notes.txt"],
+          MediaTypes: ["text/plain"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("summarize this");
+    expect(agentCall?.prompt).toContain("Run `/think high` literally in the shell example."); // file content survives verbatim
+  });
+
+  it("rebuilds the prompt when an image summary is appended without outputs", async () => {
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body = "some text\n\n[Image summary]\nA whiteboard with action items."; // stub keeps the user text and appends a summary
+        return {
+          outputs: [], // nothing reported here, so the rebuild cannot key off outputs
+          decisions: [],
+          appliedImage: true,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) }, // no-op reply sink
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: "[media attached: /tmp/board.jpg (image/jpeg)]\nsome text",
+        mediaContext: {
+          Body: "some text", // Body, CommandBody and RawBody all hold the same user text
+          CommandBody: "some text",
+          RawBody: "some text",
+          MediaPaths: ["/tmp/board.jpg"],
+          MediaTypes: ["image/jpeg"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("[Image summary]"); // appended summary must reach the agent prompt
+    expect(agentCall?.prompt).toContain("A whiteboard with action items.");
+  });
+
+  it("applies media understanding for URL-only deferred attachments", async () => { // captionless, URL-only variant
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => { // stub: writes Body and Transcript onto ctx
+        params.ctx.Body = "[Audio]\nTranscript:\nremote transcript";
+        params.ctx.Transcript = "remote transcript";
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: "remote transcript",
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) }, // no-op reply sink
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: "[User sent media without caption]", // queued prompt is just the media-only placeholder text
+        mediaContext: { // only MediaUrl/MediaUrls are set; MediaPath/MediaPaths are absent
+          Body: "",
+          MediaUrl: "https://cdn.example.com/audio.ogg",
+          MediaUrls: ["https://cdn.example.com/audio.ogg"],
+          MediaType: "audio/ogg",
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+    );
+
+    expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1); // URL-only media must still reach the stub
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("remote transcript"); // transcript must end up in the agent prompt
+  });
+
   it("uses resolved body (CommandBody) as originalBody for accurate prompt replacement", async () => {
     const fileBlock = '<file name="report.pdf" mime="application/pdf">\nreport content\n</file>';
     applyMediaUnderstandingMock.mockImplementationOnce(
diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts
index 39728c71def..64b8a935b2a 100644
--- a/src/auto-reply/reply/followup-runner.ts
+++ b/src/auto-reply/reply/followup-runner.ts
@@ -43,8 +43,7 @@ import type { TypingController } from "./typing.js";
 
 const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]";
 const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool";
-const LEADING_MEDIA_ATTACHED_LINE_RE =
-  /^(?:\[media attached: \d+ files\]|\[media attached(?: \d+\/\d+)?: [^\r\n]*\])$/;
+const LEADING_MEDIA_ATTACHED_LINE_RE = /^\[media attached(?: \d+\/\d+)?: [^\r\n]*\]$/; // the generic tail also matches "[media attached: N files]"
 const FILE_BLOCK_RE = /<file\s+name="/i;
 
 function stripLeadingMediaAttachedLines(prompt: string): string {
@@ -87,6 +86,28 @@ function stripInlineDirectives(text: string | undefined): string {
   return parseInlineDirectives(text ?? "").cleaned.trim();
 }
 
+/**
+ * Returns the trimmed `updatedBody` with the trimmed `originalBody` span swapped
+ * for its `stripInlineDirectives` form; the rest of the text is left as-is.
+ */
+function normalizeUpdatedBody(params: { originalBody?: string; updatedBody?: string }): string {
+  const mutated = params.updatedBody?.trim();
+  if (!mutated) {
+    return "";
+  }
+  const userBody = params.originalBody?.trim();
+  // No original text, or one that strips to nothing: keep the updated body untouched.
+  const scrubbed = userBody ? stripInlineDirectives(userBody) : "";
+  if (!userBody || !scrubbed) {
+    return mutated;
+  }
+  if (mutated === userBody) {
+    return scrubbed;
+  }
+  const swapped = replaceLastOccurrence(mutated, userBody, scrubbed);
+  return (swapped ?? mutated).trim();
+}
+
 function rebuildQueuedPromptWithMediaUnderstanding(params: {
   prompt: string;
   originalBody?: string;
@@ -98,7 +119,10 @@ function rebuildQueuedPromptWithMediaUnderstanding(params: {
     stripped = stripLeadingMediaReplyHint(stripped);
   }
 
-  const updatedBody = stripInlineDirectives(params.updatedBody);
+  const updatedBody = normalizeUpdatedBody({
+    originalBody: params.originalBody,
+    updatedBody: params.updatedBody,
+  });
   if (!updatedBody) {
     return [params.mediaNote?.trim(), stripped].filter(Boolean).join("\n").trim();
   }
@@ -250,8 +274,11 @@ export function createFollowupRunner(params: {
       if (queued.mediaContext && !queued.mediaContext.MediaUnderstanding?.length) {
         const hasMedia = Boolean(
           queued.mediaContext.MediaPath?.trim() ||
+          queued.mediaContext.MediaUrl?.trim() ||
           (Array.isArray(queued.mediaContext.MediaPaths) &&
-            queued.mediaContext.MediaPaths.length > 0),
+            queued.mediaContext.MediaPaths.length > 0) ||
+          (Array.isArray(queued.mediaContext.MediaUrls) &&
+            queued.mediaContext.MediaUrls.length > 0),
         );
         if (hasMedia) {
           try {
@@ -281,6 +308,9 @@ export function createFollowupRunner(params: {
             });
             const shouldRebuildPrompt =
               muResult.outputs.length > 0 ||
+              muResult.appliedAudio ||
+              muResult.appliedImage ||
+              muResult.appliedVideo ||
               (muResult.appliedFile && !bodyAlreadyHasFileBlock);
             if (shouldRebuildPrompt) {
               // Rebuild the queued prompt from the mutated media context so the
diff --git a/src/auto-reply/reply/get-reply-run.media-only.test.ts b/src/auto-reply/reply/get-reply-run.media-only.test.ts
index 829b3937009..f519da10082 100644
--- a/src/auto-reply/reply/get-reply-run.media-only.test.ts
+++ b/src/auto-reply/reply/get-reply-run.media-only.test.ts
@@ -172,6 +172,45 @@ describe("runPreparedReply media-only handling", () => {
     expect(call?.followupRun.prompt).toContain("[User sent media without caption]");
   });
 
+  it("snapshots URL-only attachments into followup mediaContext", async () => {
+    await runPreparedReply(
+      baseParams({
+        ctx: { // inbound context exposes the attachment by URL only (no MediaPath/MediaPaths)
+          Body: "check this attachment",
+          RawBody: "check this attachment",
+          CommandBody: "check this attachment",
+          ThreadHistoryBody: "Earlier message in this thread",
+          OriginatingChannel: "slack",
+          OriginatingTo: "C123",
+          ChatType: "group",
+          MediaUrl: "https://cdn.example.com/input.png",
+          MediaUrls: ["https://cdn.example.com/input.png"],
+          MediaType: "image/png",
+          MediaTypes: ["image/png"],
+        },
+        sessionCtx: { // session context here carries no media fields
+          Body: "check this attachment",
+          BodyStripped: "check this attachment",
+          ThreadHistoryBody: "Earlier message in this thread",
+          Provider: "slack",
+          ChatType: "group",
+          OriginatingChannel: "slack",
+          OriginatingTo: "C123",
+        },
+      }),
+    );
+
+    const call = vi.mocked(runReplyAgent).mock.calls[0]?.[0]; // argument of the first runReplyAgent call
+    expect(call?.followupRun.mediaContext).toEqual( // URL and type fields must be copied into the snapshot
+      expect.objectContaining({
+        MediaUrl: "https://cdn.example.com/input.png",
+        MediaUrls: ["https://cdn.example.com/input.png"],
+        MediaType: "image/png",
+        MediaTypes: ["image/png"],
+      }),
+    );
+  });
+
   it("keeps thread history context on follow-up turns", async () => {
     const result = await runPreparedReply(
       baseParams({
@@ -186,6 +225,41 @@ describe("runPreparedReply media-only handling", () => {
     expect(call?.followupRun.prompt).toContain("Earlier message in this thread");
   });
 
+  it("snapshots mediaContext for URL-only deferred attachments", async () => {
+    await runPreparedReply(
+      baseParams({
+        ctx: { // empty text bodies: the URL attachment is the only content
+          Body: "",
+          RawBody: "",
+          CommandBody: "",
+          MediaUrl: "https://cdn.example.com/audio.ogg",
+          MediaUrls: ["https://cdn.example.com/audio.ogg"],
+          MediaType: "audio/ogg",
+          MediaTypes: ["audio/ogg"],
+          ThreadHistoryBody: "Earlier message in this thread",
+          OriginatingChannel: "slack",
+          OriginatingTo: "C123",
+          ChatType: "group",
+        },
+        sessionCtx: { // session context here carries no media fields
+          Body: "",
+          BodyStripped: "",
+          ThreadHistoryBody: "Earlier message in this thread",
+          Provider: "slack",
+          ChatType: "group",
+          OriginatingChannel: "slack",
+          OriginatingTo: "C123",
+        },
+      }),
+    );
+
+    const call = vi.mocked(runReplyAgent).mock.calls[0]?.[0]; // argument of the first runReplyAgent call
+    expect(call?.followupRun.mediaContext?.MediaUrl).toBe("https://cdn.example.com/audio.ogg");
+    expect(call?.followupRun.mediaContext?.MediaUrls).toEqual([
+      "https://cdn.example.com/audio.ogg",
+    ]);
+  });
+
   it("returns the empty-body reply when there is no text and no media", async () => {
     const result = await runPreparedReply(
       baseParams({
diff --git a/src/auto-reply/reply/get-reply-run.ts b/src/auto-reply/reply/get-reply-run.ts
index fe87e3919d0..b4b693b8119 100644
--- a/src/auto-reply/reply/get-reply-run.ts
+++ b/src/auto-reply/reply/get-reply-run.ts
@@ -310,7 +310,14 @@ export async function runPreparedReply(
     : [inboundUserContext, baseBodyFinal].filter(Boolean).join("\n\n");
   const baseBodyTrimmed = baseBodyForPrompt.trim();
   const hasMediaAttachment = Boolean(
-    sessionCtx.MediaPath || (sessionCtx.MediaPaths && sessionCtx.MediaPaths.length > 0),
+    sessionCtx.MediaPath ||
+    sessionCtx.MediaUrl ||
+    (sessionCtx.MediaPaths && sessionCtx.MediaPaths.length > 0) ||
+    (sessionCtx.MediaUrls && sessionCtx.MediaUrls.length > 0) ||
+    ctx.MediaPath?.trim() ||
+    ctx.MediaUrl?.trim() ||
+    (Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0) ||
+    (Array.isArray(ctx.MediaUrls) && ctx.MediaUrls.length > 0),
   );
   if (!baseBodyTrimmed && !hasMediaAttachment) {
     await typing.onReplyStart();
@@ -476,7 +483,10 @@ export async function runPreparedReply(
   // followup runner.  When MediaUnderstanding is already populated the runner
   // knows transcription already succeeded and skips re-application.
   const hasMediaAttachments = Boolean(
-    ctx.MediaPath?.trim() || (Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0),
+    ctx.MediaPath?.trim() ||
+    ctx.MediaUrl?.trim() ||
+    (Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0) ||
+    (Array.isArray(ctx.MediaUrls) && ctx.MediaUrls.length > 0),
   );
   const mediaContext = hasMediaAttachments
     ? {