From 3bf55561cbb972df5c122da75ce7a26e7b947c8b Mon Sep 17 00:00:00 2001 From: Joey Krug Date: Sat, 14 Mar 2026 13:40:52 -0400 Subject: [PATCH 01/17] fix: apply media understanding to followup-queued messages (#44682) Voice notes arriving while the agent is mid-turn were queued as followup messages without audio transcription. The followup runner called runEmbeddedPiAgent directly, bypassing applyMediaUnderstanding. This adds a mediaContext field to FollowupRun that snapshots the original message's media fields. Before the agent run, the followup runner checks whether media understanding was applied. If not (empty MediaUnderstanding), it calls applyMediaUnderstanding and rebuilds the prompt with the transcript, matching the primary path's formatting. Co-Authored-By: Claude Opus 4.6 --- src/auto-reply/reply/followup-runner.test.ts | 285 ++++++++++++++++++- src/auto-reply/reply/followup-runner.ts | 65 ++++- src/auto-reply/reply/get-reply-run.ts | 34 ++- src/auto-reply/reply/queue.ts | 5 +- src/auto-reply/reply/queue/types.ts | 37 +++ 5 files changed, 416 insertions(+), 10 deletions(-) diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts index 0e93ab156a8..f0a060af4ac 100644 --- a/src/auto-reply/reply/followup-runner.test.ts +++ b/src/auto-reply/reply/followup-runner.test.ts @@ -2,7 +2,7 @@ import fs from "node:fs/promises"; import { tmpdir } from "node:os"; import path from "node:path"; import { beforeEach, describe, expect, it, vi } from "vitest"; -import { loadSessionStore, saveSessionStore, type SessionEntry } from "../../config/sessions.js"; +import { loadSessionStore, type SessionEntry, saveSessionStore } from "../../config/sessions.js"; import type { FollowupRun } from "./queue.js"; import * as sessionRunAccounting from "./session-run-accounting.js"; import { createMockFollowupRun, createMockTypingController } from "./test-helpers.js"; @@ -10,6 +10,7 @@ import { createMockFollowupRun, createMockTypingController } from "./test-helper const runEmbeddedPiAgentMock = vi.fn(); const routeReplyMock = vi.fn(); const isRoutableChannelMock = vi.fn(); +const applyMediaUnderstandingMock = vi.fn(); vi.mock( "../../agents/model-fallback.js", @@ -20,6 +21,10 @@ vi.mock("../../agents/pi-embedded.js", () => ({ runEmbeddedPiAgent: (params: unknown) => runEmbeddedPiAgentMock(params), })); +vi.mock("../../media-understanding/apply.js", () => ({ + applyMediaUnderstanding: (params: unknown) => applyMediaUnderstandingMock(params), +})); + vi.mock("./route-reply.js", async (importOriginal) => { const actual = await importOriginal(); return { @@ -48,13 +53,24 @@ beforeEach(() => { isRoutableChannelMock.mockImplementation((ch: string | undefined) => Boolean(ch?.trim() && ROUTABLE_TEST_CHANNELS.has(ch.trim().toLowerCase())), ); + applyMediaUnderstandingMock.mockReset(); + applyMediaUnderstandingMock.mockResolvedValue({ + outputs: [], + decisions: [], + appliedImage: false, + appliedAudio: false, + appliedVideo: false, + appliedFile: false, + }); }); const baseQueuedRun = (messageProvider = "whatsapp"): FollowupRun => createMockFollowupRun({ run: { messageProvider } }); function createQueuedRun( - overrides: Partial> & { run?: Partial } = {}, + overrides: Partial> & { + run?: Partial; + } = {}, ): FollowupRun { return createMockFollowupRun(overrides); } @@ -401,7 +417,12 @@ describe("createFollowupRunner messaging tool dedupe", () => { agentResult: { ...makeTextReplyDedupeResult(), messagingToolSentTargets: [ - { tool: "telegram", provider: "telegram", to: "268300329", accountId: "work" }, + { + tool: "telegram", + provider: "telegram", + to: "268300329", + accountId: "work", + }, ], }, queued: { @@ -451,8 +472,13 @@ describe("createFollowupRunner messaging tool dedupe", () => { "sessions.json", ); const sessionKey = "main"; - const sessionEntry: SessionEntry = { sessionId: "session", updatedAt: Date.now() }; - const sessionStore: Record = { [sessionKey]: sessionEntry }; + const sessionEntry: SessionEntry = { + sessionId: "session", + updatedAt: Date.now(), + }; + const sessionStore: Record = { + [sessionKey]: sessionEntry, + }; await saveSessionStore(storePath, sessionStore); const { onBlockReply } = await runMessagingCase({ @@ -704,7 +730,254 @@ describe("createFollowupRunner agentDir forwarding", () => { }); expect(runEmbeddedPiAgentMock).toHaveBeenCalledTimes(1); - const call = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { agentDir?: string }; + const call = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + agentDir?: string; + }; expect(call?.agentDir).toBe(agentDir); }); }); + +describe("createFollowupRunner media understanding", () => { + it("applies audio transcription when mediaContext has untranscribed audio", async () => { + const transcriptText = "Hello, this is a voice note."; + // The real applyMediaUnderstanding mutates the ctx; the mock must do the same + // so buildInboundMediaNote sees MediaUnderstanding and suppresses the audio line. + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = transcriptText; + return { + outputs: [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "Got it!" }], + meta: {}, + }); + + const onBlockReply = vi.fn(async () => {}); + const runner = createFollowupRunner({ + opts: { onBlockReply }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + const queued = createQueuedRun({ + prompt: "[media attached: /tmp/voice.ogg (audio/ogg)]\nsome text", + mediaContext: { + Body: "some text", + MediaPaths: ["/tmp/voice.ogg"], + MediaTypes: ["audio/ogg"], + // MediaUnderstanding is empty — transcription not yet applied + }, + }); + await runner(queued); + + // applyMediaUnderstanding should have been called + expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1); + expect(applyMediaUnderstandingMock).toHaveBeenCalledWith( + expect.objectContaining({ + cfg: queued.run.config, + agentDir: queued.run.agentDir, + }), + ); + + // The prompt passed to the agent should include the transcript, not the + // raw audio attachment line. + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain(transcriptText); + expect(agentCall?.prompt).not.toContain("[media attached: /tmp/voice.ogg"); + + expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "Got it!" })); + }); + + it("skips media understanding when MediaUnderstanding is already populated", async () => { + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "reply" }], + meta: {}, + }); + + const onBlockReply = vi.fn(async () => {}); + const runner = createFollowupRunner({ + opts: { onBlockReply }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + const queued = createQueuedRun({ + prompt: "[Audio]\nTranscript:\nAlready transcribed.\n\nsome text", + mediaContext: { + Body: "some text", + MediaPaths: ["/tmp/voice.ogg"], + MediaTypes: ["audio/ogg"], + // MediaUnderstanding already populated — transcription was applied in primary path + MediaUnderstanding: [ + { + kind: "audio.transcription", + text: "Already transcribed.", + attachmentIndex: 0, + provider: "whisper", + }, + ], + }, + }); + await runner(queued); + + // Should NOT re-run media understanding + expect(applyMediaUnderstandingMock).not.toHaveBeenCalled(); + + // The original prompt should be passed through unchanged + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("Already transcribed."); + }); + + it("skips media understanding when no mediaContext is present", async () => { + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "reply" }], + meta: {}, + }); + + const onBlockReply = vi.fn(async () => {}); + const runner = createFollowupRunner({ + opts: { onBlockReply }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + // No mediaContext (plain text message) + const queued = createQueuedRun({ prompt: "just text" }); + await runner(queued); + + expect(applyMediaUnderstandingMock).not.toHaveBeenCalled(); + }); + + it("continues with raw prompt when media understanding fails", async () => { + applyMediaUnderstandingMock.mockRejectedValueOnce(new Error("transcription service down")); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "fallback reply" }], + meta: {}, + }); + + const onBlockReply = vi.fn(async () => {}); + const runner = createFollowupRunner({ + opts: { onBlockReply }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + const originalPrompt = "[media attached: /tmp/voice.ogg (audio/ogg)]\nsome text"; + const queued = createQueuedRun({ + prompt: originalPrompt, + mediaContext: { + Body: "some text", + MediaPaths: ["/tmp/voice.ogg"], + MediaTypes: ["audio/ogg"], + }, + }); + await runner(queued); + + // Should have attempted media understanding + expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1); + + // Agent should still run with the original prompt + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toBe(originalPrompt); + + expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "fallback reply" })); + }); + + it("preserves non-audio media lines when only audio is transcribed", async () => { + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + // Simulate transcription updating the context + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: "voice transcript", + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = "voice transcript"; + return { + outputs: [ + { + kind: "audio.transcription", + text: "voice transcript", + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "got both" }], + meta: {}, + }); + + const onBlockReply = vi.fn(async () => {}); + const runner = createFollowupRunner({ + opts: { onBlockReply }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + const queued = createQueuedRun({ + prompt: + "[media attached: 2 files]\n[media attached 1/2: /tmp/voice.ogg (audio/ogg)]\n[media attached 2/2: /tmp/photo.jpg (image/jpeg)]\nsome text", + mediaContext: { + Body: "some text", + MediaPaths: ["/tmp/voice.ogg", "/tmp/photo.jpg"], + MediaTypes: ["audio/ogg", "image/jpeg"], + }, + }); + await runner(queued); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + // Audio attachment line should be stripped + expect(agentCall?.prompt).not.toContain("voice.ogg"); + // Image attachment line should also be stripped (all media-attached lines are + // removed and replaced by the new buildInboundMediaNote output) + // The transcript should be present + expect(agentCall?.prompt).toContain("voice transcript"); + }); +}); diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts index 2fd21607095..1e65380a020 100644 --- a/src/auto-reply/reply/followup-runner.ts +++ b/src/auto-reply/reply/followup-runner.ts @@ -13,10 +13,13 @@ import type { SessionEntry } from "../../config/sessions.js"; import type { TypingMode } from "../../config/types.js"; import { logVerbose } from "../../globals.js"; import { registerAgentRunContext } from "../../infra/agent-events.js"; +import { applyMediaUnderstanding } from "../../media-understanding/apply.js"; +import { formatMediaUnderstandingBody } from "../../media-understanding/format.js"; import { defaultRuntime } from "../../runtime.js"; import { isInternalMessageChannel } from "../../utils/message-channel.js"; import { stripHeartbeatToken } from "../heartbeat.js"; -import type { OriginatingChannelType } from "../templating.js"; +import { buildInboundMediaNote } from "../media-note.js"; +import type { MsgContext, OriginatingChannelType } from "../templating.js"; import { isSilentReplyText, SILENT_REPLY_TOKEN } from "../tokens.js"; import type { GetReplyOptions, ReplyPayload } from "../types.js"; import { resolveRunAuthProfile } from "./agent-runner-utils.js"; @@ -157,6 +160,66 @@ export function createFollowupRunner(params: { let bootstrapPromptWarningSignaturesSeen = resolveBootstrapWarningSignaturesSeen( activeSessionEntry?.systemPromptReport, ); + + // Apply media understanding for followup-queued messages when it was + // not applied (or failed) in the primary path. This ensures voice + // notes that arrived while the agent was mid-turn still get transcribed. + if (queued.mediaContext && !queued.mediaContext.MediaUnderstanding?.length) { + const hasMedia = Boolean( + queued.mediaContext.MediaPath?.trim() || + (Array.isArray(queued.mediaContext.MediaPaths) && + queued.mediaContext.MediaPaths.length > 0), + ); + if (hasMedia) { + try { + const mediaCtx = { ...queued.mediaContext } as MsgContext; + const muResult = await applyMediaUnderstanding({ + ctx: mediaCtx, + cfg: queued.run.config, + agentDir: queued.run.agentDir, + activeModel: { + provider: queued.run.provider, + model: queued.run.model, + }, + }); + if (muResult.outputs.length > 0) { + // Rebuild the prompt with media understanding results baked in, + // matching the primary path's formatting. + const newMediaNote = buildInboundMediaNote(mediaCtx); + const transcriptBody = formatMediaUnderstandingBody({ + body: undefined, + outputs: muResult.outputs, + }); + + // Strip existing [media attached ...] lines from the prompt so + // they can be replaced by the updated media note (which excludes + // successfully-understood attachments like transcribed audio). + const stripped = queued.prompt + .replace(/\[media attached: \d+ files\]\n?/g, "") + .replace(/\[media attached[^\]]*\]\n?/g, ""); + + const parts: string[] = []; + if (newMediaNote) { + parts.push(newMediaNote); + } + if (transcriptBody) { + parts.push(transcriptBody); + } + parts.push(stripped.trim()); + queued.prompt = parts.filter(Boolean).join("\n\n"); + + logVerbose( + `followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage})`, + ); + } + } catch (err) { + logVerbose( + `followup: media understanding failed, proceeding with raw content: ${err instanceof Error ? err.message : String(err)}`, + ); + } + } + } + try { const fallbackResult = await runWithModelFallback({ cfg: queued.run.config, diff --git a/src/auto-reply/reply/get-reply-run.ts b/src/auto-reply/reply/get-reply-run.ts index c8451fd88f6..fe87e3919d0 100644 --- a/src/auto-reply/reply/get-reply-run.ts +++ b/src/auto-reply/reply/get-reply-run.ts @@ -387,7 +387,7 @@ export async function runPreparedReply( const mediaReplyHint = mediaNote ? "To send an image back, prefer the message tool (media/path/filePath). If you must inline, use MEDIA:https://example.com/image.jpg (spaces ok, quote if needed) or a safe relative path like MEDIA:./image.jpg. Avoid absolute paths (MEDIA:/...) and ~ paths — they are blocked for security. Keep caption in the text body." : undefined; - let prefixedCommandBody = mediaNote + const prefixedCommandBody = mediaNote ? [mediaNote, mediaReplyHint, prefixedBody ?? ""].filter(Boolean).join("\n").trim() : prefixedBody; if (!resolvedThinkLevel) { @@ -472,11 +472,43 @@ export async function runPreparedReply( isNewSession, }); const authProfileIdSource = sessionEntry?.authProfileOverrideSource; + // Snapshot media-related context for deferred media understanding in the + // followup runner. When MediaUnderstanding is already populated the runner + // knows transcription already succeeded and skips re-application. + const hasMediaAttachments = Boolean( + ctx.MediaPath?.trim() || (Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0), + ); + const mediaContext = hasMediaAttachments + ? { + Body: ctx.Body, + CommandBody: ctx.CommandBody, + RawBody: ctx.RawBody, + MediaPath: ctx.MediaPath, + MediaUrl: ctx.MediaUrl, + MediaType: ctx.MediaType, + MediaDir: ctx.MediaDir, + MediaPaths: ctx.MediaPaths ? [...ctx.MediaPaths] : undefined, + MediaUrls: ctx.MediaUrls ? [...ctx.MediaUrls] : undefined, + MediaTypes: ctx.MediaTypes ? [...ctx.MediaTypes] : undefined, + MediaRemoteHost: ctx.MediaRemoteHost, + Transcript: ctx.Transcript, + MediaUnderstanding: ctx.MediaUnderstanding ? [...ctx.MediaUnderstanding] : undefined, + MediaUnderstandingDecisions: ctx.MediaUnderstandingDecisions + ? [...ctx.MediaUnderstandingDecisions] + : undefined, + OriginatingChannel: ctx.OriginatingChannel, + OriginatingTo: ctx.OriginatingTo, + AccountId: ctx.AccountId, + MessageThreadId: ctx.MessageThreadId, + } + : undefined; + const followupRun = { prompt: queuedBody, messageId: sessionCtx.MessageSidFull ?? sessionCtx.MessageSid, summaryLine: baseBodyTrimmedRaw, enqueuedAt: Date.now(), + mediaContext, // Originating channel for reply routing. originatingChannel: ctx.OriginatingChannel, originatingTo: ctx.OriginatingTo, diff --git a/src/auto-reply/reply/queue.ts b/src/auto-reply/reply/queue.ts index b097b6c5193..5c3a56f64af 100644 --- a/src/auto-reply/reply/queue.ts +++ b/src/auto-reply/reply/queue.ts @@ -1,6 +1,6 @@ -export { extractQueueDirective } from "./queue/directive.js"; -export { clearSessionQueues } from "./queue/cleanup.js"; export type { ClearSessionQueueResult } from "./queue/cleanup.js"; +export { clearSessionQueues } from "./queue/cleanup.js"; +export { extractQueueDirective } from "./queue/directive.js"; export { scheduleFollowupDrain } from "./queue/drain.js"; export { enqueueFollowupRun, @@ -10,6 +10,7 @@ export { export { resolveQueueSettings } from "./queue/settings.js"; export { clearFollowupQueue } from "./queue/state.js"; export type { + FollowupMediaContext, FollowupRun, QueueDedupeMode, QueueDropPolicy, diff --git a/src/auto-reply/reply/queue/types.ts b/src/auto-reply/reply/queue/types.ts index 507f77d499d..637ceb0a149 100644 --- a/src/auto-reply/reply/queue/types.ts +++ b/src/auto-reply/reply/queue/types.ts @@ -2,6 +2,10 @@ import type { ExecToolDefaults } from "../../../agents/bash-tools.js"; import type { SkillSnapshot } from "../../../agents/skills.js"; import type { OpenClawConfig } from "../../../config/config.js"; import type { SessionEntry } from "../../../config/sessions.js"; +import type { + MediaUnderstandingDecision, + MediaUnderstandingOutput, +} from "../../../media-understanding/types.js"; import type { InputProvenance } from "../../../sessions/input-provenance.js"; import type { OriginatingChannelType } from "../../templating.js"; import type { ElevatedLevel, ReasoningLevel, ThinkLevel, VerboseLevel } from "../directives.js"; @@ -19,12 +23,45 @@ export type QueueSettings = { export type QueueDedupeMode = "message-id" | "prompt" | "none"; +/** + * Snapshot of media-related context fields carried on a FollowupRun so that + * the followup runner can apply media understanding (e.g. voice-note + * transcription) when it was not applied — or failed — in the primary path. + */ +export type FollowupMediaContext = { + Body?: string; + CommandBody?: string; + RawBody?: string; + MediaPath?: string; + MediaUrl?: string; + MediaType?: string; + MediaDir?: string; + MediaPaths?: string[]; + MediaUrls?: string[]; + MediaTypes?: string[]; + MediaRemoteHost?: string; + Transcript?: string; + MediaUnderstanding?: MediaUnderstandingOutput[]; + MediaUnderstandingDecisions?: MediaUnderstandingDecision[]; + OriginatingChannel?: OriginatingChannelType; + OriginatingTo?: string; + AccountId?: string; + MessageThreadId?: string | number; +}; + export type FollowupRun = { prompt: string; /** Provider message ID, when available (for deduplication). */ messageId?: string; summaryLine?: string; enqueuedAt: number; + /** + * Media context snapshot from the original inbound message. + * When present and MediaUnderstanding is empty, the followup runner will + * attempt to apply media understanding (audio transcription, etc.) before + * passing the prompt to the agent. + */ + mediaContext?: FollowupMediaContext; /** * Originating channel for reply routing. * When set, replies should be routed back to this provider From 6edb3b7e345573df8fe92485bd6d45e604de29b9 Mon Sep 17 00:00:00 2001 From: Joey Krug Date: Sat, 14 Mar 2026 15:29:55 -0400 Subject: [PATCH 02/17] fix queued media-understanding prompt rebuild --- src/auto-reply/reply/followup-runner.test.ts | 170 ++++++++++++++++++- src/auto-reply/reply/followup-runner.ts | 116 ++++++++++--- 2 files changed, 260 insertions(+), 26 deletions(-) diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts index f0a060af4ac..71451526400 100644 --- a/src/auto-reply/reply/followup-runner.test.ts +++ b/src/auto-reply/reply/followup-runner.test.ts @@ -67,6 +67,9 @@ beforeEach(() => { const baseQueuedRun = (messageProvider = "whatsapp"): FollowupRun => createMockFollowupRun({ run: { messageProvider } }); +const MEDIA_REPLY_HINT = + "To send an image back, prefer the message tool (media/path/filePath). If you must inline, use MEDIA:https://example.com/image.jpg (spaces ok, quote if needed) or a safe relative path like MEDIA:./image.jpg. Avoid absolute paths (MEDIA:/...) and ~ paths — they are blocked for security. Keep caption in the text body."; + function createQueuedRun( overrides: Partial> & { run?: Partial; @@ -741,7 +744,7 @@ describe("createFollowupRunner media understanding", () => { it("applies audio transcription when mediaContext has untranscribed audio", async () => { const transcriptText = "Hello, this is a voice note."; // The real applyMediaUnderstanding mutates the ctx; the mock must do the same - // so buildInboundMediaNote sees MediaUnderstanding and suppresses the audio line. + // so buildInboundMediaNote and queued prompt rebuilding see the transcribed body. applyMediaUnderstandingMock.mockImplementationOnce( async (params: { ctx: Record }) => { params.ctx.MediaUnderstanding = [ @@ -753,6 +756,7 @@ describe("createFollowupRunner media understanding", () => { }, ]; params.ctx.Transcript = transcriptText; + params.ctx.Body = `[Audio]\nUser text:\nsome text\nTranscript:\n${transcriptText}`; return { outputs: [ { @@ -929,6 +933,7 @@ describe("createFollowupRunner media understanding", () => { }, ]; params.ctx.Transcript = "voice transcript"; + params.ctx.Body = "[Audio]\nUser text:\nsome text\nTranscript:\nvoice transcript"; return { outputs: [ { @@ -980,4 +985,167 @@ describe("createFollowupRunner media understanding", () => { // The transcript should be present expect(agentCall?.prompt).toContain("voice transcript"); }); + + it("strips queued media lines when attachment paths or URLs contain a literal closing bracket", async () => { + const transcriptText = "Bracket-safe transcript"; + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = transcriptText; + params.ctx.Body = `[Audio]\nUser text:\nsome text\nTranscript:\n${transcriptText}`; + return { + outputs: [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "ok" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: + "[media attached: /tmp/voice[0].ogg (audio/ogg) | https://cdn.example.com/files[0].ogg?sig=abc]123]\n" + + MEDIA_REPLY_HINT + + "\n" + + "some text", + mediaContext: { + Body: "some text", + MediaPaths: ["/tmp/voice[0].ogg"], + MediaUrls: ["https://cdn.example.com/files[0].ogg?sig=abc]123"], + MediaTypes: ["audio/ogg"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain(transcriptText); + expect(agentCall?.prompt).not.toContain("/tmp/voice[0].ogg"); + expect(agentCall?.prompt).not.toContain("https://cdn.example.com/files[0].ogg?sig=abc]123"); + expect(agentCall?.prompt).not.toContain(MEDIA_REPLY_HINT); + }); + + it("preserves file-only media understanding when outputs are empty", async () => { + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.Body = + '\nQuarterly report body\n'; + return { + outputs: [], + decisions: [], + appliedImage: false, + appliedAudio: false, + appliedVideo: false, + appliedFile: true, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\n[User sent media without caption]`, + mediaContext: { + Body: "", + MediaPaths: ["/tmp/report.pdf"], + MediaTypes: ["application/pdf"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("[media attached: /tmp/report.pdf (application/pdf)]"); + expect(agentCall?.prompt).toContain(MEDIA_REPLY_HINT); + expect(agentCall?.prompt).toContain(''); + expect(agentCall?.prompt).toContain("Quarterly report body"); + expect(agentCall?.prompt).not.toContain("[User sent media without caption]"); + }); + + it("replaces the queued body when inline directives were already stripped from the prompt", async () => { + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.Body = + '/think high summarize this\n\n\nreport\n'; + return { + outputs: [], + decisions: [], + appliedImage: false, + appliedAudio: false, + appliedVideo: false, + appliedFile: true, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nsummarize this`, + mediaContext: { + Body: "/think high summarize this", + MediaPaths: ["/tmp/report.pdf"], + MediaTypes: ["application/pdf"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("summarize this"); + expect(agentCall?.prompt).toContain(''); + expect(agentCall?.prompt).not.toContain("summarize this\n\n/think high summarize this"); + expect(agentCall?.prompt).not.toContain("/think high summarize this"); + }); }); diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts index 1e65380a020..ab2169fc09e 100644 --- a/src/auto-reply/reply/followup-runner.ts +++ b/src/auto-reply/reply/followup-runner.ts @@ -14,7 +14,6 @@ import type { TypingMode } from "../../config/types.js"; import { logVerbose } from "../../globals.js"; import { registerAgentRunContext } from "../../infra/agent-events.js"; import { applyMediaUnderstanding } from "../../media-understanding/apply.js"; -import { formatMediaUnderstandingBody } from "../../media-understanding/format.js"; import { defaultRuntime } from "../../runtime.js"; import { isInternalMessageChannel } from "../../utils/message-channel.js"; import { stripHeartbeatToken } from "../heartbeat.js"; @@ -23,6 +22,7 @@ import type { MsgContext, OriginatingChannelType } from "../templating.js"; import { isSilentReplyText, SILENT_REPLY_TOKEN } from "../tokens.js"; import type { GetReplyOptions, ReplyPayload } from "../types.js"; import { resolveRunAuthProfile } from "./agent-runner-utils.js"; +import { parseInlineDirectives } from "./directive-handling.js"; import { resolveOriginAccountId, resolveOriginMessageProvider, @@ -41,6 +41,86 @@ import { incrementRunCompactionCount, persistRunSessionUsage } from "./session-r import { createTypingSignaler } from "./typing-mode.js"; import type { TypingController } from "./typing.js"; +const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]"; +const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool"; + +function stripLeadingMediaAttachedLines(prompt: string): string { + const lines = prompt.split("\n"); + let index = 0; + while (index < lines.length) { + const trimmed = lines[index]?.trim() ?? ""; + if (!trimmed.startsWith("[media attached") || !trimmed.endsWith("]")) { + break; + } + index += 1; + } + return lines.slice(index).join("\n").trim(); +} + +function stripLeadingMediaReplyHint(prompt: string): string { + const lines = prompt.split("\n"); + if ((lines[0] ?? "").startsWith(MEDIA_REPLY_HINT_PREFIX)) { + return lines.slice(1).join("\n").trim(); + } + return prompt.trim(); +} + +function replaceLastOccurrence( + value: string, + search: string, + replacement: string, +): string | undefined { + if (!search) { + return undefined; + } + const index = value.lastIndexOf(search); + if (index < 0) { + return undefined; + } + return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`; +} + +function stripInlineDirectives(text: string | undefined): string { + return parseInlineDirectives(text ?? "").cleaned.trim(); +} + +function rebuildQueuedPromptWithMediaUnderstanding(params: { + prompt: string; + originalBody?: string; + updatedBody?: string; + mediaNote?: string; +}): string { + let stripped = stripLeadingMediaAttachedLines(params.prompt); + if (!params.mediaNote) { + stripped = stripLeadingMediaReplyHint(stripped); + } + + const updatedBody = stripInlineDirectives(params.updatedBody); + if (!updatedBody) { + return [params.mediaNote?.trim(), stripped].filter(Boolean).join("\n").trim(); + } + + const replacementTargets = [ + params.originalBody?.trim(), + stripInlineDirectives(params.originalBody), + MEDIA_ONLY_PLACEHOLDER, + ].filter( + (value, index, list): value is string => Boolean(value) && list.indexOf(value) === index, + ); + + let rebuilt = stripped; + for (const target of replacementTargets) { + const replaced = replaceLastOccurrence(rebuilt, target, updatedBody); + if (replaced !== undefined) { + rebuilt = replaced; + return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim(); + } + } + + rebuilt = [rebuilt, updatedBody].filter(Boolean).join("\n\n"); + return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim(); +} + export function createFollowupRunner(params: { opts?: GetReplyOptions; typing: TypingController; @@ -173,6 +253,7 @@ export function createFollowupRunner(params: { if (hasMedia) { try { const mediaCtx = { ...queued.mediaContext } as MsgContext; + const originalBody = mediaCtx.Body; const muResult = await applyMediaUnderstanding({ ctx: mediaCtx, cfg: queued.run.config, @@ -182,34 +263,19 @@ export function createFollowupRunner(params: { model: queued.run.model, }, }); - if (muResult.outputs.length > 0) { - // Rebuild the prompt with media understanding results baked in, - // matching the primary path's formatting. + if (muResult.outputs.length > 0 || muResult.appliedFile) { + // Rebuild the queued prompt from the mutated media context so the + // deferred path matches the primary path's prompt shape. const newMediaNote = buildInboundMediaNote(mediaCtx); - const transcriptBody = formatMediaUnderstandingBody({ - body: undefined, - outputs: muResult.outputs, + queued.prompt = rebuildQueuedPromptWithMediaUnderstanding({ + prompt: queued.prompt, + originalBody, + updatedBody: mediaCtx.Body, + mediaNote: newMediaNote, }); - // Strip existing [media attached ...] lines from the prompt so - // they can be replaced by the updated media note (which excludes - // successfully-understood attachments like transcribed audio). - const stripped = queued.prompt - .replace(/\[media attached: \d+ files\]\n?/g, "") - .replace(/\[media attached[^\]]*\]\n?/g, ""); - - const parts: string[] = []; - if (newMediaNote) { - parts.push(newMediaNote); - } - if (transcriptBody) { - parts.push(transcriptBody); - } - parts.push(stripped.trim()); - queued.prompt = parts.filter(Boolean).join("\n\n"); - logVerbose( - `followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage})`, + `followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage}, video=${muResult.appliedVideo}, file=${muResult.appliedFile})`, ); } } catch (err) { From be3eec46e23c394d9405f80121c79fd2212f4af5 Mon Sep 17 00:00:00 2001 From: Joey Krug Date: Sat, 14 Mar 2026 15:45:17 -0400 Subject: [PATCH 03/17] fix: rebuild queued followup media prompts --- src/auto-reply/reply/followup-runner.test.ts | 180 +++++++++++++++++++ src/auto-reply/reply/followup-runner.ts | 21 ++- 2 files changed, 196 insertions(+), 5 deletions(-) diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts index 71451526400..334bf9b6a13 100644 --- a/src/auto-reply/reply/followup-runner.test.ts +++ b/src/auto-reply/reply/followup-runner.test.ts @@ -818,6 +818,138 @@ describe("createFollowupRunner media understanding", () => { expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "Got it!" })); }); + it("strips the full media line when attachment paths or URLs contain brackets", async () => { + const transcriptText = "Bracket-safe transcript"; + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = transcriptText; + params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}`; + return { + outputs: [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "done" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: + "[media attached: /tmp/voice[0].ogg (audio/ogg) | https://cdn.example.com/files[0].ogg]\nsome text", + mediaContext: { + Body: "some text", + CommandBody: "some text", + RawBody: "some text", + MediaPaths: ["/tmp/voice[0].ogg"], + MediaUrls: ["https://cdn.example.com/files[0].ogg"], + MediaTypes: ["audio/ogg"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain(transcriptText); + expect(agentCall?.prompt).not.toContain("[media attached:"); + expect(agentCall?.prompt).not.toContain("files[0].ogg]"); + }); + + it("only strips leading synthetic media lines and preserves literal user text later in the prompt", async () => { + const transcriptText = "Transcript with literal token"; + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = transcriptText; + params.ctx.Body = "I literally typed [media attached: keep me] in this message."; + return { + outputs: [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "done" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: + "[media attached: /tmp/voice.ogg (audio/ogg)]\nI literally typed [media attached: keep me] in this message.", + mediaContext: { + Body: "I literally typed [media attached: keep me] in this message.", + CommandBody: "I literally typed [media attached: keep me] in this message.", + RawBody: "I literally typed [media attached: keep me] in this message.", + MediaPaths: ["/tmp/voice.ogg"], + MediaTypes: ["audio/ogg"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain( + "I literally typed [media attached: keep me] in this message.", + ); + expect(agentCall?.prompt).not.toContain("[media attached: /tmp/voice.ogg (audio/ogg)]"); + }); + it("skips media understanding when MediaUnderstanding is already populated", async () => { runEmbeddedPiAgentMock.mockResolvedValueOnce({ payloads: [{ text: "reply" }], @@ -920,6 +1052,54 @@ describe("createFollowupRunner media understanding", () => { expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "fallback reply" })); }); + it("rebuilds the prompt when file extraction succeeds without media outputs", async () => { + const fileBlock = '\nline one\n'; + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.Body = `some text\n\n${fileBlock}`; + return { + outputs: [], + decisions: [], + appliedImage: false, + appliedAudio: false, + appliedVideo: false, + appliedFile: true, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "file processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: "[media attached: /tmp/notes.txt (text/plain)]\nsome text", + mediaContext: { + Body: "some text", + CommandBody: "some text", + RawBody: "some text", + MediaPaths: ["/tmp/notes.txt"], + MediaTypes: ["text/plain"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("[media attached: /tmp/notes.txt (text/plain)]"); + expect(agentCall?.prompt).toContain(fileBlock); + expect(agentCall?.prompt?.match(/ { applyMediaUnderstandingMock.mockImplementationOnce( async (params: { ctx: Record }) => { diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts index ab2169fc09e..94047155a61 100644 --- a/src/auto-reply/reply/followup-runner.ts +++ b/src/auto-reply/reply/followup-runner.ts @@ -43,13 +43,16 @@ import type { TypingController } from "./typing.js"; const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]"; const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool"; +const LEADING_MEDIA_ATTACHED_LINE_RE = + /^(?:\[media attached: \d+ files\]|\[media attached(?: \d+\/\d+)?: [^\r\n]*\])$/; +const FILE_BLOCK_RE = / 0 || muResult.appliedFile) { + const shouldRebuildPrompt = + muResult.outputs.length > 0 || + (muResult.appliedFile && !FILE_BLOCK_RE.test(queued.prompt)); + if (shouldRebuildPrompt) { // Rebuild the queued prompt from the mutated media context so the // deferred path matches the primary path's prompt shape. const newMediaNote = buildInboundMediaNote(mediaCtx); @@ -273,7 +285,6 @@ export function createFollowupRunner(params: { updatedBody: mediaCtx.Body, mediaNote: newMediaNote, }); - logVerbose( `followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage}, video=${muResult.appliedVideo}, file=${muResult.appliedFile})`, ); From 67e90527e15ebb47ab00ad3e0283e99d6a94585a Mon Sep 17 00:00:00 2001 From: Joey Krug Date: Sat, 14 Mar 2026 16:56:36 -0400 Subject: [PATCH 04/17] fix: narrow FILE_BLOCK_RE, align originalBody, check body not prompt --- src/auto-reply/reply/followup-runner.test.ts | 109 +++++++++++++++++++ src/auto-reply/reply/followup-runner.ts | 21 ++-- 2 files changed, 123 insertions(+), 7 deletions(-) diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts index 334bf9b6a13..a0c5306380d 100644 --- a/src/auto-reply/reply/followup-runner.test.ts +++ b/src/auto-reply/reply/followup-runner.test.ts @@ -1328,4 +1328,113 @@ describe("createFollowupRunner media understanding", () => { expect(agentCall?.prompt).not.toContain("summarize this\n\n/think high summarize this"); expect(agentCall?.prompt).not.toContain("/think high summarize this"); }); + + it("does not false-positive on user text containing literal ' { + const fileBlock = '\ncol1,col2\n1,2\n'; + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.Body = `check my {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + // User message contains literal " { + const fileBlock = '\nreport content\n'; + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + // applyMediaUnderstanding mutates the resolved body (which is CommandBody) + params.ctx.Body = `summarize this\n\n${fileBlock}`; + return { + outputs: [], + decisions: [], + appliedImage: false, + appliedAudio: false, + appliedVideo: false, + appliedFile: true, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + // Body has directive prefix; CommandBody has the cleaned version. + // The prompt was built from CommandBody, so originalBody should match CommandBody + // for accurate replacement. + await runner( + createQueuedRun({ + prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nsummarize this`, + mediaContext: { + Body: "/think high summarize this", + CommandBody: "summarize this", + RawBody: "/think high summarize this", + MediaPaths: ["/tmp/report.pdf"], + MediaTypes: ["application/pdf"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + // File block should be present (extraction succeeded) + expect(agentCall?.prompt).toContain(fileBlock); + // The body text should appear once, not duplicated + expect(agentCall?.prompt).toContain("summarize this"); + // Should NOT contain the directive prefix + expect(agentCall?.prompt).not.toContain("/think high"); + // The body should not be duplicated (would happen if originalBody didn't match) + const matches = agentCall?.prompt?.match(/summarize this/g); + expect(matches?.length).toBe(1); + }); }); diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts index 94047155a61..39728c71def 100644 --- a/src/auto-reply/reply/followup-runner.ts +++ b/src/auto-reply/reply/followup-runner.ts @@ -45,7 +45,7 @@ const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]"; const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool"; const LEADING_MEDIA_ATTACHED_LINE_RE = /^(?:\[media attached: \d+ files\]|\[media attached(?: \d+\/\d+)?: [^\r\n]*\])$/; -const FILE_BLOCK_RE = / 0 || - (muResult.appliedFile && !FILE_BLOCK_RE.test(queued.prompt)); + (muResult.appliedFile && !bodyAlreadyHasFileBlock); if (shouldRebuildPrompt) { // Rebuild the queued prompt from the mutated media context so the // deferred path matches the primary path's prompt shape. From 5e0330db6ceb28128bbc17785a1ee5adeeccbcbb Mon Sep 17 00:00:00 2001 From: Joey Krug Date: Sat, 14 Mar 2026 17:53:40 -0400 Subject: [PATCH 05/17] Auto-reply: preserve deferred media understanding output --- src/auto-reply/reply/followup-runner.test.ts | 306 +++++++++++++++++- src/auto-reply/reply/followup-runner.ts | 38 ++- .../reply/get-reply-run.media-only.test.ts | 74 +++++ src/auto-reply/reply/get-reply-run.ts | 14 +- 4 files changed, 424 insertions(+), 8 deletions(-) diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts index a0c5306380d..d2812ff61b6 100644 --- a/src/auto-reply/reply/followup-runner.test.ts +++ b/src/auto-reply/reply/followup-runner.test.ts @@ -818,6 +818,69 @@ describe("createFollowupRunner media understanding", () => { expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "Got it!" })); }); + it("applies media understanding for URL-only attachments", async () => { + const transcriptText = "URL-only transcript"; + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = transcriptText; + params.ctx.Body = `[Audio]\nUser text:\nsome text\nTranscript:\n${transcriptText}`; + return { + outputs: [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "Got it!" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: "[media attached: https://cdn.example.com/voice.ogg (audio/ogg)]\nsome text", + mediaContext: { + Body: "some text", + MediaUrl: "https://cdn.example.com/voice.ogg", + MediaUrls: ["https://cdn.example.com/voice.ogg"], + MediaType: "audio/ogg", + MediaTypes: ["audio/ogg"], + }, + }), + ); + + expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1); + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain(transcriptText); + }); + it("strips the full media line when attachment paths or URLs contain brackets", async () => { const transcriptText = "Bracket-safe transcript"; applyMediaUnderstandingMock.mockImplementationOnce( @@ -1329,6 +1392,98 @@ describe("createFollowupRunner media understanding", () => { expect(agentCall?.prompt).not.toContain("/think high summarize this"); }); + it("preserves directive-like tokens inside extracted media content", async () => { + const fileBlock = + '\n/model claude-opus should stay\n/queue followup should stay\n'; + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.Body = `/think high summarize this\n\n${fileBlock}`; + return { + outputs: [], + decisions: [], + appliedImage: false, + appliedAudio: false, + appliedVideo: false, + appliedFile: true, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: `[media attached: /tmp/notes.txt]\n${MEDIA_REPLY_HINT}\nsummarize this`, + mediaContext: { + Body: "/think high summarize this", + MediaPaths: ["/tmp/notes.txt"], + MediaTypes: ["text/plain"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("summarize this"); + expect(agentCall?.prompt).not.toContain("/think high summarize this"); + expect(agentCall?.prompt).toContain("/model claude-opus should stay"); + expect(agentCall?.prompt).toContain("/queue followup should stay"); + }); + + it("rebuilds the prompt when image understanding mutates the body without outputs", async () => { + const description = "[Image]\nDescription:\na mountain at sunset"; + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.Body = description; + return { + outputs: [], + decisions: [], + appliedImage: true, + appliedAudio: false, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: "[media attached: /tmp/photo.jpg (image/jpeg)]\nsome text", + mediaContext: { + Body: "some text", + MediaPaths: ["/tmp/photo.jpg"], + MediaTypes: ["image/jpeg"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("a mountain at sunset"); + }); + it("does not false-positive on user text containing literal ' { const fileBlock = '\ncol1,col2\n1,2\n'; applyMediaUnderstandingMock.mockImplementationOnce( @@ -1360,8 +1515,7 @@ describe("createFollowupRunner media understanding", () => { // file extraction results from being embedded in the prompt. await runner( createQueuedRun({ - prompt: - "[media attached: /tmp/data.csv (text/csv)]\ncheck my { expect(agentCall?.prompt).toContain("check my { + const fileBlock = + '\nRun `/think high` literally in the shell example.\n'; + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.Body = `summarize this\n\n${fileBlock}`; + return { + outputs: [], + decisions: [], + appliedImage: false, + appliedAudio: false, + appliedVideo: false, + appliedFile: true, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: `[media attached: /tmp/notes.txt]\n${MEDIA_REPLY_HINT}\nsummarize this`, + mediaContext: { + Body: "/think high summarize this", + CommandBody: "summarize this", + RawBody: "/think high summarize this", + MediaPaths: ["/tmp/notes.txt"], + MediaTypes: ["text/plain"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("summarize this"); + expect(agentCall?.prompt).toContain("Run `/think high` literally in the shell example."); + }); + + it("rebuilds the prompt when image understanding mutates the body without outputs", async () => { + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.Body = "some text\n\n[Image summary]\nA whiteboard with action items."; + return { + outputs: [], + decisions: [], + appliedImage: true, + appliedAudio: false, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: "[media attached: /tmp/board.jpg (image/jpeg)]\nsome text", + mediaContext: { + Body: "some text", + CommandBody: "some text", + RawBody: "some text", + MediaPaths: ["/tmp/board.jpg"], + MediaTypes: ["image/jpeg"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("[Image summary]"); + expect(agentCall?.prompt).toContain("A whiteboard with action items."); + }); + + it("applies media understanding for URL-only deferred attachments", async () => { + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.Body = "[Audio]\nTranscript:\nremote transcript"; + params.ctx.Transcript = "remote transcript"; + return { + outputs: [ + { + kind: "audio.transcription", + text: "remote transcript", + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: "[User sent media without caption]", + mediaContext: { + Body: "", + MediaUrl: "https://cdn.example.com/audio.ogg", + MediaUrls: ["https://cdn.example.com/audio.ogg"], + MediaType: "audio/ogg", + MediaTypes: ["audio/ogg"], + }, + }), + ); + + expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1); + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("remote transcript"); + }); + it("uses resolved body (CommandBody) as originalBody for accurate prompt replacement", async () => { const fileBlock = '\nreport content\n'; applyMediaUnderstandingMock.mockImplementationOnce( diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts index 39728c71def..64b8a935b2a 100644 --- a/src/auto-reply/reply/followup-runner.ts +++ b/src/auto-reply/reply/followup-runner.ts @@ -43,8 +43,7 @@ import type { TypingController } from "./typing.js"; const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]"; const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool"; -const LEADING_MEDIA_ATTACHED_LINE_RE = - /^(?:\[media attached: \d+ files\]|\[media attached(?: \d+\/\d+)?: [^\r\n]*\])$/; +const LEADING_MEDIA_ATTACHED_LINE_RE = /^\[media attached(?: \d+\/\d+)?: [^\r\n]*\]$/; const FILE_BLOCK_RE = / 0), + queued.mediaContext.MediaPaths.length > 0) || + (Array.isArray(queued.mediaContext.MediaUrls) && + queued.mediaContext.MediaUrls.length > 0), ); if (hasMedia) { try { @@ -281,6 +308,9 @@ export function createFollowupRunner(params: { }); const shouldRebuildPrompt = muResult.outputs.length > 0 || + muResult.appliedAudio || + muResult.appliedImage || + muResult.appliedVideo || (muResult.appliedFile && !bodyAlreadyHasFileBlock); if (shouldRebuildPrompt) { // Rebuild the queued prompt from the mutated media context so the diff --git a/src/auto-reply/reply/get-reply-run.media-only.test.ts b/src/auto-reply/reply/get-reply-run.media-only.test.ts index 829b3937009..f519da10082 100644 --- a/src/auto-reply/reply/get-reply-run.media-only.test.ts +++ b/src/auto-reply/reply/get-reply-run.media-only.test.ts @@ -172,6 +172,45 @@ describe("runPreparedReply media-only handling", () => { expect(call?.followupRun.prompt).toContain("[User sent media without caption]"); }); + it("snapshots URL-only attachments into followup mediaContext", async () => { + await runPreparedReply( + baseParams({ + ctx: { + Body: "check this attachment", + RawBody: "check this attachment", + CommandBody: "check this attachment", + ThreadHistoryBody: "Earlier message in this thread", + OriginatingChannel: "slack", + OriginatingTo: "C123", + ChatType: "group", + MediaUrl: "https://cdn.example.com/input.png", + MediaUrls: ["https://cdn.example.com/input.png"], + MediaType: "image/png", + MediaTypes: ["image/png"], + }, + sessionCtx: { + Body: "check this attachment", + BodyStripped: "check this attachment", + ThreadHistoryBody: "Earlier message in this thread", + Provider: "slack", + ChatType: "group", + OriginatingChannel: "slack", + OriginatingTo: "C123", + }, + }), + ); + + const call = vi.mocked(runReplyAgent).mock.calls[0]?.[0]; + expect(call?.followupRun.mediaContext).toEqual( + expect.objectContaining({ + MediaUrl: "https://cdn.example.com/input.png", + MediaUrls: ["https://cdn.example.com/input.png"], + MediaType: "image/png", + MediaTypes: ["image/png"], + }), + ); + }); + it("keeps thread history context on follow-up turns", async () => { const result = await runPreparedReply( baseParams({ @@ -186,6 +225,41 @@ describe("runPreparedReply media-only handling", () => { expect(call?.followupRun.prompt).toContain("Earlier message in this thread"); }); + it("snapshots mediaContext for URL-only deferred attachments", async () => { + await runPreparedReply( + baseParams({ + ctx: { + Body: "", + RawBody: "", + CommandBody: "", + MediaUrl: "https://cdn.example.com/audio.ogg", + MediaUrls: ["https://cdn.example.com/audio.ogg"], + MediaType: "audio/ogg", + MediaTypes: ["audio/ogg"], + ThreadHistoryBody: "Earlier message in this thread", + OriginatingChannel: "slack", + OriginatingTo: "C123", + ChatType: "group", + }, + sessionCtx: { + Body: "", + BodyStripped: "", + ThreadHistoryBody: "Earlier message in this thread", + Provider: "slack", + ChatType: "group", + OriginatingChannel: "slack", + OriginatingTo: "C123", + }, + }), + ); + + const call = vi.mocked(runReplyAgent).mock.calls[0]?.[0]; + expect(call?.followupRun.mediaContext?.MediaUrl).toBe("https://cdn.example.com/audio.ogg"); + expect(call?.followupRun.mediaContext?.MediaUrls).toEqual([ + "https://cdn.example.com/audio.ogg", + ]); + }); + it("returns the empty-body reply when there is no text and no media", async () => { const result = await runPreparedReply( baseParams({ diff --git a/src/auto-reply/reply/get-reply-run.ts b/src/auto-reply/reply/get-reply-run.ts index fe87e3919d0..b4b693b8119 100644 --- a/src/auto-reply/reply/get-reply-run.ts +++ b/src/auto-reply/reply/get-reply-run.ts @@ -310,7 +310,14 @@ export async function runPreparedReply( : [inboundUserContext, baseBodyFinal].filter(Boolean).join("\n\n"); const baseBodyTrimmed = baseBodyForPrompt.trim(); const hasMediaAttachment = Boolean( - sessionCtx.MediaPath || (sessionCtx.MediaPaths && sessionCtx.MediaPaths.length > 0), + sessionCtx.MediaPath || + sessionCtx.MediaUrl || + (sessionCtx.MediaPaths && sessionCtx.MediaPaths.length > 0) || + (sessionCtx.MediaUrls && sessionCtx.MediaUrls.length > 0) || + ctx.MediaPath?.trim() || + ctx.MediaUrl?.trim() || + (Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0) || + (Array.isArray(ctx.MediaUrls) && ctx.MediaUrls.length > 0), ); if (!baseBodyTrimmed && !hasMediaAttachment) { await typing.onReplyStart(); @@ -476,7 +483,10 @@ export async function runPreparedReply( // followup runner. When MediaUnderstanding is already populated the runner // knows transcription already succeeded and skips re-application. const hasMediaAttachments = Boolean( - ctx.MediaPath?.trim() || (Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0), + ctx.MediaPath?.trim() || + ctx.MediaUrl?.trim() || + (Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0) || + (Array.isArray(ctx.MediaUrls) && ctx.MediaUrls.length > 0), ); const mediaContext = hasMediaAttachments ? { From 7973b2cc5b2d12593082e6ada14863b9eac70b23 Mon Sep 17 00:00:00 2001 From: Joey Krug Date: Sat, 14 Mar 2026 21:54:35 -0400 Subject: [PATCH 06/17] Reply: preserve deferred queued media context --- src/auto-reply/reply/followup-media.ts | 241 +++++++++++++++++++ src/auto-reply/reply/followup-runner.test.ts | 225 ++++++++++++++++- src/auto-reply/reply/followup-runner.ts | 180 +------------- src/auto-reply/reply/get-reply-run.ts | 2 + src/auto-reply/reply/queue/drain.ts | 71 +++++- src/auto-reply/reply/queue/enqueue.ts | 37 ++- src/auto-reply/reply/queue/state.ts | 3 + src/auto-reply/reply/queue/types.ts | 3 + 8 files changed, 572 insertions(+), 190 deletions(-) create mode 100644 src/auto-reply/reply/followup-media.ts diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts new file mode 100644 index 00000000000..5a014e63f9b --- /dev/null +++ b/src/auto-reply/reply/followup-media.ts @@ -0,0 +1,241 @@ +import { logVerbose } from "../../globals.js"; +import { applyMediaUnderstanding } from "../../media-understanding/apply.js"; +import { + normalizeAttachments, + resolveAttachmentKind, +} from "../../media-understanding/attachments.js"; +import { buildInboundMediaNote } from "../media-note.js"; +import type { MsgContext } from "../templating.js"; +import { parseInlineDirectives } from "./directive-handling.js"; +import type { FollowupMediaContext, FollowupRun } from "./queue/types.js"; + +const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]"; +const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool"; +const LEADING_MEDIA_ATTACHED_LINE_RE = /^\[media attached(?: \d+\/\d+)?: [^\r\n]*\]$/; +const FILE_BLOCK_RE = / Boolean(value) && list.indexOf(value) === index, + ); + + let rebuilt = stripped; + for (const target of replacementTargets) { + const replaced = replaceLastOccurrence(rebuilt, target, updatedBody); + if (replaced !== undefined) { + rebuilt = replaced; + return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim(); + } + } + + rebuilt = [rebuilt, updatedBody].filter(Boolean).join("\n\n"); + return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim(); +} + +function hasMediaAttachments(mediaContext: FollowupMediaContext): boolean { + return Boolean( + mediaContext.MediaPath?.trim() || + mediaContext.MediaUrl?.trim() || + (Array.isArray(mediaContext.MediaPaths) && mediaContext.MediaPaths.length > 0) || + (Array.isArray(mediaContext.MediaUrls) && mediaContext.MediaUrls.length > 0), + ); +} + +function hasOnlyFileLikeAttachments(mediaContext: FollowupMediaContext): boolean { + const attachments = normalizeAttachments(mediaContext as MsgContext); + return ( + attachments.length > 0 && + attachments.every((attachment) => { + const kind = resolveAttachmentKind(attachment); + return kind !== "audio" && kind !== "image" && kind !== "video"; + }) + ); +} + +function snapshotUpdatedMediaContext(params: { + original: FollowupMediaContext; + mediaCtx: MsgContext; + updatedBody?: string; +}): FollowupMediaContext { + return { + ...params.original, + Body: params.updatedBody ?? params.original.Body, + Transcript: + typeof params.mediaCtx.Transcript === "string" + ? params.mediaCtx.Transcript + : params.original.Transcript, + MediaUnderstanding: Array.isArray(params.mediaCtx.MediaUnderstanding) + ? [...params.mediaCtx.MediaUnderstanding] + : params.original.MediaUnderstanding, + MediaUnderstandingDecisions: Array.isArray(params.mediaCtx.MediaUnderstandingDecisions) + ? [...params.mediaCtx.MediaUnderstandingDecisions] + : params.original.MediaUnderstandingDecisions, + DeferredMediaApplied: true, + }; +} + +export async function applyDeferredMediaUnderstandingToQueuedRun( + queued: FollowupRun, + params: { logLabel?: string } = {}, +): Promise { + const mediaContext = queued.mediaContext; + if (!mediaContext || mediaContext.DeferredMediaApplied) { + return; + } + if (mediaContext.MediaUnderstanding?.length) { + mediaContext.DeferredMediaApplied = true; + return; + } + if (!hasMediaAttachments(mediaContext)) { + mediaContext.DeferredMediaApplied = true; + return; + } + + const resolvedOriginalBody = + mediaContext.CommandBody ?? mediaContext.RawBody ?? mediaContext.Body; + const bodyAlreadyHasFileBlock = + FILE_BLOCK_RE.test(resolvedOriginalBody ?? "") || FILE_BLOCK_RE.test(mediaContext.Body ?? ""); + + if (bodyAlreadyHasFileBlock && hasOnlyFileLikeAttachments(mediaContext)) { + mediaContext.DeferredMediaApplied = true; + return; + } + + try { + const mediaCtx = { + ...mediaContext, + Body: resolvedOriginalBody, + Provider: + mediaContext.Provider ?? + queued.run.messageProvider ?? + (typeof mediaContext.OriginatingChannel === "string" + ? mediaContext.OriginatingChannel + : undefined), + Surface: mediaContext.Surface, + } as MsgContext; + + const muResult = await applyMediaUnderstanding({ + ctx: mediaCtx, + cfg: queued.run.config, + agentDir: queued.run.agentDir, + activeModel: { + provider: queued.run.provider, + model: queued.run.model, + }, + }); + + const shouldRebuildPrompt = + muResult.outputs.length > 0 || + muResult.appliedAudio || + muResult.appliedImage || + muResult.appliedVideo || + (muResult.appliedFile && !bodyAlreadyHasFileBlock); + + if (shouldRebuildPrompt) { + const newMediaNote = buildInboundMediaNote(mediaCtx); + queued.prompt = rebuildQueuedPromptWithMediaUnderstanding({ + prompt: queued.prompt, + originalBody: resolvedOriginalBody, + updatedBody: mediaCtx.Body, + mediaNote: newMediaNote, + }); + logVerbose( + `${params.logLabel ?? "followup"}: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage}, video=${muResult.appliedVideo}, file=${muResult.appliedFile})`, + ); + } + + queued.mediaContext = snapshotUpdatedMediaContext({ + original: mediaContext, + mediaCtx, + updatedBody: shouldRebuildPrompt ? mediaCtx.Body : undefined, + }); + } catch (err) { + logVerbose( + `${params.logLabel ?? "followup"}: media understanding failed, proceeding with raw content: ${err instanceof Error ? err.message : String(err)}`, + ); + } +} diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts index d2812ff61b6..c3e75e6b856 100644 --- a/src/auto-reply/reply/followup-runner.test.ts +++ b/src/auto-reply/reply/followup-runner.test.ts @@ -2,7 +2,8 @@ import fs from "node:fs/promises"; import { tmpdir } from "node:os"; import path from "node:path"; import { beforeEach, describe, expect, it, vi } from "vitest"; -import { loadSessionStore, type SessionEntry, saveSessionStore } from "../../config/sessions.js"; +import { loadSessionStore, saveSessionStore, type SessionEntry } from "../../config/sessions.js"; +import { buildCollectPrompt } from "../../utils/queue-helpers.js"; import type { FollowupRun } from "./queue.js"; import * as sessionRunAccounting from "./session-run-accounting.js"; import { createMockFollowupRun, createMockTypingController } from "./test-helpers.js"; @@ -35,6 +36,10 @@ vi.mock("./route-reply.js", async (importOriginal) => { }); import { createFollowupRunner } from "./followup-runner.js"; +import { + applyDeferredMediaToQueuedRuns, + buildMediaAwareQueueSummaryPrompt, +} from "./queue/drain.js"; const ROUTABLE_TEST_CHANNELS = new Set([ "telegram", @@ -818,6 +823,69 @@ describe("createFollowupRunner media understanding", () => { expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "Got it!" })); }); + it("propagates the queued message provider into deferred media context", async () => { + const transcriptText = "Provider-aware transcript"; + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + expect(params.ctx.Provider).toBe("telegram"); + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = transcriptText; + params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}`; + return { + outputs: [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "done" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: "[User sent media without caption]", + run: { messageProvider: "telegram" }, + mediaContext: { + Body: "", + MediaPaths: ["/tmp/voice.ogg"], + MediaTypes: ["audio/ogg"], + }, + }), + ); + + expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1); + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain(transcriptText); + }); + it("applies media understanding for URL-only attachments", async () => { const transcriptText = "URL-only transcript"; applyMediaUnderstandingMock.mockImplementationOnce( @@ -1739,4 +1807,159 @@ describe("createFollowupRunner media understanding", () => { const matches = agentCall?.prompt?.match(/summarize this/g); expect(matches?.length).toBe(1); }); + + it("does not re-apply file extraction when the stored media body already has a file block", async () => { + const fileBlock = '\nreport content\n'; + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nsummarize this\n\n${fileBlock}`, + mediaContext: { + Body: `summarize this\n\n${fileBlock}`, + CommandBody: "summarize this", + RawBody: "summarize this", + MediaPaths: ["/tmp/report.pdf"], + MediaTypes: ["application/pdf"], + }, + }), + ); + + expect(applyMediaUnderstandingMock).not.toHaveBeenCalled(); + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt?.match(/ { + it("preprocesses collect batches before synthesizing the followup prompt", async () => { + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: "collect transcript", + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = "collect transcript"; + params.ctx.Body = "[Audio]\nTranscript:\ncollect transcript"; + return { + outputs: [ + { + kind: "audio.transcription", + text: "collect transcript", + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + const items: FollowupRun[] = [ + createQueuedRun({ + prompt: "[media attached: /tmp/voice.ogg (audio/ogg)]\nsome text", + summaryLine: "some text", + originatingChannel: "telegram", + originatingTo: "chat:1", + run: { messageProvider: "telegram" }, + mediaContext: { + Body: "some text", + MediaPaths: ["/tmp/voice.ogg"], + MediaTypes: ["audio/ogg"], + }, + }), + createQueuedRun({ + prompt: "second text", + summaryLine: "second text", + originatingChannel: "telegram", + originatingTo: "chat:1", + run: { messageProvider: "telegram" }, + }), + ]; + + await applyDeferredMediaToQueuedRuns(items); + + const prompt = buildCollectPrompt({ + title: "[Queued messages while agent was busy]", + items, + renderItem: (item, idx) => `---\nQueued #${idx + 1}\n${item.prompt}`.trim(), + }); + + expect(prompt).toContain("collect transcript"); + expect(prompt).toContain("Queued #2\nsecond text"); + expect(prompt).not.toContain("[media attached: /tmp/voice.ogg"); + }); + + it("preprocesses dropped media items before building overflow summaries", async () => { + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: "overflow transcript", + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = "overflow transcript"; + params.ctx.Body = "[Audio]\nTranscript:\noverflow transcript"; + return { + outputs: [ + { + kind: "audio.transcription", + text: "overflow transcript", + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + const summaryPrompt = await buildMediaAwareQueueSummaryPrompt({ + dropPolicy: "summarize", + droppedCount: 1, + summaryLines: ["[media attached: /tmp/voice.ogg (audio/ogg)]"], + summaryItems: [ + createQueuedRun({ + prompt: "[media attached: /tmp/voice.ogg (audio/ogg)]", + summaryLine: "", + run: { messageProvider: "telegram" }, + mediaContext: { + Body: "", + MediaPaths: ["/tmp/voice.ogg"], + MediaTypes: ["audio/ogg"], + }, + }), + ], + noun: "message", + }); + + expect(summaryPrompt).toContain("[Queue overflow] Dropped 1 message due to cap."); + expect(summaryPrompt).toContain("overflow transcript"); + expect(summaryPrompt).not.toContain("[media attached: /tmp/voice.ogg"); + }); }); diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts index 64b8a935b2a..9a80d4f78ad 100644 --- a/src/auto-reply/reply/followup-runner.ts +++ b/src/auto-reply/reply/followup-runner.ts @@ -13,16 +13,14 @@ import type { SessionEntry } from "../../config/sessions.js"; import type { TypingMode } from "../../config/types.js"; import { logVerbose } from "../../globals.js"; import { registerAgentRunContext } from "../../infra/agent-events.js"; -import { applyMediaUnderstanding } from "../../media-understanding/apply.js"; import { defaultRuntime } from "../../runtime.js"; import { isInternalMessageChannel } from "../../utils/message-channel.js"; import { stripHeartbeatToken } from "../heartbeat.js"; -import { buildInboundMediaNote } from "../media-note.js"; -import type { MsgContext, OriginatingChannelType } from "../templating.js"; +import type { OriginatingChannelType } from "../templating.js"; import { isSilentReplyText, SILENT_REPLY_TOKEN } from "../tokens.js"; import type { GetReplyOptions, ReplyPayload } from "../types.js"; import { resolveRunAuthProfile } from "./agent-runner-utils.js"; -import { parseInlineDirectives } from "./directive-handling.js"; +import { applyDeferredMediaUnderstandingToQueuedRun } from "./followup-media.js"; import { resolveOriginAccountId, resolveOriginMessageProvider, @@ -41,113 +39,6 @@ import { incrementRunCompactionCount, persistRunSessionUsage } from "./session-r import { createTypingSignaler } from "./typing-mode.js"; import type { TypingController } from "./typing.js"; -const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]"; -const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool"; -const LEADING_MEDIA_ATTACHED_LINE_RE = /^\[media attached(?: \d+\/\d+)?: [^\r\n]*\]$/; -const FILE_BLOCK_RE = / Boolean(value) && list.indexOf(value) === index, - ); - - let rebuilt = stripped; - for (const target of replacementTargets) { - const replaced = replaceLastOccurrence(rebuilt, target, updatedBody); - if (replaced !== undefined) { - rebuilt = replaced; - return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim(); - } - } - - rebuilt = [rebuilt, updatedBody].filter(Boolean).join("\n\n"); - return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim(); -} - export function createFollowupRunner(params: { opts?: GetReplyOptions; typing: TypingController; @@ -267,72 +158,7 @@ export function createFollowupRunner(params: { let bootstrapPromptWarningSignaturesSeen = resolveBootstrapWarningSignaturesSeen( activeSessionEntry?.systemPromptReport, ); - - // Apply media understanding for followup-queued messages when it was - // not applied (or failed) in the primary path. This ensures voice - // notes that arrived while the agent was mid-turn still get transcribed. - if (queued.mediaContext && !queued.mediaContext.MediaUnderstanding?.length) { - const hasMedia = Boolean( - queued.mediaContext.MediaPath?.trim() || - queued.mediaContext.MediaUrl?.trim() || - (Array.isArray(queued.mediaContext.MediaPaths) && - queued.mediaContext.MediaPaths.length > 0) || - (Array.isArray(queued.mediaContext.MediaUrls) && - queued.mediaContext.MediaUrls.length > 0), - ); - if (hasMedia) { - try { - const resolvedOriginalBody = - queued.mediaContext.CommandBody ?? - queued.mediaContext.RawBody ?? - queued.mediaContext.Body; - const mediaCtx = { - ...queued.mediaContext, - Body: resolvedOriginalBody, - } as MsgContext; - const originalBody = resolvedOriginalBody; - // Capture whether the resolved body already contains a file block - // BEFORE applyMediaUnderstanding mutates it — this detects prior - // extraction so we avoid double-inserting. Checking the body - // (not the full queued.prompt) avoids false positives from user - // messages that happen to contain literal " 0 || - muResult.appliedAudio || - muResult.appliedImage || - muResult.appliedVideo || - (muResult.appliedFile && !bodyAlreadyHasFileBlock); - if (shouldRebuildPrompt) { - // Rebuild the queued prompt from the mutated media context so the - // deferred path matches the primary path's prompt shape. - const newMediaNote = buildInboundMediaNote(mediaCtx); - queued.prompt = rebuildQueuedPromptWithMediaUnderstanding({ - prompt: queued.prompt, - originalBody, - updatedBody: mediaCtx.Body, - mediaNote: newMediaNote, - }); - logVerbose( - `followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage}, video=${muResult.appliedVideo}, file=${muResult.appliedFile})`, - ); - } - } catch (err) { - logVerbose( - `followup: media understanding failed, proceeding with raw content: ${err instanceof Error ? err.message : String(err)}`, - ); - } - } - } + await applyDeferredMediaUnderstandingToQueuedRun(queued, { logLabel: "followup" }); try { const fallbackResult = await runWithModelFallback({ diff --git a/src/auto-reply/reply/get-reply-run.ts b/src/auto-reply/reply/get-reply-run.ts index b4b693b8119..252cffbb364 100644 --- a/src/auto-reply/reply/get-reply-run.ts +++ b/src/auto-reply/reply/get-reply-run.ts @@ -493,6 +493,8 @@ export async function runPreparedReply( Body: ctx.Body, CommandBody: ctx.CommandBody, RawBody: ctx.RawBody, + Provider: ctx.Provider ?? sessionCtx.Provider, + Surface: ctx.Surface ?? sessionCtx.Surface, MediaPath: ctx.MediaPath, MediaUrl: ctx.MediaUrl, MediaType: ctx.MediaType, diff --git a/src/auto-reply/reply/queue/drain.ts b/src/auto-reply/reply/queue/drain.ts index 1e2fb33e4e0..68c660fe2b8 100644 --- a/src/auto-reply/reply/queue/drain.ts +++ b/src/auto-reply/reply/queue/drain.ts @@ -3,15 +3,17 @@ import { resolveGlobalMap } from "../../../shared/global-singleton.js"; import { buildCollectPrompt, beginQueueDrain, + buildQueueSummaryLine, + buildQueueSummaryPrompt, clearQueueSummaryState, drainCollectQueueStep, drainNextQueueItem, hasCrossChannelItems, - previewQueueSummaryPrompt, waitForQueueDebounce, } from "../../../utils/queue-helpers.js"; +import { applyDeferredMediaUnderstandingToQueuedRun } from "../followup-media.js"; import { isRoutableChannel } from "../route-reply.js"; -import { FOLLOWUP_QUEUES } from "./state.js"; +import { FOLLOWUP_QUEUES, type FollowupQueueState } from "./state.js"; import type { FollowupRun } from "./types.js"; // Persists the most recent runFollowup callback per queue key so that @@ -68,6 +70,50 @@ function resolveCrossChannelKey(item: FollowupRun): { cross?: true; key?: string }; } +function clearFollowupQueueSummaryState(queue: FollowupQueueState): void { + clearQueueSummaryState(queue); + queue.summaryItems = []; +} + +export async function applyDeferredMediaToQueuedRuns(items: FollowupRun[]): Promise { + for (const item of items) { + await applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" }); + } +} + +async function resolveSummaryLines(items: FollowupRun[]): Promise { + const summaryLines: string[] = []; + for (const item of items) { + await applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" }); + summaryLines.push(buildQueueSummaryLine(item.summaryLine?.trim() || item.prompt.trim())); + } + return summaryLines; +} + +export async function buildMediaAwareQueueSummaryPrompt(params: { + dropPolicy: FollowupQueueState["dropPolicy"]; + droppedCount: number; + summaryLines: string[]; + summaryItems: FollowupRun[]; + noun: string; +}): Promise { + if (params.dropPolicy !== "summarize" || params.droppedCount <= 0) { + return undefined; + } + const summaryLines = + params.summaryItems.length > 0 + ? await resolveSummaryLines(params.summaryItems) + : params.summaryLines; + return buildQueueSummaryPrompt({ + state: { + dropPolicy: params.dropPolicy, + droppedCount: params.droppedCount, + summaryLines: [...summaryLines], + }, + noun: params.noun, + }); +} + export function scheduleFollowupDrain( key: string, runFollowup: (run: FollowupRun) => Promise, @@ -107,7 +153,14 @@ export function scheduleFollowupDrain( } const items = queue.items.slice(); - const summary = previewQueueSummaryPrompt({ state: queue, noun: "message" }); + await applyDeferredMediaToQueuedRuns(items); + const summary = await buildMediaAwareQueueSummaryPrompt({ + dropPolicy: queue.dropPolicy, + droppedCount: queue.droppedCount, + summaryLines: queue.summaryLines, + summaryItems: queue.summaryItems, + noun: "message", + }); const run = items.at(-1)?.run ?? queue.lastRun; if (!run) { break; @@ -129,12 +182,18 @@ export function scheduleFollowupDrain( }); queue.items.splice(0, items.length); if (summary) { - clearQueueSummaryState(queue); + clearFollowupQueueSummaryState(queue); } continue; } - const summaryPrompt = previewQueueSummaryPrompt({ state: queue, noun: "message" }); + const summaryPrompt = await buildMediaAwareQueueSummaryPrompt({ + dropPolicy: queue.dropPolicy, + droppedCount: queue.droppedCount, + summaryLines: queue.summaryLines, + summaryItems: queue.summaryItems, + noun: "message", + }); if (summaryPrompt) { const run = queue.lastRun; if (!run) { @@ -155,7 +214,7 @@ export function scheduleFollowupDrain( ) { break; } - clearQueueSummaryState(queue); + clearFollowupQueueSummaryState(queue); continue; } diff --git a/src/auto-reply/reply/queue/enqueue.ts b/src/auto-reply/reply/queue/enqueue.ts index 11da0db98fc..e58cc5ffac5 100644 --- a/src/auto-reply/reply/queue/enqueue.ts +++ b/src/auto-reply/reply/queue/enqueue.ts @@ -1,8 +1,8 @@ import { createDedupeCache } from "../../../infra/dedupe.js"; import { resolveGlobalSingleton } from "../../../shared/global-singleton.js"; -import { applyQueueDropPolicy, shouldSkipQueueItem } from "../../../utils/queue-helpers.js"; +import { buildQueueSummaryLine, shouldSkipQueueItem } from "../../../utils/queue-helpers.js"; import { kickFollowupDrainIfIdle } from "./drain.js"; -import { getExistingFollowupQueue, getFollowupQueue } from "./state.js"; +import { getExistingFollowupQueue, getFollowupQueue, type FollowupQueueState } from "./state.js"; import type { FollowupRun, QueueDedupeMode, QueueSettings } from "./types.js"; /** @@ -57,6 +57,34 @@ function isRunAlreadyQueued( return items.some((item) => item.prompt === run.prompt && hasSameRouting(item)); } +function applyFollowupQueueDropPolicy(queue: FollowupQueueState): boolean { + const cap = queue.cap; + if (cap <= 0 || queue.items.length < cap) { + return true; + } + if (queue.dropPolicy === "new") { + return false; + } + + const dropCount = queue.items.length - cap + 1; + const dropped = queue.items.splice(0, dropCount); + if (queue.dropPolicy === "summarize") { + for (const item of dropped) { + queue.droppedCount += 1; + queue.summaryItems.push(item); + queue.summaryLines.push( + buildQueueSummaryLine(item.summaryLine?.trim() || item.prompt.trim()), + ); + } + const limit = Math.max(0, cap); + while (queue.summaryLines.length > limit) { + queue.summaryLines.shift(); + queue.summaryItems.shift(); + } + } + return true; +} + export function enqueueFollowupRun( key: string, run: FollowupRun, @@ -83,10 +111,7 @@ export function enqueueFollowupRun( queue.lastEnqueuedAt = Date.now(); queue.lastRun = run.run; - const shouldEnqueue = applyQueueDropPolicy({ - queue, - summarize: (item) => item.summaryLine?.trim() || item.prompt.trim(), - }); + const shouldEnqueue = applyFollowupQueueDropPolicy(queue); if (!shouldEnqueue) { return false; } diff --git a/src/auto-reply/reply/queue/state.ts b/src/auto-reply/reply/queue/state.ts index 44208e727dd..94021dd0c4c 100644 --- a/src/auto-reply/reply/queue/state.ts +++ b/src/auto-reply/reply/queue/state.ts @@ -4,6 +4,7 @@ import type { FollowupRun, QueueDropPolicy, QueueMode, QueueSettings } from "./t export type FollowupQueueState = { items: FollowupRun[]; + summaryItems: FollowupRun[]; draining: boolean; lastEnqueuedAt: number; mode: QueueMode; @@ -47,6 +48,7 @@ export function getFollowupQueue(key: string, settings: QueueSettings): Followup const created: FollowupQueueState = { items: [], + summaryItems: [], draining: false, lastEnqueuedAt: 0, mode: settings.mode, @@ -78,6 +80,7 @@ export function clearFollowupQueue(key: string): number { } const cleared = queue.items.length + queue.droppedCount; queue.items.length = 0; + queue.summaryItems.length = 0; queue.droppedCount = 0; queue.summaryLines = []; queue.lastRun = undefined; diff --git a/src/auto-reply/reply/queue/types.ts b/src/auto-reply/reply/queue/types.ts index 637ceb0a149..291059a28d7 100644 --- a/src/auto-reply/reply/queue/types.ts +++ b/src/auto-reply/reply/queue/types.ts @@ -32,6 +32,8 @@ export type FollowupMediaContext = { Body?: string; CommandBody?: string; RawBody?: string; + Provider?: string; + Surface?: string; MediaPath?: string; MediaUrl?: string; MediaType?: string; @@ -47,6 +49,7 @@ export type FollowupMediaContext = { OriginatingTo?: string; AccountId?: string; MessageThreadId?: string | number; + DeferredMediaApplied?: boolean; }; export type FollowupRun = { From f1e023c3de3412cc76622ee737c9215a7beb54cc Mon Sep 17 00:00:00 2001 From: Joey Krug Date: Sat, 14 Mar 2026 23:03:20 -0400 Subject: [PATCH 07/17] fix: set DeferredMediaApplied on error and strip old file blocks on rebuild --- src/auto-reply/reply/followup-media.ts | 15 +++ src/auto-reply/reply/followup-runner.test.ts | 111 +++++++++++++++++++ 2 files changed, 126 insertions(+) diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts index 5a014e63f9b..5340d0df99a 100644 --- a/src/auto-reply/reply/followup-media.ts +++ b/src/auto-reply/reply/followup-media.ts @@ -13,6 +13,11 @@ const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]"; const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool"; const LEADING_MEDIA_ATTACHED_LINE_RE = /^\[media attached(?: \d+\/\d+)?: [^\r\n]*\]$/; const FILE_BLOCK_RE = /]*>[\s\S]*?<\/file>\n?/gi; + +function stripExistingFileBlocks(text: string): string { + return text.replace(FILE_BLOCK_FULL_RE, "").trim(); +} function stripLeadingMediaAttachedLines(prompt: string): string { const lines = prompt.split("\n"); @@ -87,6 +92,15 @@ function rebuildQueuedPromptWithMediaUnderstanding(params: { stripped = stripLeadingMediaReplyHint(stripped); } + // Strip pre-existing file blocks from the prompt when the updated body + // contains new file blocks. Mixed messages (audio + PDF) can arrive with + // file extraction already applied in the primary path; without this strip + // the old block stays in the prompt while the updated body adds a new one, + // duplicating potentially large file payloads. + if (params.updatedBody && FILE_BLOCK_RE.test(params.updatedBody)) { + stripped = stripExistingFileBlocks(stripped); + } + const updatedBody = normalizeUpdatedBody({ originalBody: params.originalBody, updatedBody: params.updatedBody, @@ -234,6 +248,7 @@ export async function applyDeferredMediaUnderstandingToQueuedRun( updatedBody: shouldRebuildPrompt ? mediaCtx.Body : undefined, }); } catch (err) { + mediaContext.DeferredMediaApplied = true; logVerbose( `${params.logLabel ?? "followup"}: media understanding failed, proceeding with raw content: ${err instanceof Error ? err.message : String(err)}`, ); diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts index c3e75e6b856..cd1951aa748 100644 --- a/src/auto-reply/reply/followup-runner.test.ts +++ b/src/auto-reply/reply/followup-runner.test.ts @@ -1808,6 +1808,117 @@ describe("createFollowupRunner media understanding", () => { expect(matches?.length).toBe(1); }); + it("does not duplicate file blocks for mixed audio+file messages re-processed in followup", async () => { + const existingFileBlock = + '\nold extracted content\n'; + const newFileBlock = + '\nnew extracted content\n'; + const transcriptText = "Mixed message transcript"; + + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = transcriptText; + params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}\n\nanalyze this\n\n${newFileBlock}`; + return { + outputs: [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: true, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + // Simulate a mixed message where the primary path already extracted the + // PDF (file block is in the prompt) but audio transcription failed. + await runner( + createQueuedRun({ + prompt: `[media attached 1/2: /tmp/voice.ogg]\n[media attached 2/2: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nanalyze this\n\n${existingFileBlock}`, + mediaContext: { + Body: `analyze this\n\n${existingFileBlock}`, + CommandBody: "analyze this", + RawBody: "analyze this", + MediaPaths: ["/tmp/voice.ogg", "/tmp/report.pdf"], + MediaTypes: ["audio/ogg", "application/pdf"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + // Should contain the transcript + expect(agentCall?.prompt).toContain(transcriptText); + // Should have exactly one file block (the new one), not two + expect(agentCall?.prompt?.match(/ { + applyMediaUnderstandingMock.mockRejectedValueOnce( + new Error("transcription service unavailable"), + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "fallback reply" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + const queued = createQueuedRun({ + prompt: "[media attached: /tmp/voice.ogg (audio/ogg)]\nsome text", + mediaContext: { + Body: "some text", + MediaPaths: ["/tmp/voice.ogg"], + MediaTypes: ["audio/ogg"], + }, + }); + + await runner(queued); + + // DeferredMediaApplied should be set so re-runs don't retry + expect(queued.mediaContext?.DeferredMediaApplied).toBe(true); + + // The agent should still be called with the raw prompt + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("some text"); + }); + it("does not re-apply file extraction when the stored media body already has a file block", async () => { const fileBlock = '\nreport content\n'; runEmbeddedPiAgentMock.mockResolvedValueOnce({ From ad0a1bdc5e7bd3ecf47bd41d652bd522f978a6ac Mon Sep 17 00:00:00 2001 From: Joseph Krug Date: Sat, 14 Mar 2026 23:26:27 -0400 Subject: [PATCH 08/17] Update src/auto-reply/reply/followup-runner.test.ts Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- src/auto-reply/reply/followup-runner.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts index cd1951aa748..16f2b8eec90 100644 --- a/src/auto-reply/reply/followup-runner.test.ts +++ b/src/auto-reply/reply/followup-runner.test.ts @@ -1650,7 +1650,7 @@ describe("createFollowupRunner media understanding", () => { expect(agentCall?.prompt).toContain("Run `/think high` literally in the shell example."); }); - it("rebuilds the prompt when image understanding mutates the body without outputs", async () => { + it("rebuilds the prompt when image understanding mutates the body alongside existing text", async () => { applyMediaUnderstandingMock.mockImplementationOnce( async (params: { ctx: Record }) => { params.ctx.Body = "some text\n\n[Image summary]\nA whiteboard with action items."; From f890bc75dc4b752015aa39ca259043a0ac14cc63 Mon Sep 17 00:00:00 2001 From: Joey Krug Date: Sat, 14 Mar 2026 23:51:22 -0400 Subject: [PATCH 09/17] fix: address remaining review feedback on followup media - Scope file-block stripping to body region only, preserving file blocks in quoted/replied text and thread history above the body - Gate file-extraction skip on mutation evidence (Body differs from resolved original) instead of raw '\nquoted thread attachment\n'; + const existingFileBlock = + '\nold extracted content\n'; + const newFileBlock = + '\nnew extracted content\n'; + const transcriptText = "Transcript from deferred audio"; + + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = transcriptText; + params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}\n\nsummarize this\n\n${newFileBlock}`; + return { + outputs: [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: true, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: `[media attached 1/2: /tmp/voice.ogg]\n[media attached 2/2: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nQuoted thread above\n\n${quotedFileBlock}`, + mediaContext: { + Body: `summarize this\n\n${existingFileBlock}`, + CommandBody: "summarize this", + RawBody: "summarize this", + MediaPaths: ["/tmp/voice.ogg", "/tmp/report.pdf"], + MediaTypes: ["audio/ogg", "application/pdf"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("Quoted thread above"); + expect(agentCall?.prompt).toContain(quotedFileBlock); + expect(agentCall?.prompt).toContain(newFileBlock); + expect(agentCall?.prompt?.match(/ { + const existingFileBlock = + '\nsummary notes:\nsummarize this\n'; + const transcriptText = "Transcript from deferred audio"; + + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = transcriptText; + params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}\n\nsummarize this`; + return { + outputs: [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: `[media attached 1/2: /tmp/voice.ogg]\n[media attached 2/2: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nsummarize this\n\n${existingFileBlock}`, + mediaContext: { + Body: `summarize this\n\n${existingFileBlock}`, + CommandBody: "summarize this", + RawBody: "summarize this", + MediaPaths: ["/tmp/voice.ogg", "/tmp/report.pdf"], + MediaTypes: ["audio/ogg", "application/pdf"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + const transcriptBlock = `[Audio]\nTranscript:\n${transcriptText}\n\nsummarize this`; + expect(agentCall?.prompt).toContain(existingFileBlock); + expect(agentCall?.prompt).toContain(transcriptBlock); + expect(agentCall?.prompt?.indexOf(transcriptBlock)).toBeGreaterThan(-1); + expect(agentCall?.prompt?.indexOf(transcriptBlock)).toBeLessThan( + agentCall?.prompt?.indexOf(existingFileBlock) ?? -1, + ); + }); + it("sets DeferredMediaApplied when media understanding throws", async () => { applyMediaUnderstandingMock.mockRejectedValueOnce( new Error("transcription service unavailable"), From dc8cdfce5c7b23051291b6d5bd56989db0faa92a Mon Sep 17 00:00:00 2001 From: Joey Krug Date: Sun, 15 Mar 2026 12:12:12 -0400 Subject: [PATCH 11/17] fix: address Codex and Greptile review comments on #46454 Replace regex-based file extraction detection (FILE_BLOCK_RE.test) with a DeferredFileBlocksExtracted mutation marker on FollowupMediaContext. The old approach scanned user body text for ' Date: Sun, 15 Mar 2026 15:51:14 -0400 Subject: [PATCH 12/17] fix: use file-block-safe replacement in normalizeUpdatedBody and trailing fallback (#46454) --- src/auto-reply/reply/followup-media.test.ts | 107 ++++++++++++++++++++ src/auto-reply/reply/followup-media.ts | 44 +++++++- 2 files changed, 150 insertions(+), 1 deletion(-) create mode 100644 src/auto-reply/reply/followup-media.test.ts diff --git a/src/auto-reply/reply/followup-media.test.ts b/src/auto-reply/reply/followup-media.test.ts new file mode 100644 index 00000000000..d8ba453f801 --- /dev/null +++ b/src/auto-reply/reply/followup-media.test.ts @@ -0,0 +1,107 @@ +import { describe, expect, it } from "vitest"; +import { + _findLastOccurrenceBeforeFileBlocks as findLastOccurrenceBeforeFileBlocks, + _normalizeUpdatedBody as normalizeUpdatedBody, + _rebuildQueuedPromptWithMediaUnderstanding as rebuildQueuedPromptWithMediaUnderstanding, +} from "./followup-media.js"; + +const FILE_BLOCK = '\nPDF content\n'; + +describe("findLastOccurrenceBeforeFileBlocks", () => { + it("returns -1 for empty search", () => { + expect(findLastOccurrenceBeforeFileBlocks("hello", "")).toBe(-1); + }); + + it("finds last occurrence in body region before file blocks", () => { + const value = `hello world hello\n${FILE_BLOCK}`; + // "hello" appears at 0 and 12 — both before the file block + expect(findLastOccurrenceBeforeFileBlocks(value, "hello")).toBe(12); + }); + + it("does not match inside file block content", () => { + const value = `some text\n${FILE_BLOCK}\nPDF content`; + // "PDF content" appears in the file block and after it, but the body region + // (before { + // When the search string contains a block, it can't appear in the + // body-only region, so the fallback searches the full value. + const bodyWithFile = `caption\n${FILE_BLOCK}`; + const value = `previous\n${bodyWithFile}\nlater\n${bodyWithFile}`; + // Should find the *last* (trailing) occurrence + const expected = value.lastIndexOf(bodyWithFile); + expect(findLastOccurrenceBeforeFileBlocks(value, bodyWithFile)).toBe(expected); + expect(expected).toBeGreaterThan(value.indexOf(bodyWithFile)); + }); + + it("returns index when no file blocks exist in value", () => { + expect(findLastOccurrenceBeforeFileBlocks("abc abc", "abc")).toBe(4); + }); +}); + +describe("normalizeUpdatedBody", () => { + it("returns empty string when updatedBody is empty", () => { + expect(normalizeUpdatedBody({ originalBody: "foo", updatedBody: "" })).toBe(""); + }); + + it("returns updatedBody when originalBody is empty", () => { + expect(normalizeUpdatedBody({ updatedBody: "hello" })).toBe("hello"); + }); + + it("strips directives when updatedBody equals originalBody", () => { + const body = "/think high tell me a joke"; + const result = normalizeUpdatedBody({ originalBody: body, updatedBody: body }); + expect(result).toBe("tell me a joke"); + }); + + it("does not corrupt file block content during directive cleanup", () => { + const originalBody = "/think high tell me about this file"; + // updatedBody has the original body plus a file block appended by media processing + const updatedBody = `${originalBody}\n${FILE_BLOCK}`; + const result = normalizeUpdatedBody({ originalBody, updatedBody }); + // The directive should be stripped from the body portion, file block preserved + expect(result).toContain("tell me about this file"); + expect(result).toContain(FILE_BLOCK); + expect(result).not.toContain("/think"); + }); + + it("replaces in body region, not inside file blocks", () => { + const originalBody = "PDF content"; + const updatedBody = `PDF content\n\nPDF content\n`; + // The replacement should target the body region "PDF content" before the + // file block, not the "PDF content" inside the block. + const result = normalizeUpdatedBody({ originalBody, updatedBody }); + // With no directives to strip, original === cleaned, updatedBody !== originalBody + // because updatedBody has the file block appended. The replacement targets the + // body-region occurrence. + expect(result).toContain('"); + }); +}); + +describe("rebuildQueuedPromptWithMediaUnderstanding", () => { + it("replaces original body with updated body in prompt", () => { + const result = rebuildQueuedPromptWithMediaUnderstanding({ + prompt: "thread context\nhello world", + originalBody: "hello world", + updatedBody: 'hello world\ndata', + }); + expect(result).toContain('data'); + expect(result).toContain("thread context"); + }); + + it("preserves file blocks in thread history when body is replaced", () => { + const prompt = `history\nold\nhello world`; + const result = rebuildQueuedPromptWithMediaUnderstanding({ + prompt, + originalBody: "hello world", + updatedBody: "hello world transcribed", + }); + // The old file block from history should be preserved since updatedBody + // has no file blocks of its own. + expect(result).toContain('old'); + expect(result).toContain("hello world transcribed"); + }); +}); diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts index d9202554526..f40ec91f109 100644 --- a/src/auto-reply/reply/followup-media.ts +++ b/src/auto-reply/reply/followup-media.ts @@ -64,6 +64,40 @@ function findFirstOccurrenceBeforeFileBlocks(value: string, search: string): num return bodyRegion.indexOf(search); } +function findLastOccurrenceBeforeFileBlocks(value: string, search: string): number { + if (!search) { + return -1; + } + const fileBlockIndex = value.search(FILE_BLOCK_RE); + const bodyRegion = fileBlockIndex >= 0 ? value.slice(0, fileBlockIndex) : value; + const index = bodyRegion.lastIndexOf(search); + if (index >= 0) { + return index; + } + // Fallback: search string itself contains file blocks — it can't appear in the + // body-only region. Search the full value with lastIndexOf to pick the trailing + // (most recent) occurrence when thread/history has identical bodies. + if (FILE_BLOCK_RE.test(search)) { + return value.lastIndexOf(search); + } + return -1; +} + +function replaceLastOccurrenceBeforeFileBlocks( + value: string, + search: string, + replacement: string, +): string | undefined { + if (!search) { + return undefined; + } + const index = findLastOccurrenceBeforeFileBlocks(value, search); + if (index < 0) { + return undefined; + } + return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`; +} + function replaceFirstOccurrenceBeforeFileBlocks( value: string, search: string, @@ -101,7 +135,8 @@ function normalizeUpdatedBody(params: { originalBody?: string; updatedBody?: str return cleanedOriginalBody; } return ( - replaceLastOccurrence(updatedBody, originalBody, cleanedOriginalBody) ?? updatedBody + replaceLastOccurrenceBeforeFileBlocks(updatedBody, originalBody, cleanedOriginalBody) ?? + updatedBody ).trim(); } @@ -215,6 +250,13 @@ function snapshotUpdatedMediaContext(params: { }; } +// Exported for unit testing — these are pure string helpers with no side effects. +export { + findLastOccurrenceBeforeFileBlocks as _findLastOccurrenceBeforeFileBlocks, + normalizeUpdatedBody as _normalizeUpdatedBody, + rebuildQueuedPromptWithMediaUnderstanding as _rebuildQueuedPromptWithMediaUnderstanding, +}; + export async function applyDeferredMediaUnderstandingToQueuedRun( queued: FollowupRun, params: { logLabel?: string } = {}, From eb59b9c19d426296ef05f36a6b3c8ec10ca135a0 Mon Sep 17 00:00:00 2001 From: Joey Krug Date: Sun, 15 Mar 2026 17:34:56 -0400 Subject: [PATCH 13/17] fix: trailing body match and RawBody-missing extraction detection (#46454) --- src/auto-reply/reply/followup-media.ts | 171 ++++++++++++------- src/auto-reply/reply/followup-runner.test.ts | 158 +++++++++++++++++ src/auto-reply/reply/queue/drain.ts | 9 +- 3 files changed, 272 insertions(+), 66 deletions(-) diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts index f40ec91f109..425bdd601ea 100644 --- a/src/auto-reply/reply/followup-media.ts +++ b/src/auto-reply/reply/followup-media.ts @@ -1,3 +1,4 @@ +import path from "node:path"; import { logVerbose } from "../../globals.js"; import { applyMediaUnderstanding } from "../../media-understanding/apply.js"; import { @@ -40,30 +41,6 @@ function stripLeadingMediaReplyHint(prompt: string): string { return prompt.trim(); } -function replaceLastOccurrence( - value: string, - search: string, - replacement: string, -): string | undefined { - if (!search) { - return undefined; - } - const index = value.lastIndexOf(search); - if (index < 0) { - return undefined; - } - return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`; -} - -function findFirstOccurrenceBeforeFileBlocks(value: string, search: string): number { - if (!search) { - return -1; - } - const fileBlockIndex = value.search(FILE_BLOCK_RE); - const bodyRegion = fileBlockIndex >= 0 ? value.slice(0, fileBlockIndex) : value; - return bodyRegion.indexOf(search); -} - function findLastOccurrenceBeforeFileBlocks(value: string, search: string): number { if (!search) { return -1; @@ -98,21 +75,81 @@ function replaceLastOccurrenceBeforeFileBlocks( return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`; } -function replaceFirstOccurrenceBeforeFileBlocks( +function findTrailingReplacementTargetBeforeFileBlocks( + value: string, + targets: string[], +): { index: number; target: string } | undefined { + let bestMatch: { index: number; target: string } | undefined; + for (const target of targets) { + const index = findLastOccurrenceBeforeFileBlocks(value, target); + if (index < 0) { + continue; + } + if (!bestMatch || index > bestMatch.index) { + bestMatch = { index, target }; + } + } + return bestMatch; +} + +function replaceOccurrenceAtIndex( value: string, search: string, replacement: string, -): string | undefined { - if (!search) { - return undefined; - } - const index = findFirstOccurrenceBeforeFileBlocks(value, search); - if (index < 0) { - return undefined; - } + index: number, +): string { return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`; } +function decodeXmlAttr(value: string): string { + return value + .replace(/"/g, '"') + .replace(/'/g, "'") + .replace(/</g, "<") + .replace(/>/g, ">") + .replace(/&/g, "&"); +} + +function extractAttachmentFileName(value?: string): string | undefined { + const trimmed = value?.trim(); + if (!trimmed) { + return undefined; + } + if (/^[a-zA-Z][a-zA-Z\d+.-]*:/.test(trimmed)) { + try { + const pathname = new URL(trimmed).pathname; + const basename = path.posix.basename(pathname); + return basename || undefined; + } catch { + // Fall back to path-style parsing below. + } + } + const normalized = trimmed.replace(/\\/g, "/"); + const basename = path.posix.basename(normalized); + return basename || undefined; +} + +function bodyContainsMatchingFileBlock(mediaContext: FollowupMediaContext): boolean { + const body = mediaContext.Body?.trim(); + if (!body || !FILE_BLOCK_RE.test(body)) { + return false; + } + const bodyFileNames = new Set(); + for (const match of body.matchAll(/]*>/gi)) { + const fileName = match[1]?.trim(); + if (fileName) { + bodyFileNames.add(decodeXmlAttr(fileName)); + } + } + if (bodyFileNames.size === 0) { + return false; + } + return normalizeAttachments(mediaContext as MsgContext).some((attachment) => { + const fileName = extractAttachmentFileName(attachment.path ?? attachment.url); + return Boolean(fileName && bodyFileNames.has(fileName)); + }); +} + function stripInlineDirectives(text: string | undefined): string { return parseInlineDirectives(text ?? "").cleaned.trim(); } @@ -168,12 +205,14 @@ function rebuildQueuedPromptWithMediaUnderstanding(params: { // thread history above the body, and prompts whose original body no longer // appears all retain any legitimate blocks. if (params.updatedBody && FILE_BLOCK_RE.test(params.updatedBody)) { - const bodyIdx = - replacementTargets - .map((target) => findFirstOccurrenceBeforeFileBlocks(stripped, target)) - .find((index) => index >= 0) ?? -1; - if (bodyIdx >= 0) { - stripped = stripped.slice(0, bodyIdx) + stripExistingFileBlocks(stripped.slice(bodyIdx)); + const trailingMatch = findTrailingReplacementTargetBeforeFileBlocks( + stripped, + replacementTargets, + ); + if (trailingMatch) { + stripped = + stripped.slice(0, trailingMatch.index) + + stripExistingFileBlocks(stripped.slice(trailingMatch.index)); } } @@ -186,12 +225,15 @@ function rebuildQueuedPromptWithMediaUnderstanding(params: { } let rebuilt = stripped; - for (const target of replacementTargets) { - const replaced = replaceFirstOccurrenceBeforeFileBlocks(rebuilt, target, updatedBody); - if (replaced !== undefined) { - rebuilt = replaced; - return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim(); - } + const trailingMatch = findTrailingReplacementTargetBeforeFileBlocks(rebuilt, replacementTargets); + if (trailingMatch) { + rebuilt = replaceOccurrenceAtIndex( + rebuilt, + trailingMatch.target, + updatedBody, + trailingMatch.index, + ); + return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim(); } rebuilt = [rebuilt, updatedBody].filter(Boolean).join("\n\n"); @@ -268,29 +310,29 @@ export async function applyDeferredMediaUnderstandingToQueuedRun( if (!mediaContext || mediaContext.DeferredMediaApplied) { return; } - if (mediaContext.MediaUnderstanding?.length) { - mediaContext.DeferredMediaApplied = true; - return; - } if (!hasMediaAttachments(mediaContext)) { mediaContext.DeferredMediaApplied = true; return; } - - const resolvedOriginalBody = - mediaContext.CommandBody ?? mediaContext.RawBody ?? mediaContext.Body; - // Detect file extraction from the primary path via body mutation instead of - // scanning for literal ' { }; expect(agentCall?.prompt?.match(/ { + const fileBlock = '\nreport content\n'; + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nsummarize this\n\n${fileBlock}`, + mediaContext: { + Body: `summarize this\n\n${fileBlock}`, + CommandBody: "summarize this", + MediaPaths: ["/tmp/report.pdf"], + MediaTypes: ["application/pdf"], + }, + }), + ); + + expect(applyMediaUnderstandingMock).not.toHaveBeenCalled(); + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain(fileBlock); + }); + + it("replaces the trailing repeated body segment instead of the first matching thread text", async () => { + const existingFileBlock = + '\nold extracted content\n'; + const newFileBlock = + '\nnew extracted content\n'; + const transcriptText = "Deferred transcript"; + + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = transcriptText; + params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}\n\nsummarize this\n\n${newFileBlock}`; + return { + outputs: [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: true, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: + `[media attached 1/2: /tmp/voice.ogg]\n[media attached 2/2: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nThread history: summarize this\n\n` + + `summarize this\n\n${existingFileBlock}`, + mediaContext: { + Body: `summarize this\n\n${existingFileBlock}`, + CommandBody: "summarize this", + RawBody: "summarize this", + MediaPaths: ["/tmp/voice.ogg", "/tmp/report.pdf"], + MediaTypes: ["audio/ogg", "application/pdf"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("Thread history: summarize this"); + expect(agentCall?.prompt).toContain(transcriptText); + expect(agentCall?.prompt).toContain(newFileBlock); + expect(agentCall?.prompt).not.toContain("old extracted content"); + expect(agentCall?.prompt?.indexOf(newFileBlock)).toBeGreaterThan( + agentCall?.prompt?.lastIndexOf("summarize this") ?? -1, + ); + }); }); describe("followup queue drain deferred media understanding", () => { @@ -2163,6 +2271,56 @@ describe("followup queue drain deferred media understanding", () => { expect(prompt).not.toContain("[media attached: /tmp/voice.ogg"); }); + it("preprocesses queued runs in parallel", async () => { + const resolvers: Array<() => void> = []; + const done = () => ({ + outputs: [], + decisions: [], + appliedImage: false, + appliedAudio: false, + appliedVideo: false, + appliedFile: false, + }); + applyMediaUnderstandingMock.mockImplementation( + async () => + await new Promise((resolve) => { + resolvers.push(() => resolve(done())); + }), + ); + + const items: FollowupRun[] = [ + createQueuedRun({ + prompt: "[media attached: /tmp/voice-1.ogg (audio/ogg)]\nfirst text", + summaryLine: "first text", + run: { messageProvider: "telegram" }, + mediaContext: { + Body: "first text", + MediaPaths: ["/tmp/voice-1.ogg"], + MediaTypes: ["audio/ogg"], + }, + }), + createQueuedRun({ + prompt: "[media attached: /tmp/voice-2.ogg (audio/ogg)]\nsecond text", + summaryLine: "second text", + run: { messageProvider: "telegram" }, + mediaContext: { + Body: "second text", + MediaPaths: ["/tmp/voice-2.ogg"], + MediaTypes: ["audio/ogg"], + }, + }), + ]; + + const pending = applyDeferredMediaToQueuedRuns(items); + + expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(2); + + for (const resolve of resolvers) { + resolve(); + } + await pending; + }); + it("preprocesses dropped media items before building overflow summaries", async () => { applyMediaUnderstandingMock.mockImplementationOnce( async (params: { ctx: Record }) => { diff --git a/src/auto-reply/reply/queue/drain.ts b/src/auto-reply/reply/queue/drain.ts index 471c94200ba..5d53df34189 100644 --- a/src/auto-reply/reply/queue/drain.ts +++ b/src/auto-reply/reply/queue/drain.ts @@ -76,9 +76,12 @@ function clearFollowupQueueSummaryState(queue: FollowupQueueState): void { } export async function applyDeferredMediaToQueuedRuns(items: FollowupRun[]): Promise { - for (const item of items) { - await applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" }); - } + await Promise.allSettled( + items.map( + async (item) => + await applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" }), + ), + ); } async function resolveSummaryLines(items: FollowupRun[]): Promise { From 5ec82dd0e988227f77e79a53bd279255491e5d16 Mon Sep 17 00:00:00 2001 From: Joey Krug Date: Sun, 15 Mar 2026 18:00:45 -0400 Subject: [PATCH 14/17] fix: parallelize deferred media calls and search full prompt outside file blocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Parallelize deferred media understanding calls in resolveSummaryLines and applyDeferredMediaToQueuedRuns using Promise.allSettled so media API calls run concurrently while summary line order stays sequential. Replace findFirstOccurrenceBeforeFileBlocks (which truncated at the first tag) with findLastOccurrenceOutsideFileBlocks that searches the full prompt via lastIndexOf and skips matches inside blocks. This fixes body replacement when thread/history context has extracted file blocks before the current queued message body. Add regression test for body appearing after thread-history file blocks. --- src/auto-reply/reply/followup-media.test.ts | 40 +++++++---- src/auto-reply/reply/followup-media.ts | 58 +++++++++++----- src/auto-reply/reply/followup-runner.test.ts | 72 ++++++++++++++++++++ src/auto-reply/reply/queue/drain.ts | 21 +++--- 4 files changed, 152 insertions(+), 39 deletions(-) diff --git a/src/auto-reply/reply/followup-media.test.ts b/src/auto-reply/reply/followup-media.test.ts index d8ba453f801..77996f85606 100644 --- a/src/auto-reply/reply/followup-media.test.ts +++ b/src/auto-reply/reply/followup-media.test.ts @@ -1,43 +1,57 @@ import { describe, expect, it } from "vitest"; import { - _findLastOccurrenceBeforeFileBlocks as findLastOccurrenceBeforeFileBlocks, + _findLastOccurrenceOutsideFileBlocks as findLastOccurrenceOutsideFileBlocks, _normalizeUpdatedBody as normalizeUpdatedBody, _rebuildQueuedPromptWithMediaUnderstanding as rebuildQueuedPromptWithMediaUnderstanding, } from "./followup-media.js"; const FILE_BLOCK = '\nPDF content\n'; -describe("findLastOccurrenceBeforeFileBlocks", () => { +describe("findLastOccurrenceOutsideFileBlocks", () => { it("returns -1 for empty search", () => { - expect(findLastOccurrenceBeforeFileBlocks("hello", "")).toBe(-1); + expect(findLastOccurrenceOutsideFileBlocks("hello", "")).toBe(-1); }); it("finds last occurrence in body region before file blocks", () => { const value = `hello world hello\n${FILE_BLOCK}`; // "hello" appears at 0 and 12 — both before the file block - expect(findLastOccurrenceBeforeFileBlocks(value, "hello")).toBe(12); + expect(findLastOccurrenceOutsideFileBlocks(value, "hello")).toBe(12); }); - it("does not match inside file block content", () => { + it("skips matches inside file block content", () => { + // "PDF content" appears only inside the file block — no valid match outside. + const value = `some text\n${FILE_BLOCK}`; + expect(findLastOccurrenceOutsideFileBlocks(value, "PDF content")).toBe(-1); + }); + + it("finds trailing occurrence outside file block even when also inside one", () => { const value = `some text\n${FILE_BLOCK}\nPDF content`; - // "PDF content" appears in the file block and after it, but the body region - // (before { - // When the search string contains a block, it can't appear in the - // body-only region, so the fallback searches the full value. + it("finds occurrence when search itself contains file blocks", () => { const bodyWithFile = `caption\n${FILE_BLOCK}`; const value = `previous\n${bodyWithFile}\nlater\n${bodyWithFile}`; // Should find the *last* (trailing) occurrence const expected = value.lastIndexOf(bodyWithFile); - expect(findLastOccurrenceBeforeFileBlocks(value, bodyWithFile)).toBe(expected); + expect(findLastOccurrenceOutsideFileBlocks(value, bodyWithFile)).toBe(expected); expect(expected).toBeGreaterThan(value.indexOf(bodyWithFile)); }); it("returns index when no file blocks exist in value", () => { - expect(findLastOccurrenceBeforeFileBlocks("abc abc", "abc")).toBe(4); + expect(findLastOccurrenceOutsideFileBlocks("abc abc", "abc")).toBe(4); + }); + + it("finds body text after thread-history file blocks", () => { + const value = `Thread history\n${FILE_BLOCK}\n\ncheck this out`; + // The body "check this out" appears after a file block from thread history. + // The old truncation approach would miss this; the new approach finds it. + expect(findLastOccurrenceOutsideFileBlocks(value, "check this out")).toBe( + value.lastIndexOf("check this out"), + ); }); }); diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts index 425bdd601ea..f0d5d951683 100644 --- a/src/auto-reply/reply/followup-media.ts +++ b/src/auto-reply/reply/followup-media.ts @@ -41,26 +41,48 @@ function stripLeadingMediaReplyHint(prompt: string): string { return prompt.trim(); } -function findLastOccurrenceBeforeFileBlocks(value: string, search: string): number { +/** Collect the [start, end) ranges of every `` block in `value`. */ +function collectFileBlockRanges(value: string): Array<[number, number]> { + const ranges: Array<[number, number]> = []; + const re = new RegExp(FILE_BLOCK_FULL_RE.source, FILE_BLOCK_FULL_RE.flags); + let m: RegExpExecArray | null; + while ((m = re.exec(value)) !== null) { + ranges.push([m.index, m.index + m[0].length]); + } + return ranges; +} + +function isInsideFileBlock( + position: number, + length: number, + ranges: Array<[number, number]>, +): boolean { + for (const [start, end] of ranges) { + if (position >= start && position + length <= end) { + return true; + } + } + return false; +} + +/** + * Find the last occurrence of `search` in `value` that is NOT inside a + * `` block. Searches the full string with lastIndexOf, + * then walks backward past any matches that fall inside file blocks. + */ +function findLastOccurrenceOutsideFileBlocks(value: string, search: string): number { if (!search) { return -1; } - const fileBlockIndex = value.search(FILE_BLOCK_RE); - const bodyRegion = fileBlockIndex >= 0 ? value.slice(0, fileBlockIndex) : value; - const index = bodyRegion.lastIndexOf(search); - if (index >= 0) { - return index; + const ranges = collectFileBlockRanges(value); + let pos = value.lastIndexOf(search); + while (pos >= 0 && isInsideFileBlock(pos, search.length, ranges)) { + pos = value.lastIndexOf(search, pos - 1); } - // Fallback: search string itself contains file blocks — it can't appear in the - // body-only region. Search the full value with lastIndexOf to pick the trailing - // (most recent) occurrence when thread/history has identical bodies. - if (FILE_BLOCK_RE.test(search)) { - return value.lastIndexOf(search); - } - return -1; + return pos; } -function replaceLastOccurrenceBeforeFileBlocks( +function replaceLastOccurrenceOutsideFileBlocks( value: string, search: string, replacement: string, @@ -68,7 +90,7 @@ function replaceLastOccurrenceBeforeFileBlocks( if (!search) { return undefined; } - const index = findLastOccurrenceBeforeFileBlocks(value, search); + const index = findLastOccurrenceOutsideFileBlocks(value, search); if (index < 0) { return undefined; } @@ -81,7 +103,7 @@ function findTrailingReplacementTargetBeforeFileBlocks( ): { index: number; target: string } | undefined { let bestMatch: { index: number; target: string } | undefined; for (const target of targets) { - const index = findLastOccurrenceBeforeFileBlocks(value, target); + const index = findLastOccurrenceOutsideFileBlocks(value, target); if (index < 0) { continue; } @@ -172,7 +194,7 @@ function normalizeUpdatedBody(params: { originalBody?: string; updatedBody?: str return cleanedOriginalBody; } return ( - replaceLastOccurrenceBeforeFileBlocks(updatedBody, originalBody, cleanedOriginalBody) ?? + replaceLastOccurrenceOutsideFileBlocks(updatedBody, originalBody, cleanedOriginalBody) ?? updatedBody ).trim(); } @@ -294,7 +316,7 @@ function snapshotUpdatedMediaContext(params: { // Exported for unit testing — these are pure string helpers with no side effects. export { - findLastOccurrenceBeforeFileBlocks as _findLastOccurrenceBeforeFileBlocks, + findLastOccurrenceOutsideFileBlocks as _findLastOccurrenceOutsideFileBlocks, normalizeUpdatedBody as _normalizeUpdatedBody, rebuildQueuedPromptWithMediaUnderstanding as _rebuildQueuedPromptWithMediaUnderstanding, }; diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts index 0db30881b7e..2c94f03fb8d 100644 --- a/src/auto-reply/reply/followup-runner.test.ts +++ b/src/auto-reply/reply/followup-runner.test.ts @@ -2025,6 +2025,78 @@ describe("createFollowupRunner media understanding", () => { ); }); + it("finds the body after thread-history file blocks when body appears after the first tag", async () => { + const threadFileBlock = + '\nolder thread attachment\n'; + const transcriptText = "Transcript from deferred voice note"; + + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = transcriptText; + params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}\n\ncheck this out`; + return { + outputs: [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + // The prompt has thread history with a file block BEFORE the current + // queued body text. The old truncation approach would miss the body + // because it only searched before the first tag. + await runner( + createQueuedRun({ + prompt: `[media attached: /tmp/voice.ogg]\n${MEDIA_REPLY_HINT}\nThread history\n\n${threadFileBlock}\n\ncheck this out`, + mediaContext: { + Body: "check this out", + RawBody: "check this out", + MediaPaths: ["/tmp/voice.ogg"], + MediaTypes: ["audio/ogg"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + const transcriptBlock = `[Audio]\nTranscript:\n${transcriptText}\n\ncheck this out`; + // The body should be replaced with the transcript block + expect(agentCall?.prompt).toContain(transcriptBlock); + // Thread history and its file block should be preserved + expect(agentCall?.prompt).toContain("Thread history"); + expect(agentCall?.prompt).toContain(threadFileBlock); + }); + it("sets DeferredMediaApplied when media understanding throws", async () => { applyMediaUnderstandingMock.mockRejectedValueOnce( new Error("transcription service unavailable"), diff --git a/src/auto-reply/reply/queue/drain.ts b/src/auto-reply/reply/queue/drain.ts index 5d53df34189..41c859d2ec3 100644 --- a/src/auto-reply/reply/queue/drain.ts +++ b/src/auto-reply/reply/queue/drain.ts @@ -85,14 +85,19 @@ export async function applyDeferredMediaToQueuedRuns(items: FollowupRun[]): Prom } async function resolveSummaryLines(items: FollowupRun[]): Promise { - const summaryLines: string[] = []; - for (const item of items) { - await applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" }); - // After deferred media, prefer the updated prompt (which includes transcripts) - // over the original summaryLine (which may just be the caption text). - summaryLines.push(buildQueueSummaryLine(item.prompt.trim() || item.summaryLine?.trim() || "")); - } - return summaryLines; + // Parallelize the media understanding API calls upfront (same pattern as + // applyDeferredMediaToQueuedRuns), then build summary lines sequentially + // so line order matches the original item order. + await Promise.allSettled( + items.map((item) => + applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" }), + ), + ); + // After deferred media, prefer the updated prompt (which includes transcripts) + // over the original summaryLine (which may just be the caption text). + return items.map((item) => + buildQueueSummaryLine(item.prompt.trim() || item.summaryLine?.trim() || ""), + ); } export async function buildMediaAwareQueueSummaryPrompt(params: { From e8fd176cf491f2e03c42bd02a190c5910f9db637 Mon Sep 17 00:00:00 2001 From: Joey Krug Date: Sun, 15 Mar 2026 20:36:20 -0400 Subject: [PATCH 15/17] fix: match file blocks by presence not basename (#46454) --- src/auto-reply/reply/followup-media.ts | 63 ++++---------------- src/auto-reply/reply/followup-runner.test.ts | 35 +++++++++++ 2 files changed, 45 insertions(+), 53 deletions(-) diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts index f0d5d951683..dca7b4b414b 100644 --- a/src/auto-reply/reply/followup-media.ts +++ b/src/auto-reply/reply/followup-media.ts @@ -1,4 +1,3 @@ -import path from "node:path"; import { logVerbose } from "../../globals.js"; import { applyMediaUnderstanding } from "../../media-understanding/apply.js"; import { @@ -14,6 +13,7 @@ const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]"; const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool"; const LEADING_MEDIA_ATTACHED_LINE_RE = /^\[media attached(?: \d+\/\d+)?: [^\r\n]*\]$/; const FILE_BLOCK_RE = /]*>[\s\S]*?<\/file>/i; const FILE_BLOCK_FULL_RE = /]*>[\s\S]*?<\/file>\n?/gi; function stripExistingFileBlocks(text: string): string { @@ -123,59 +123,14 @@ function replaceOccurrenceAtIndex( return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`; } -function decodeXmlAttr(value: string): string { - return value - .replace(/"/g, '"') - .replace(/'/g, "'") - .replace(/</g, "<") - .replace(/>/g, ">") - .replace(/&/g, "&"); -} - -function extractAttachmentFileName(value?: string): string | undefined { - const trimmed = value?.trim(); - if (!trimmed) { - return undefined; - } - if (/^[a-zA-Z][a-zA-Z\d+.-]*:/.test(trimmed)) { - try { - const pathname = new URL(trimmed).pathname; - const basename = path.posix.basename(pathname); - return basename || undefined; - } catch { - // Fall back to path-style parsing below. - } - } - const normalized = trimmed.replace(/\\/g, "/"); - const basename = path.posix.basename(normalized); - return basename || undefined; -} - -function bodyContainsMatchingFileBlock(mediaContext: FollowupMediaContext): boolean { - const body = mediaContext.Body?.trim(); - if (!body || !FILE_BLOCK_RE.test(body)) { - return false; - } - const bodyFileNames = new Set(); - for (const match of body.matchAll(/]*>/gi)) { - const fileName = match[1]?.trim(); - if (fileName) { - bodyFileNames.add(decodeXmlAttr(fileName)); - } - } - if (bodyFileNames.size === 0) { - return false; - } - return normalizeAttachments(mediaContext as MsgContext).some((attachment) => { - const fileName = extractAttachmentFileName(attachment.path ?? attachment.url); - return Boolean(fileName && bodyFileNames.has(fileName)); - }); -} - function stripInlineDirectives(text: string | undefined): string { return parseInlineDirectives(text ?? "").cleaned.trim(); } +function bodyContainsExtractedFileBlock(text: string | undefined): boolean { + return FILE_BLOCK_BODY_RE.test(text ?? ""); +} + function normalizeUpdatedBody(params: { originalBody?: string; updatedBody?: string }): string { const updatedBody = params.updatedBody?.trim(); if (!updatedBody) { @@ -336,10 +291,12 @@ export async function applyDeferredMediaUnderstandingToQueuedRun( mediaContext.DeferredMediaApplied = true; return; } + const referenceBody = mediaContext.RawBody ?? mediaContext.Body; // Prefer RawBody-vs-Body comparison when RawBody exists. If RawBody is - // missing, fall back to explicit file-extraction signals instead of - // re-running extraction just because the clean pre-extraction body is gone. + // missing, any real ... block plus file-like attachments means + // extraction already ran, even if the stored name came from Content-Disposition + // instead of the attachment path/url basename. if (!mediaContext.DeferredFileBlocksExtracted && hasAnyFileAttachments(mediaContext)) { const rawBodyMissing = typeof mediaContext.RawBody !== "string"; if (mediaContext.Body !== referenceBody) { @@ -347,7 +304,7 @@ export async function applyDeferredMediaUnderstandingToQueuedRun( } else if ( rawBodyMissing && (Boolean(mediaContext.MediaUnderstanding?.length) || - bodyContainsMatchingFileBlock(mediaContext)) + bodyContainsExtractedFileBlock(mediaContext.Body)) ) { mediaContext.DeferredFileBlocksExtracted = true; } diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts index 2c94f03fb8d..0c8ac63d4d7 100644 --- a/src/auto-reply/reply/followup-runner.test.ts +++ b/src/auto-reply/reply/followup-runner.test.ts @@ -2201,6 +2201,41 @@ describe("createFollowupRunner media understanding", () => { expect(agentCall?.prompt).toContain(fileBlock); }); + it("treats any stored file block as already extracted even when the filename differs from the attachment basename", async () => { + const fileBlock = + '\nreport content\n'; + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + const queued = createQueuedRun({ + prompt: `[media attached: /tmp/upload-8472.bin]\n${MEDIA_REPLY_HINT}\nsummarize this\n\n${fileBlock}`, + mediaContext: { + Body: `summarize this\n\n${fileBlock}`, + CommandBody: "summarize this", + MediaPaths: ["/tmp/upload-8472.bin"], + MediaTypes: ["application/pdf"], + }, + }); + + await runner(queued); + + expect(applyMediaUnderstandingMock).not.toHaveBeenCalled(); + expect(queued.mediaContext?.DeferredFileBlocksExtracted).toBe(true); + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt?.match(/ { const existingFileBlock = '\nold extracted content\n'; From 1b6c43baa47f9e09a670bc055b73ee32b65bc693 Mon Sep 17 00:00:00 2001 From: Joey Krug Date: Sun, 15 Mar 2026 21:54:48 -0400 Subject: [PATCH 16/17] fix: stop inferring file extraction from Body/RawBody mismatch (#46454) --- src/auto-reply/reply/followup-media.ts | 28 +++++------- src/auto-reply/reply/followup-runner.test.ts | 48 ++++++++++++++++++++ src/auto-reply/reply/queue/types.ts | 6 +-- 3 files changed, 62 insertions(+), 20 deletions(-) diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts index dca7b4b414b..19b2c061c8c 100644 --- a/src/auto-reply/reply/followup-media.ts +++ b/src/auto-reply/reply/followup-media.ts @@ -292,27 +292,21 @@ export async function applyDeferredMediaUnderstandingToQueuedRun( return; } - const referenceBody = mediaContext.RawBody ?? mediaContext.Body; - // Prefer RawBody-vs-Body comparison when RawBody exists. If RawBody is - // missing, any real ... block plus file-like attachments means - // extraction already ran, even if the stored name came from Content-Disposition - // instead of the attachment path/url basename. - if (!mediaContext.DeferredFileBlocksExtracted && hasAnyFileAttachments(mediaContext)) { - const rawBodyMissing = typeof mediaContext.RawBody !== "string"; - if (mediaContext.Body !== referenceBody) { - mediaContext.DeferredFileBlocksExtracted = true; - } else if ( - rawBodyMissing && - (Boolean(mediaContext.MediaUnderstanding?.length) || - bodyContainsExtractedFileBlock(mediaContext.Body)) - ) { - mediaContext.DeferredFileBlocksExtracted = true; - } - } if (mediaContext.MediaUnderstanding?.length) { mediaContext.DeferredMediaApplied = true; return; } + // Treat followup file extraction as already applied only when we have explicit + // evidence: the queue snapshot already flagged it or Body already contains a + // real extracted ... block. Body/RawBody mismatches are not + // reliable because some channels wrap Body with envelope metadata. + if ( + !mediaContext.DeferredFileBlocksExtracted && + hasAnyFileAttachments(mediaContext) && + bodyContainsExtractedFileBlock(mediaContext.Body) + ) { + mediaContext.DeferredFileBlocksExtracted = true; + } if (mediaContext.DeferredFileBlocksExtracted && hasOnlyFileLikeAttachments(mediaContext)) { mediaContext.DeferredMediaApplied = true; diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts index 0c8ac63d4d7..f766aa57321 100644 --- a/src/auto-reply/reply/followup-runner.test.ts +++ b/src/auto-reply/reply/followup-runner.test.ts @@ -1231,6 +1231,54 @@ describe("createFollowupRunner media understanding", () => { expect(agentCall?.prompt?.match(/ { + const fileBlock = '\nreport content\n'; + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.Body = `summarize this\n\n${fileBlock}`; + return { + outputs: [], + decisions: [], + appliedImage: false, + appliedAudio: false, + appliedVideo: false, + appliedFile: true, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "file processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: "[media attached: /tmp/report.pdf]\nLine: Alice\nsummarize this", + mediaContext: { + Body: "Line: Alice\nsummarize this", + RawBody: "summarize this", + MediaPaths: ["/tmp/report.pdf"], + MediaTypes: ["application/pdf"], + }, + }), + ); + + expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1); + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("Line: Alice"); + expect(agentCall?.prompt).toContain(fileBlock); + expect(agentCall?.prompt?.match(/ { applyMediaUnderstandingMock.mockImplementationOnce( async (params: { ctx: Record }) => { diff --git a/src/auto-reply/reply/queue/types.ts b/src/auto-reply/reply/queue/types.ts index e1e9e20e5c8..1c2f5e0551a 100644 --- a/src/auto-reply/reply/queue/types.ts +++ b/src/auto-reply/reply/queue/types.ts @@ -52,9 +52,9 @@ export type FollowupMediaContext = { DeferredMediaApplied?: boolean; /** * Set when file extraction has already been applied to Body (either in the - * primary path or by a previous deferred-media run). Checked instead of - * scanning body text for `...` + * blocks. */ DeferredFileBlocksExtracted?: boolean; }; From fddce3db72f5e22b8d3a1d2f7a12daf15b3e045c Mon Sep 17 00:00:00 2001 From: Joey Krug Date: Sat, 21 Mar 2026 00:28:15 -0400 Subject: [PATCH 17/17] fix: remove redundant async wrapper in drain.ts --- src/auto-reply/reply/queue/drain.ts | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/auto-reply/reply/queue/drain.ts b/src/auto-reply/reply/queue/drain.ts index 41c859d2ec3..27491713848 100644 --- a/src/auto-reply/reply/queue/drain.ts +++ b/src/auto-reply/reply/queue/drain.ts @@ -77,9 +77,8 @@ function clearFollowupQueueSummaryState(queue: FollowupQueueState): void { export async function applyDeferredMediaToQueuedRuns(items: FollowupRun[]): Promise { await Promise.allSettled( - items.map( - async (item) => - await applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" }), + items.map((item) => + applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" }), ), ); }