diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts index 5a014e63f9b..5340d0df99a 100644 --- a/src/auto-reply/reply/followup-media.ts +++ b/src/auto-reply/reply/followup-media.ts @@ -13,6 +13,11 @@ const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]"; const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool"; const LEADING_MEDIA_ATTACHED_LINE_RE = /^\[media attached(?: \d+\/\d+)?: [^\r\n]*\]$/; const FILE_BLOCK_RE = /<file[^>]*>[\s\S]*?<\/file>\n?/gi; + +function stripExistingFileBlocks(text: string): string { + return text.replace(FILE_BLOCK_RE, "").trim(); +} function stripLeadingMediaAttachedLines(prompt: string): string { const lines = prompt.split("\n"); @@ -87,6 +92,15 @@ function rebuildQueuedPromptWithMediaUnderstanding(params: { stripped = stripLeadingMediaReplyHint(stripped); } + // Strip pre-existing file blocks from the prompt when the updated body + // contains new file blocks. Mixed messages (audio + PDF) can arrive with + // file extraction already applied in the primary path; without this strip + // the old block stays in the prompt while the updated body adds a new one, + // duplicating potentially large file payloads. + if (params.updatedBody && FILE_BLOCK_RE.test(params.updatedBody)) { + stripped = stripExistingFileBlocks(stripped); + } + const updatedBody = normalizeUpdatedBody({ originalBody: params.originalBody, updatedBody: params.updatedBody, @@ -234,6 +248,7 @@ export async function applyDeferredMediaUnderstandingToQueuedRun( updatedBody: shouldRebuildPrompt ? mediaCtx.Body : undefined, }); } catch (err) { + mediaContext.DeferredMediaApplied = true; logVerbose( `${params.logLabel ?? "followup"}: media understanding failed, proceeding with raw content: ${err instanceof Error ? 
err.message : String(err)}`, ); diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts index c3e75e6b856..cd1951aa748 100644 --- a/src/auto-reply/reply/followup-runner.test.ts +++ b/src/auto-reply/reply/followup-runner.test.ts @@ -1808,6 +1808,117 @@ describe("createFollowupRunner media understanding", () => { expect(matches?.length).toBe(1); }); + it("does not duplicate file blocks for mixed audio+file messages re-processed in followup", async () => { + const existingFileBlock = + '<file name="report.pdf">\nold extracted content\n</file>'; + const newFileBlock = + '<file name="report.pdf">\nnew extracted content\n</file>'; + const transcriptText = "Mixed message transcript"; + + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record<string, unknown> }) => { + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = transcriptText; + params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}\n\nanalyze this\n\n${newFileBlock}`; + return { + outputs: [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: true, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + // Simulate a mixed message where the primary path already extracted the + // PDF (file block is in the prompt) but audio transcription failed. 
+ await runner( + createQueuedRun({ + prompt: `[media attached 1/2: /tmp/voice.ogg]\n[media attached 2/2: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nanalyze this\n\n${existingFileBlock}`, + mediaContext: { + Body: `analyze this\n\n${existingFileBlock}`, + CommandBody: "analyze this", + RawBody: "analyze this", + MediaPaths: ["/tmp/voice.ogg", "/tmp/report.pdf"], + MediaTypes: ["audio/ogg", "application/pdf"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + // Should contain the transcript + expect(agentCall?.prompt).toContain(transcriptText); + // Should have exactly one file block (the new one), not two + expect(agentCall?.prompt?.match(/<file/g)?.length).toBe(1); + }); + + it("sets DeferredMediaApplied even when media understanding fails", async () => { + applyMediaUnderstandingMock.mockRejectedValueOnce( + new Error("transcription service unavailable"), + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "fallback reply" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + const queued = createQueuedRun({ + prompt: "[media attached: /tmp/voice.ogg (audio/ogg)]\nsome text", + mediaContext: { + Body: "some text", + MediaPaths: ["/tmp/voice.ogg"], + MediaTypes: ["audio/ogg"], + }, + }); + + await runner(queued); + + // DeferredMediaApplied should be set so re-runs don't retry + expect(queued.mediaContext?.DeferredMediaApplied).toBe(true); + + // The agent should still be called with the raw prompt + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("some text"); + }); + it("does not re-apply file extraction when the stored media body already has a file block", async () => { const fileBlock = '<file name="report.pdf">\nreport content\n</file>'; runEmbeddedPiAgentMock.mockResolvedValueOnce({