diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts index a0c5306380d..d2812ff61b6 100644 --- a/src/auto-reply/reply/followup-runner.test.ts +++ b/src/auto-reply/reply/followup-runner.test.ts @@ -818,6 +818,69 @@ describe("createFollowupRunner media understanding", () => { expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "Got it!" })); }); + it("applies media understanding for URL-only attachments", async () => { + const transcriptText = "URL-only transcript"; + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = transcriptText; + params.ctx.Body = `[Audio]\nUser text:\nsome text\nTranscript:\n${transcriptText}`; + return { + outputs: [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "Got it!" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: "[media attached: https://cdn.example.com/voice.ogg (audio/ogg)]\nsome text", + mediaContext: { + Body: "some text", + MediaUrl: "https://cdn.example.com/voice.ogg", + MediaUrls: ["https://cdn.example.com/voice.ogg"], + MediaType: "audio/ogg", + MediaTypes: ["audio/ogg"], + }, + }), + ); + + expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1); + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain(transcriptText); + }); + it("strips the full media line when attachment paths or URLs contain brackets", async () => { const transcriptText = "Bracket-safe transcript"; applyMediaUnderstandingMock.mockImplementationOnce( @@ -1329,6 +1392,98 @@ describe("createFollowupRunner media understanding", () => { expect(agentCall?.prompt).not.toContain("/think high summarize this"); }); + it("preserves directive-like tokens inside extracted media content", async () => { + const fileBlock = + '\n/model claude-opus should stay\n/queue followup should stay\n'; + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.Body = `/think high summarize this\n\n${fileBlock}`; + return { + outputs: [], + decisions: [], + appliedImage: false, + appliedAudio: false, + appliedVideo: false, + appliedFile: true, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: `[media attached: /tmp/notes.txt]\n${MEDIA_REPLY_HINT}\nsummarize this`, + mediaContext: { + Body: "/think high summarize this", + MediaPaths: ["/tmp/notes.txt"], + MediaTypes: ["text/plain"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("summarize this"); + expect(agentCall?.prompt).not.toContain("/think high summarize this"); + expect(agentCall?.prompt).toContain("/model claude-opus should stay"); + expect(agentCall?.prompt).toContain("/queue followup should stay"); + }); + + it("rebuilds the prompt when image understanding mutates the body without outputs", async () => { + const description = "[Image]\nDescription:\na mountain at sunset"; + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.Body = description; + return { + outputs: [], + decisions: [], + appliedImage: true, + appliedAudio: false, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: "[media attached: /tmp/photo.jpg (image/jpeg)]\nsome text", + mediaContext: { + Body: "some text", + MediaPaths: ["/tmp/photo.jpg"], + MediaTypes: ["image/jpeg"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("a mountain at sunset"); + }); + it("does not false-positive on user text containing literal ' { const fileBlock = '\ncol1,col2\n1,2\n'; applyMediaUnderstandingMock.mockImplementationOnce( @@ -1360,8 +1515,7 @@ describe("createFollowupRunner media understanding", () => { // file extraction results from being embedded in the prompt. await runner( createQueuedRun({ - prompt: - "[media attached: /tmp/data.csv (text/csv)]\ncheck my { expect(agentCall?.prompt).toContain("check my { + const fileBlock = + '\nRun `/think high` literally in the shell example.\n'; + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.Body = `summarize this\n\n${fileBlock}`; + return { + outputs: [], + decisions: [], + appliedImage: false, + appliedAudio: false, + appliedVideo: false, + appliedFile: true, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: `[media attached: /tmp/notes.txt]\n${MEDIA_REPLY_HINT}\nsummarize this`, + mediaContext: { + Body: "/think high summarize this", + CommandBody: "summarize this", + RawBody: "/think high summarize this", + MediaPaths: ["/tmp/notes.txt"], + MediaTypes: ["text/plain"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("summarize this"); + expect(agentCall?.prompt).toContain("Run `/think high` literally in the shell example."); + }); + + it("rebuilds the prompt when image understanding mutates the body without outputs", async () => { + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.Body = "some text\n\n[Image summary]\nA whiteboard with action items."; + return { + outputs: [], + decisions: [], + appliedImage: true, + appliedAudio: false, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: "[media attached: /tmp/board.jpg (image/jpeg)]\nsome text", + mediaContext: { + Body: "some text", + CommandBody: "some text", + RawBody: "some text", + MediaPaths: ["/tmp/board.jpg"], + MediaTypes: ["image/jpeg"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("[Image summary]"); + expect(agentCall?.prompt).toContain("A whiteboard with action items."); + }); + + it("applies media understanding for URL-only deferred attachments", async () => { + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.Body = "[Audio]\nTranscript:\nremote transcript"; + params.ctx.Transcript = "remote transcript"; + return { + outputs: [ + { + kind: "audio.transcription", + text: "remote transcript", + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: "[User sent media without caption]", + mediaContext: { + Body: "", + MediaUrl: "https://cdn.example.com/audio.ogg", + MediaUrls: ["https://cdn.example.com/audio.ogg"], + MediaType: "audio/ogg", + MediaTypes: ["audio/ogg"], + }, + }), + ); + + expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1); + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("remote transcript"); + }); + it("uses resolved body (CommandBody) as originalBody for accurate prompt replacement", async () => { const fileBlock = '\nreport content\n'; applyMediaUnderstandingMock.mockImplementationOnce( diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts index 39728c71def..64b8a935b2a 100644 --- a/src/auto-reply/reply/followup-runner.ts +++ b/src/auto-reply/reply/followup-runner.ts @@ -43,8 +43,7 @@ import type { TypingController } from "./typing.js"; const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]"; const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool"; -const LEADING_MEDIA_ATTACHED_LINE_RE = - /^(?:\[media attached: \d+ files\]|\[media attached(?: \d+\/\d+)?: [^\r\n]*\])$/; +const LEADING_MEDIA_ATTACHED_LINE_RE = /^\[media attached(?: \d+\/\d+)?: [^\r\n]*\]$/; const FILE_BLOCK_RE = / 0), + queued.mediaContext.MediaPaths.length > 0) || + (Array.isArray(queued.mediaContext.MediaUrls) && + queued.mediaContext.MediaUrls.length > 0), ); if (hasMedia) { try { @@ -281,6 +308,9 @@ export function createFollowupRunner(params: { }); const shouldRebuildPrompt = muResult.outputs.length > 0 || + muResult.appliedAudio || + muResult.appliedImage || + muResult.appliedVideo || (muResult.appliedFile && !bodyAlreadyHasFileBlock); if (shouldRebuildPrompt) { // Rebuild the queued prompt from the mutated media context so the diff --git a/src/auto-reply/reply/get-reply-run.media-only.test.ts b/src/auto-reply/reply/get-reply-run.media-only.test.ts index 829b3937009..f519da10082 100644 --- a/src/auto-reply/reply/get-reply-run.media-only.test.ts +++ b/src/auto-reply/reply/get-reply-run.media-only.test.ts @@ -172,6 +172,45 @@ describe("runPreparedReply media-only handling", () => { expect(call?.followupRun.prompt).toContain("[User sent media without caption]"); }); + it("snapshots URL-only attachments into followup mediaContext", async () => { + await runPreparedReply( + baseParams({ + ctx: { + Body: "check this attachment", + RawBody: "check this attachment", + CommandBody: "check this attachment", + ThreadHistoryBody: "Earlier message in this thread", + OriginatingChannel: "slack", + OriginatingTo: "C123", + ChatType: "group", + MediaUrl: "https://cdn.example.com/input.png", + MediaUrls: ["https://cdn.example.com/input.png"], + MediaType: "image/png", + MediaTypes: ["image/png"], + }, + sessionCtx: { + Body: "check this attachment", + BodyStripped: "check this attachment", + ThreadHistoryBody: "Earlier message in this thread", + Provider: "slack", + ChatType: "group", + OriginatingChannel: "slack", + OriginatingTo: "C123", + }, + }), + ); + + const call = vi.mocked(runReplyAgent).mock.calls[0]?.[0]; + expect(call?.followupRun.mediaContext).toEqual( + expect.objectContaining({ + MediaUrl: "https://cdn.example.com/input.png", + MediaUrls: ["https://cdn.example.com/input.png"], + MediaType: "image/png", + MediaTypes: ["image/png"], + }), + ); + }); + it("keeps thread history context on follow-up turns", async () => { const result = await runPreparedReply( baseParams({ @@ -186,6 +225,41 @@ describe("runPreparedReply media-only handling", () => { expect(call?.followupRun.prompt).toContain("Earlier message in this thread"); }); + it("snapshots mediaContext for URL-only deferred attachments", async () => { + await runPreparedReply( + baseParams({ + ctx: { + Body: "", + RawBody: "", + CommandBody: "", + MediaUrl: "https://cdn.example.com/audio.ogg", + MediaUrls: ["https://cdn.example.com/audio.ogg"], + MediaType: "audio/ogg", + MediaTypes: ["audio/ogg"], + ThreadHistoryBody: "Earlier message in this thread", + OriginatingChannel: "slack", + OriginatingTo: "C123", + ChatType: "group", + }, + sessionCtx: { + Body: "", + BodyStripped: "", + ThreadHistoryBody: "Earlier message in this thread", + Provider: "slack", + ChatType: "group", + OriginatingChannel: "slack", + OriginatingTo: "C123", + }, + }), + ); + + const call = vi.mocked(runReplyAgent).mock.calls[0]?.[0]; + expect(call?.followupRun.mediaContext?.MediaUrl).toBe("https://cdn.example.com/audio.ogg"); + expect(call?.followupRun.mediaContext?.MediaUrls).toEqual([ + "https://cdn.example.com/audio.ogg", + ]); + }); + it("returns the empty-body reply when there is no text and no media", async () => { const result = await runPreparedReply( baseParams({ diff --git a/src/auto-reply/reply/get-reply-run.ts b/src/auto-reply/reply/get-reply-run.ts index fe87e3919d0..b4b693b8119 100644 --- a/src/auto-reply/reply/get-reply-run.ts +++ b/src/auto-reply/reply/get-reply-run.ts @@ -310,7 +310,14 @@ export async function runPreparedReply( : [inboundUserContext, baseBodyFinal].filter(Boolean).join("\n\n"); const baseBodyTrimmed = baseBodyForPrompt.trim(); const hasMediaAttachment = Boolean( - sessionCtx.MediaPath || (sessionCtx.MediaPaths && sessionCtx.MediaPaths.length > 0), + sessionCtx.MediaPath || + sessionCtx.MediaUrl || + (sessionCtx.MediaPaths && sessionCtx.MediaPaths.length > 0) || + (sessionCtx.MediaUrls && sessionCtx.MediaUrls.length > 0) || + ctx.MediaPath?.trim() || + ctx.MediaUrl?.trim() || + (Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0) || + (Array.isArray(ctx.MediaUrls) && ctx.MediaUrls.length > 0), ); if (!baseBodyTrimmed && !hasMediaAttachment) { await typing.onReplyStart(); @@ -476,7 +483,10 @@ export async function runPreparedReply( // followup runner. When MediaUnderstanding is already populated the runner // knows transcription already succeeded and skips re-application. const hasMediaAttachments = Boolean( - ctx.MediaPath?.trim() || (Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0), + ctx.MediaPath?.trim() || + ctx.MediaUrl?.trim() || + (Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0) || + (Array.isArray(ctx.MediaUrls) && ctx.MediaUrls.length > 0), ); const mediaContext = hasMediaAttachments ? {