From 6edb3b7e345573df8fe92485bd6d45e604de29b9 Mon Sep 17 00:00:00 2001 From: Joey Krug Date: Sat, 14 Mar 2026 15:29:55 -0400 Subject: [PATCH] fix queued media-understanding prompt rebuild --- src/auto-reply/reply/followup-runner.test.ts | 170 ++++++++++++++++++- src/auto-reply/reply/followup-runner.ts | 116 ++++++++++--- 2 files changed, 260 insertions(+), 26 deletions(-) diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts index f0a060af4ac..71451526400 100644 --- a/src/auto-reply/reply/followup-runner.test.ts +++ b/src/auto-reply/reply/followup-runner.test.ts @@ -67,6 +67,9 @@ beforeEach(() => { const baseQueuedRun = (messageProvider = "whatsapp"): FollowupRun => createMockFollowupRun({ run: { messageProvider } }); +const MEDIA_REPLY_HINT = + "To send an image back, prefer the message tool (media/path/filePath). If you must inline, use MEDIA:https://example.com/image.jpg (spaces ok, quote if needed) or a safe relative path like MEDIA:./image.jpg. Avoid absolute paths (MEDIA:/...) and ~ paths — they are blocked for security. Keep caption in the text body."; + function createQueuedRun( overrides: Partial> & { run?: Partial; @@ -741,7 +744,7 @@ describe("createFollowupRunner media understanding", () => { it("applies audio transcription when mediaContext has untranscribed audio", async () => { const transcriptText = "Hello, this is a voice note."; // The real applyMediaUnderstanding mutates the ctx; the mock must do the same - // so buildInboundMediaNote sees MediaUnderstanding and suppresses the audio line. + // so buildInboundMediaNote and queued prompt rebuilding see the transcribed body. applyMediaUnderstandingMock.mockImplementationOnce( async (params: { ctx: Record }) => { params.ctx.MediaUnderstanding = [ @@ -753,6 +756,7 @@ describe("createFollowupRunner media understanding", () => { }, ]; params.ctx.Transcript = transcriptText; + params.ctx.Body = `[Audio]\nUser text:\nsome text\nTranscript:\n${transcriptText}`; return { outputs: [ { @@ -929,6 +933,7 @@ describe("createFollowupRunner media understanding", () => { }, ]; params.ctx.Transcript = "voice transcript"; + params.ctx.Body = "[Audio]\nUser text:\nsome text\nTranscript:\nvoice transcript"; return { outputs: [ { @@ -980,4 +985,167 @@ describe("createFollowupRunner media understanding", () => { // The transcript should be present expect(agentCall?.prompt).toContain("voice transcript"); }); + + it("strips queued media lines when attachment paths or URLs contain a literal closing bracket", async () => { + const transcriptText = "Bracket-safe transcript"; + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = transcriptText; + params.ctx.Body = `[Audio]\nUser text:\nsome text\nTranscript:\n${transcriptText}`; + return { + outputs: [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "ok" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: + "[media attached: /tmp/voice[0].ogg (audio/ogg) | https://cdn.example.com/files[0].ogg?sig=abc]123]\n" + + MEDIA_REPLY_HINT + + "\n" + + "some text", + mediaContext: { + Body: "some text", + MediaPaths: ["/tmp/voice[0].ogg"], + MediaUrls: ["https://cdn.example.com/files[0].ogg?sig=abc]123"], + MediaTypes: ["audio/ogg"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain(transcriptText); + expect(agentCall?.prompt).not.toContain("/tmp/voice[0].ogg"); + expect(agentCall?.prompt).not.toContain("https://cdn.example.com/files[0].ogg?sig=abc]123"); + expect(agentCall?.prompt).not.toContain(MEDIA_REPLY_HINT); + }); + + it("preserves file-only media understanding when outputs are empty", async () => { + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.Body = + '\nQuarterly report body\n'; + return { + outputs: [], + decisions: [], + appliedImage: false, + appliedAudio: false, + appliedVideo: false, + appliedFile: true, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\n[User sent media without caption]`, + mediaContext: { + Body: "", + MediaPaths: ["/tmp/report.pdf"], + MediaTypes: ["application/pdf"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("[media attached: /tmp/report.pdf (application/pdf)]"); + expect(agentCall?.prompt).toContain(MEDIA_REPLY_HINT); + expect(agentCall?.prompt).toContain(''); + expect(agentCall?.prompt).toContain("Quarterly report body"); + expect(agentCall?.prompt).not.toContain("[User sent media without caption]"); + }); + + it("replaces the queued body when inline directives were already stripped from the prompt", async () => { + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.Body = + '/think high summarize this\n\n\nreport\n'; + return { + outputs: [], + decisions: [], + appliedImage: false, + appliedAudio: false, + appliedVideo: false, + appliedFile: true, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nsummarize this`, + mediaContext: { + Body: "/think high summarize this", + MediaPaths: ["/tmp/report.pdf"], + MediaTypes: ["application/pdf"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("summarize this"); + expect(agentCall?.prompt).toContain(''); + expect(agentCall?.prompt).not.toContain("summarize this\n\n/think high summarize this"); + expect(agentCall?.prompt).not.toContain("/think high summarize this"); + }); }); diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts index 1e65380a020..ab2169fc09e 100644 --- a/src/auto-reply/reply/followup-runner.ts +++ b/src/auto-reply/reply/followup-runner.ts @@ -14,7 +14,6 @@ import type { TypingMode } from "../../config/types.js"; import { logVerbose } from "../../globals.js"; import { registerAgentRunContext } from "../../infra/agent-events.js"; import { applyMediaUnderstanding } from "../../media-understanding/apply.js"; -import { formatMediaUnderstandingBody } from "../../media-understanding/format.js"; import { defaultRuntime } from "../../runtime.js"; import { isInternalMessageChannel } from "../../utils/message-channel.js"; import { stripHeartbeatToken } from "../heartbeat.js"; @@ -23,6 +22,7 @@ import type { MsgContext, OriginatingChannelType } from "../templating.js"; import { isSilentReplyText, SILENT_REPLY_TOKEN } from "../tokens.js"; import type { GetReplyOptions, ReplyPayload } from "../types.js"; import { resolveRunAuthProfile } from "./agent-runner-utils.js"; +import { parseInlineDirectives } from "./directive-handling.js"; import { resolveOriginAccountId, resolveOriginMessageProvider, @@ -41,6 +41,86 @@ import { incrementRunCompactionCount, persistRunSessionUsage } from "./session-r import { createTypingSignaler } from "./typing-mode.js"; import type { TypingController } from "./typing.js"; +const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]"; +const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool"; + +function stripLeadingMediaAttachedLines(prompt: string): string { + const lines = prompt.split("\n"); + let index = 0; + while (index < lines.length) { + const trimmed = lines[index]?.trim() ?? ""; + if (!trimmed.startsWith("[media attached") || !trimmed.endsWith("]")) { + break; + } + index += 1; + } + return lines.slice(index).join("\n").trim(); +} + +function stripLeadingMediaReplyHint(prompt: string): string { + const lines = prompt.split("\n"); + if ((lines[0] ?? "").startsWith(MEDIA_REPLY_HINT_PREFIX)) { + return lines.slice(1).join("\n").trim(); + } + return prompt.trim(); +} + +function replaceLastOccurrence( + value: string, + search: string, + replacement: string, +): string | undefined { + if (!search) { + return undefined; + } + const index = value.lastIndexOf(search); + if (index < 0) { + return undefined; + } + return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`; +} + +function stripInlineDirectives(text: string | undefined): string { + return parseInlineDirectives(text ?? "").cleaned.trim(); +} + +function rebuildQueuedPromptWithMediaUnderstanding(params: { + prompt: string; + originalBody?: string; + updatedBody?: string; + mediaNote?: string; +}): string { + let stripped = stripLeadingMediaAttachedLines(params.prompt); + if (!params.mediaNote) { + stripped = stripLeadingMediaReplyHint(stripped); + } + + const updatedBody = stripInlineDirectives(params.updatedBody); + if (!updatedBody) { + return [params.mediaNote?.trim(), stripped].filter(Boolean).join("\n").trim(); + } + + const replacementTargets = [ + params.originalBody?.trim(), + stripInlineDirectives(params.originalBody), + MEDIA_ONLY_PLACEHOLDER, + ].filter( + (value, index, list): value is string => Boolean(value) && list.indexOf(value) === index, + ); + + let rebuilt = stripped; + for (const target of replacementTargets) { + const replaced = replaceLastOccurrence(rebuilt, target, updatedBody); + if (replaced !== undefined) { + rebuilt = replaced; + return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim(); + } + } + + rebuilt = [rebuilt, updatedBody].filter(Boolean).join("\n\n"); + return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim(); +} + export function createFollowupRunner(params: { opts?: GetReplyOptions; typing: TypingController; @@ -173,6 +253,7 @@ export function createFollowupRunner(params: { if (hasMedia) { try { const mediaCtx = { ...queued.mediaContext } as MsgContext; + const originalBody = mediaCtx.Body; const muResult = await applyMediaUnderstanding({ ctx: mediaCtx, cfg: queued.run.config, @@ -182,34 +263,19 @@ export function createFollowupRunner(params: { model: queued.run.model, }, }); - if (muResult.outputs.length > 0) { - // Rebuild the prompt with media understanding results baked in, - // matching the primary path's formatting. + if (muResult.outputs.length > 0 || muResult.appliedFile) { + // Rebuild the queued prompt from the mutated media context so the + // deferred path matches the primary path's prompt shape. const newMediaNote = buildInboundMediaNote(mediaCtx); - const transcriptBody = formatMediaUnderstandingBody({ - body: undefined, - outputs: muResult.outputs, + queued.prompt = rebuildQueuedPromptWithMediaUnderstanding({ + prompt: queued.prompt, + originalBody, + updatedBody: mediaCtx.Body, + mediaNote: newMediaNote, }); - // Strip existing [media attached ...] lines from the prompt so - // they can be replaced by the updated media note (which excludes - // successfully-understood attachments like transcribed audio). - const stripped = queued.prompt - .replace(/\[media attached: \d+ files\]\n?/g, "") - .replace(/\[media attached[^\]]*\]\n?/g, ""); - - const parts: string[] = []; - if (newMediaNote) { - parts.push(newMediaNote); - } - if (transcriptBody) { - parts.push(transcriptBody); - } - parts.push(stripped.trim()); - queued.prompt = parts.filter(Boolean).join("\n\n"); - logVerbose( - `followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage})`, + `followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage}, video=${muResult.appliedVideo}, file=${muResult.appliedFile})`, ); } } catch (err) {