fix queued media-understanding prompt rebuild

This commit is contained in:
Joey Krug 2026-03-14 15:29:55 -04:00
parent 3bf55561cb
commit 6edb3b7e34
2 changed files with 260 additions and 26 deletions

View File

@ -67,6 +67,9 @@ beforeEach(() => {
// Builds a minimal FollowupRun for a given message provider (WhatsApp by default).
const baseQueuedRun = (messageProvider = "whatsapp"): FollowupRun =>
createMockFollowupRun({ run: { messageProvider } });
// Hint string these tests embed in queued prompts; assertions below check
// whether the rebuilt prompt keeps or drops it. NOTE(review): presumably this
// mirrors the hint the runner itself emits — confirm against the implementation.
const MEDIA_REPLY_HINT =
"To send an image back, prefer the message tool (media/path/filePath). If you must inline, use MEDIA:https://example.com/image.jpg (spaces ok, quote if needed) or a safe relative path like MEDIA:./image.jpg. Avoid absolute paths (MEDIA:/...) and ~ paths — they are blocked for security. Keep caption in the text body.";
function createQueuedRun(
overrides: Partial<Omit<FollowupRun, "run">> & {
run?: Partial<FollowupRun["run"]>;
@ -741,7 +744,7 @@ describe("createFollowupRunner media understanding", () => {
it("applies audio transcription when mediaContext has untranscribed audio", async () => {
const transcriptText = "Hello, this is a voice note.";
// The real applyMediaUnderstanding mutates the ctx; the mock must do the same
// so buildInboundMediaNote sees MediaUnderstanding and suppresses the audio line.
// so buildInboundMediaNote and queued prompt rebuilding see the transcribed body.
applyMediaUnderstandingMock.mockImplementationOnce(
async (params: { ctx: Record<string, unknown> }) => {
params.ctx.MediaUnderstanding = [
@ -753,6 +756,7 @@ describe("createFollowupRunner media understanding", () => {
},
];
params.ctx.Transcript = transcriptText;
params.ctx.Body = `[Audio]\nUser text:\nsome text\nTranscript:\n${transcriptText}`;
return {
outputs: [
{
@ -929,6 +933,7 @@ describe("createFollowupRunner media understanding", () => {
},
];
params.ctx.Transcript = "voice transcript";
params.ctx.Body = "[Audio]\nUser text:\nsome text\nTranscript:\nvoice transcript";
return {
outputs: [
{
@ -980,4 +985,167 @@ describe("createFollowupRunner media understanding", () => {
// The transcript should be present
expect(agentCall?.prompt).toContain("voice transcript");
});
// Regression test: the "[media attached: ...]" line here contains literal "]"
// characters inside the path and URL, so a naive regex like /\[media attached[^\]]*\]/
// would stop at the first "]" and leave residue. The rebuilt prompt must drop the
// whole line, the reply hint, and keep only the transcribed body.
it("strips queued media lines when attachment paths or URLs contain a literal closing bracket", async () => {
const transcriptText = "Bracket-safe transcript";
// Mimic the real applyMediaUnderstanding: mutate ctx in place so the queued
// prompt rebuild sees both the transcription outputs and the rewritten Body.
applyMediaUnderstandingMock.mockImplementationOnce(
async (params: { ctx: Record<string, unknown> }) => {
params.ctx.MediaUnderstanding = [
{
kind: "audio.transcription",
text: transcriptText,
attachmentIndex: 0,
provider: "whisper",
},
];
params.ctx.Transcript = transcriptText;
params.ctx.Body = `[Audio]\nUser text:\nsome text\nTranscript:\n${transcriptText}`;
return {
outputs: [
{
kind: "audio.transcription",
text: transcriptText,
attachmentIndex: 0,
provider: "whisper",
},
],
decisions: [],
appliedImage: false,
appliedAudio: true,
appliedVideo: false,
appliedFile: false,
};
},
);
// Agent reply content is irrelevant here; we only inspect the prompt it received.
runEmbeddedPiAgentMock.mockResolvedValueOnce({
payloads: [{ text: "ok" }],
meta: {},
});
const runner = createFollowupRunner({
opts: { onBlockReply: vi.fn(async () => {}) },
typing: createMockTypingController(),
typingMode: "instant",
defaultModel: "anthropic/claude-opus-4-5",
});
// The queued prompt's media line embeds bracketed path/URL plus the reply hint.
await runner(
createQueuedRun({
prompt:
"[media attached: /tmp/voice[0].ogg (audio/ogg) | https://cdn.example.com/files[0].ogg?sig=abc]123]\n" +
MEDIA_REPLY_HINT +
"\n" +
"some text",
mediaContext: {
Body: "some text",
MediaPaths: ["/tmp/voice[0].ogg"],
MediaUrls: ["https://cdn.example.com/files[0].ogg?sig=abc]123"],
MediaTypes: ["audio/ogg"],
},
}),
);
// Inspect the prompt handed to the embedded agent on the most recent call.
const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
prompt?: string;
};
expect(agentCall?.prompt).toContain(transcriptText);
expect(agentCall?.prompt).not.toContain("/tmp/voice[0].ogg");
expect(agentCall?.prompt).not.toContain("https://cdn.example.com/files[0].ogg?sig=abc]123");
expect(agentCall?.prompt).not.toContain(MEDIA_REPLY_HINT);
});
// File attachments can yield appliedFile=true with an empty outputs array; the
// rebuild must still run so the extracted <file> body reaches the agent and the
// captionless placeholder is replaced. The media line and reply hint stay,
// because the file attachment itself was not consumed.
it("preserves file-only media understanding when outputs are empty", async () => {
applyMediaUnderstandingMock.mockImplementationOnce(
async (params: { ctx: Record<string, unknown> }) => {
// Only the Body is mutated: the file's extracted text, no transcription outputs.
params.ctx.Body =
'<file name="report.pdf" mime="application/pdf">\nQuarterly report body\n</file>';
return {
outputs: [],
decisions: [],
appliedImage: false,
appliedAudio: false,
appliedVideo: false,
appliedFile: true,
};
},
);
runEmbeddedPiAgentMock.mockResolvedValueOnce({
payloads: [{ text: "processed" }],
meta: {},
});
const runner = createFollowupRunner({
opts: { onBlockReply: vi.fn(async () => {}) },
typing: createMockTypingController(),
typingMode: "instant",
defaultModel: "anthropic/claude-opus-4-5",
});
// Media-only message: empty Body, so the prompt carries the placeholder line.
await runner(
createQueuedRun({
prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\n[User sent media without caption]`,
mediaContext: {
Body: "",
MediaPaths: ["/tmp/report.pdf"],
MediaTypes: ["application/pdf"],
},
}),
);
const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
prompt?: string;
};
// The media note is rebuilt with the mime type, the hint survives, the file
// body is present, and the placeholder has been swapped out.
expect(agentCall?.prompt).toContain("[media attached: /tmp/report.pdf (application/pdf)]");
expect(agentCall?.prompt).toContain(MEDIA_REPLY_HINT);
expect(agentCall?.prompt).toContain('<file name="report.pdf" mime="application/pdf">');
expect(agentCall?.prompt).toContain("Quarterly report body");
expect(agentCall?.prompt).not.toContain("[User sent media without caption]");
});
// The queued prompt already had "/think high" stripped ("summarize this"), but
// the stored mediaContext.Body still carries the directive. The rebuild must
// match the directive-stripped caption in the prompt and must not reintroduce
// the raw "/think high ..." text into what the agent sees.
it("replaces the queued body when inline directives were already stripped from the prompt", async () => {
applyMediaUnderstandingMock.mockImplementationOnce(
async (params: { ctx: Record<string, unknown> }) => {
// Updated Body keeps the directive prefix plus the extracted file content.
params.ctx.Body =
'/think high summarize this\n\n<file name="report.pdf" mime="application/pdf">\nreport\n</file>';
return {
outputs: [],
decisions: [],
appliedImage: false,
appliedAudio: false,
appliedVideo: false,
appliedFile: true,
};
},
);
runEmbeddedPiAgentMock.mockResolvedValueOnce({
payloads: [{ text: "processed" }],
meta: {},
});
const runner = createFollowupRunner({
opts: { onBlockReply: vi.fn(async () => {}) },
typing: createMockTypingController(),
typingMode: "instant",
defaultModel: "anthropic/claude-opus-4-5",
});
// Prompt caption is the directive-stripped form; mediaContext.Body is the raw form.
await runner(
createQueuedRun({
prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nsummarize this`,
mediaContext: {
Body: "/think high summarize this",
MediaPaths: ["/tmp/report.pdf"],
MediaTypes: ["application/pdf"],
},
}),
);
const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
prompt?: string;
};
expect(agentCall?.prompt).toContain("summarize this");
expect(agentCall?.prompt).toContain('<file name="report.pdf" mime="application/pdf">');
// Guard against appending the raw body after the caption instead of replacing it.
expect(agentCall?.prompt).not.toContain("summarize this\n\n/think high summarize this");
expect(agentCall?.prompt).not.toContain("/think high summarize this");
});
});

View File

@ -14,7 +14,6 @@ import type { TypingMode } from "../../config/types.js";
import { logVerbose } from "../../globals.js";
import { registerAgentRunContext } from "../../infra/agent-events.js";
import { applyMediaUnderstanding } from "../../media-understanding/apply.js";
import { formatMediaUnderstandingBody } from "../../media-understanding/format.js";
import { defaultRuntime } from "../../runtime.js";
import { isInternalMessageChannel } from "../../utils/message-channel.js";
import { stripHeartbeatToken } from "../heartbeat.js";
@ -23,6 +22,7 @@ import type { MsgContext, OriginatingChannelType } from "../templating.js";
import { isSilentReplyText, SILENT_REPLY_TOKEN } from "../tokens.js";
import type { GetReplyOptions, ReplyPayload } from "../types.js";
import { resolveRunAuthProfile } from "./agent-runner-utils.js";
import { parseInlineDirectives } from "./directive-handling.js";
import {
resolveOriginAccountId,
resolveOriginMessageProvider,
@ -41,6 +41,86 @@ import { incrementRunCompactionCount, persistRunSessionUsage } from "./session-r
import { createTypingSignaler } from "./typing-mode.js";
import type { TypingController } from "./typing.js";
// Placeholder body used when media arrives without a caption; it is one of the
// replacement targets rebuildQueuedPromptWithMediaUnderstanding swaps for the
// media-understanding body.
const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]";
// First-line prefix used to detect (and strip) the media reply hint from a
// queued prompt when no media note remains.
const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool";
/**
 * Removes any run of "[media attached ...]" marker lines from the start of a
 * prompt. A line only counts as a marker when, after trimming, it both starts
 * with "[media attached" and ends with "]" — so bracketed paths/URLs inside the
 * marker are fine, and the scan stops at the first non-marker line.
 */
function stripLeadingMediaAttachedLines(prompt: string): string {
  const rows = prompt.split("\n");
  let keepFrom = 0;
  for (const row of rows) {
    const cleaned = row.trim();
    if (!(cleaned.startsWith("[media attached") && cleaned.endsWith("]"))) {
      break;
    }
    keepFrom += 1;
  }
  return rows.slice(keepFrom).join("\n").trim();
}
/**
 * Drops the media reply hint when it is the very first line of the prompt;
 * otherwise returns the prompt trimmed but untouched.
 */
function stripLeadingMediaReplyHint(prompt: string): string {
  const [firstLine = "", ...rest] = prompt.split("\n");
  return firstLine.startsWith(MEDIA_REPLY_HINT_PREFIX)
    ? rest.join("\n").trim()
    : prompt.trim();
}
/**
 * Replaces the final occurrence of `search` within `value`.
 * Returns undefined when `search` is empty or absent, letting callers tell
 * "nothing replaced" apart from a successful rewrite.
 */
function replaceLastOccurrence(
  value: string,
  search: string,
  replacement: string,
): string | undefined {
  if (search.length === 0) return undefined;
  const at = value.lastIndexOf(search);
  if (at === -1) return undefined;
  const head = value.slice(0, at);
  const tail = value.slice(at + search.length);
  return head + replacement + tail;
}
/**
 * Parses inline directives (e.g. "/think high") out of `text` and returns the
 * cleaned remainder, trimmed. Undefined input is treated as an empty string.
 */
function stripInlineDirectives(text: string | undefined): string {
  const parsed = parseInlineDirectives(text ?? "");
  return parsed.cleaned.trim();
}
/**
 * Rebuilds a queued prompt after media understanding has mutated the message
 * context.
 *
 * Steps:
 * 1. Strip any leading "[media attached ...]" lines; if no fresh media note is
 *    supplied, also strip the leading media reply hint.
 * 2. If media understanding produced no usable body, just re-prefix the note.
 * 3. Otherwise splice the directive-stripped updated body in place of the old
 *    caption — trying the raw original body, its directive-stripped form, then
 *    the captionless placeholder — replacing the LAST occurrence so the user's
 *    trailing caption (not an earlier hint line) is targeted.
 * 4. When nothing matches, append the updated body after a blank line.
 */
function rebuildQueuedPromptWithMediaUnderstanding(params: {
  prompt: string;
  originalBody?: string;
  updatedBody?: string;
  mediaNote?: string;
}): string {
  const note = params.mediaNote?.trim();
  const finish = (text: string): string =>
    [note, text.trim()].filter(Boolean).join("\n").trim();

  let remainder = stripLeadingMediaAttachedLines(params.prompt);
  if (!params.mediaNote) {
    remainder = stripLeadingMediaReplyHint(remainder);
  }

  const cleanedBody = stripInlineDirectives(params.updatedBody);
  if (!cleanedBody) {
    return finish(remainder);
  }

  // Candidate spans to replace, deduplicated while preserving priority order.
  const seen = new Set<string>();
  const targets: string[] = [];
  for (const candidate of [
    params.originalBody?.trim(),
    stripInlineDirectives(params.originalBody),
    MEDIA_ONLY_PLACEHOLDER,
  ]) {
    if (candidate && !seen.has(candidate)) {
      seen.add(candidate);
      targets.push(candidate);
    }
  }

  for (const target of targets) {
    const swapped = replaceLastOccurrence(remainder, target, cleanedBody);
    if (swapped !== undefined) {
      return finish(swapped);
    }
  }

  // No caption matched — fall back to appending the understood body.
  return finish([remainder, cleanedBody].filter(Boolean).join("\n\n"));
}
export function createFollowupRunner(params: {
opts?: GetReplyOptions;
typing: TypingController;
@ -173,6 +253,7 @@ export function createFollowupRunner(params: {
if (hasMedia) {
try {
const mediaCtx = { ...queued.mediaContext } as MsgContext;
const originalBody = mediaCtx.Body;
const muResult = await applyMediaUnderstanding({
ctx: mediaCtx,
cfg: queued.run.config,
@ -182,34 +263,19 @@ export function createFollowupRunner(params: {
model: queued.run.model,
},
});
if (muResult.outputs.length > 0) {
// Rebuild the prompt with media understanding results baked in,
// matching the primary path's formatting.
if (muResult.outputs.length > 0 || muResult.appliedFile) {
// Rebuild the queued prompt from the mutated media context so the
// deferred path matches the primary path's prompt shape.
const newMediaNote = buildInboundMediaNote(mediaCtx);
const transcriptBody = formatMediaUnderstandingBody({
body: undefined,
outputs: muResult.outputs,
queued.prompt = rebuildQueuedPromptWithMediaUnderstanding({
prompt: queued.prompt,
originalBody,
updatedBody: mediaCtx.Body,
mediaNote: newMediaNote,
});
// Strip existing [media attached ...] lines from the prompt so
// they can be replaced by the updated media note (which excludes
// successfully-understood attachments like transcribed audio).
const stripped = queued.prompt
.replace(/\[media attached: \d+ files\]\n?/g, "")
.replace(/\[media attached[^\]]*\]\n?/g, "");
const parts: string[] = [];
if (newMediaNote) {
parts.push(newMediaNote);
}
if (transcriptBody) {
parts.push(transcriptBody);
}
parts.push(stripped.trim());
queued.prompt = parts.filter(Boolean).join("\n\n");
logVerbose(
`followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage})`,
`followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage}, video=${muResult.appliedVideo}, file=${muResult.appliedFile})`,
);
}
} catch (err) {