Auto-reply: preserve deferred media understanding output

This commit is contained in:
Joey Krug 2026-03-14 17:53:40 -04:00
parent 67e90527e1
commit 5e0330db6c
4 changed files with 424 additions and 8 deletions

View File

@@ -818,6 +818,69 @@ describe("createFollowupRunner media understanding", () => {
expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "Got it!" }));
});
it("applies media understanding for URL-only attachments", async () => {
const transcriptText = "URL-only transcript";
applyMediaUnderstandingMock.mockImplementationOnce(
async (params: { ctx: Record<string, unknown> }) => {
params.ctx.MediaUnderstanding = [
{
kind: "audio.transcription",
text: transcriptText,
attachmentIndex: 0,
provider: "whisper",
},
];
params.ctx.Transcript = transcriptText;
params.ctx.Body = `[Audio]\nUser text:\nsome text\nTranscript:\n${transcriptText}`;
return {
outputs: [
{
kind: "audio.transcription",
text: transcriptText,
attachmentIndex: 0,
provider: "whisper",
},
],
decisions: [],
appliedImage: false,
appliedAudio: true,
appliedVideo: false,
appliedFile: false,
};
},
);
runEmbeddedPiAgentMock.mockResolvedValueOnce({
payloads: [{ text: "Got it!" }],
meta: {},
});
const runner = createFollowupRunner({
opts: { onBlockReply: vi.fn(async () => {}) },
typing: createMockTypingController(),
typingMode: "instant",
defaultModel: "anthropic/claude-opus-4-5",
});
await runner(
createQueuedRun({
prompt: "[media attached: https://cdn.example.com/voice.ogg (audio/ogg)]\nsome text",
mediaContext: {
Body: "some text",
MediaUrl: "https://cdn.example.com/voice.ogg",
MediaUrls: ["https://cdn.example.com/voice.ogg"],
MediaType: "audio/ogg",
MediaTypes: ["audio/ogg"],
},
}),
);
expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1);
const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
prompt?: string;
};
expect(agentCall?.prompt).toContain(transcriptText);
});
it("strips the full media line when attachment paths or URLs contain brackets", async () => {
const transcriptText = "Bracket-safe transcript";
applyMediaUnderstandingMock.mockImplementationOnce(
@@ -1329,6 +1392,98 @@ describe("createFollowupRunner media understanding", () => {
expect(agentCall?.prompt).not.toContain("/think high summarize this");
});
it("preserves directive-like tokens inside extracted media content", async () => {
const fileBlock =
'<file name="notes.txt" mime="text/plain">\n/model claude-opus should stay\n/queue followup should stay\n</file>';
applyMediaUnderstandingMock.mockImplementationOnce(
async (params: { ctx: Record<string, unknown> }) => {
params.ctx.Body = `/think high summarize this\n\n${fileBlock}`;
return {
outputs: [],
decisions: [],
appliedImage: false,
appliedAudio: false,
appliedVideo: false,
appliedFile: true,
};
},
);
runEmbeddedPiAgentMock.mockResolvedValueOnce({
payloads: [{ text: "processed" }],
meta: {},
});
const runner = createFollowupRunner({
opts: { onBlockReply: vi.fn(async () => {}) },
typing: createMockTypingController(),
typingMode: "instant",
defaultModel: "anthropic/claude-opus-4-5",
});
await runner(
createQueuedRun({
prompt: `[media attached: /tmp/notes.txt]\n${MEDIA_REPLY_HINT}\nsummarize this`,
mediaContext: {
Body: "/think high summarize this",
MediaPaths: ["/tmp/notes.txt"],
MediaTypes: ["text/plain"],
},
}),
);
const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
prompt?: string;
};
expect(agentCall?.prompt).toContain("summarize this");
expect(agentCall?.prompt).not.toContain("/think high summarize this");
expect(agentCall?.prompt).toContain("/model claude-opus should stay");
expect(agentCall?.prompt).toContain("/queue followup should stay");
});
it("rebuilds the prompt when image understanding mutates the body without outputs", async () => {
const description = "[Image]\nDescription:\na mountain at sunset";
applyMediaUnderstandingMock.mockImplementationOnce(
async (params: { ctx: Record<string, unknown> }) => {
params.ctx.Body = description;
return {
outputs: [],
decisions: [],
appliedImage: true,
appliedAudio: false,
appliedVideo: false,
appliedFile: false,
};
},
);
runEmbeddedPiAgentMock.mockResolvedValueOnce({
payloads: [{ text: "processed" }],
meta: {},
});
const runner = createFollowupRunner({
opts: { onBlockReply: vi.fn(async () => {}) },
typing: createMockTypingController(),
typingMode: "instant",
defaultModel: "anthropic/claude-opus-4-5",
});
await runner(
createQueuedRun({
prompt: "[media attached: /tmp/photo.jpg (image/jpeg)]\nsome text",
mediaContext: {
Body: "some text",
MediaPaths: ["/tmp/photo.jpg"],
MediaTypes: ["image/jpeg"],
},
}),
);
const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
prompt?: string;
};
expect(agentCall?.prompt).toContain("a mountain at sunset");
});
it("does not false-positive on user text containing literal '<file' when extracting files", async () => {
const fileBlock = '<file name="data.csv" mime="text/csv">\ncol1,col2\n1,2\n</file>';
applyMediaUnderstandingMock.mockImplementationOnce(
@@ -1360,8 +1515,7 @@ describe("createFollowupRunner media understanding", () => {
// file extraction results from being embedded in the prompt.
await runner(
createQueuedRun({
prompt:
"[media attached: /tmp/data.csv (text/csv)]\ncheck my <file upload please",
prompt: "[media attached: /tmp/data.csv (text/csv)]\ncheck my <file upload please",
mediaContext: {
Body: "check my <file upload please",
CommandBody: "check my <file upload please",
@@ -1380,6 +1534,154 @@ describe("createFollowupRunner media understanding", () => {
expect(agentCall?.prompt).toContain("check my <file upload please");
});
it("preserves directive-like text that appears inside extracted file content", async () => {
const fileBlock =
'<file name="notes.txt" mime="text/plain">\nRun `/think high` literally in the shell example.\n</file>';
applyMediaUnderstandingMock.mockImplementationOnce(
async (params: { ctx: Record<string, unknown> }) => {
params.ctx.Body = `summarize this\n\n${fileBlock}`;
return {
outputs: [],
decisions: [],
appliedImage: false,
appliedAudio: false,
appliedVideo: false,
appliedFile: true,
};
},
);
runEmbeddedPiAgentMock.mockResolvedValueOnce({
payloads: [{ text: "processed" }],
meta: {},
});
const runner = createFollowupRunner({
opts: { onBlockReply: vi.fn(async () => {}) },
typing: createMockTypingController(),
typingMode: "instant",
defaultModel: "anthropic/claude-opus-4-5",
});
await runner(
createQueuedRun({
prompt: `[media attached: /tmp/notes.txt]\n${MEDIA_REPLY_HINT}\nsummarize this`,
mediaContext: {
Body: "/think high summarize this",
CommandBody: "summarize this",
RawBody: "/think high summarize this",
MediaPaths: ["/tmp/notes.txt"],
MediaTypes: ["text/plain"],
},
}),
);
const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
prompt?: string;
};
expect(agentCall?.prompt).toContain("summarize this");
expect(agentCall?.prompt).toContain("Run `/think high` literally in the shell example.");
});
it("rebuilds the prompt when image understanding mutates the body without outputs", async () => {
applyMediaUnderstandingMock.mockImplementationOnce(
async (params: { ctx: Record<string, unknown> }) => {
params.ctx.Body = "some text\n\n[Image summary]\nA whiteboard with action items.";
return {
outputs: [],
decisions: [],
appliedImage: true,
appliedAudio: false,
appliedVideo: false,
appliedFile: false,
};
},
);
runEmbeddedPiAgentMock.mockResolvedValueOnce({
payloads: [{ text: "processed" }],
meta: {},
});
const runner = createFollowupRunner({
opts: { onBlockReply: vi.fn(async () => {}) },
typing: createMockTypingController(),
typingMode: "instant",
defaultModel: "anthropic/claude-opus-4-5",
});
await runner(
createQueuedRun({
prompt: "[media attached: /tmp/board.jpg (image/jpeg)]\nsome text",
mediaContext: {
Body: "some text",
CommandBody: "some text",
RawBody: "some text",
MediaPaths: ["/tmp/board.jpg"],
MediaTypes: ["image/jpeg"],
},
}),
);
const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
prompt?: string;
};
expect(agentCall?.prompt).toContain("[Image summary]");
expect(agentCall?.prompt).toContain("A whiteboard with action items.");
});
it("applies media understanding for URL-only deferred attachments", async () => {
applyMediaUnderstandingMock.mockImplementationOnce(
async (params: { ctx: Record<string, unknown> }) => {
params.ctx.Body = "[Audio]\nTranscript:\nremote transcript";
params.ctx.Transcript = "remote transcript";
return {
outputs: [
{
kind: "audio.transcription",
text: "remote transcript",
attachmentIndex: 0,
provider: "whisper",
},
],
decisions: [],
appliedImage: false,
appliedAudio: true,
appliedVideo: false,
appliedFile: false,
};
},
);
runEmbeddedPiAgentMock.mockResolvedValueOnce({
payloads: [{ text: "processed" }],
meta: {},
});
const runner = createFollowupRunner({
opts: { onBlockReply: vi.fn(async () => {}) },
typing: createMockTypingController(),
typingMode: "instant",
defaultModel: "anthropic/claude-opus-4-5",
});
await runner(
createQueuedRun({
prompt: "[User sent media without caption]",
mediaContext: {
Body: "",
MediaUrl: "https://cdn.example.com/audio.ogg",
MediaUrls: ["https://cdn.example.com/audio.ogg"],
MediaType: "audio/ogg",
MediaTypes: ["audio/ogg"],
},
}),
);
expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1);
const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
prompt?: string;
};
expect(agentCall?.prompt).toContain("remote transcript");
});
it("uses resolved body (CommandBody) as originalBody for accurate prompt replacement", async () => {
const fileBlock = '<file name="report.pdf" mime="application/pdf">\nreport content\n</file>';
applyMediaUnderstandingMock.mockImplementationOnce(

View File

@@ -43,8 +43,7 @@ import type { TypingController } from "./typing.js";
const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]";
const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool";
const LEADING_MEDIA_ATTACHED_LINE_RE =
/^(?:\[media attached: \d+ files\]|\[media attached(?: \d+\/\d+)?: [^\r\n]*\])$/;
const LEADING_MEDIA_ATTACHED_LINE_RE = /^\[media attached(?: \d+\/\d+)?: [^\r\n]*\]$/;
const FILE_BLOCK_RE = /<file\s+name="/i;
function stripLeadingMediaAttachedLines(prompt: string): string {
@@ -87,6 +86,28 @@ function stripInlineDirectives(text: string | undefined): string {
return parseInlineDirectives(text ?? "").cleaned.trim();
}
// Normalizes a media-understanding-mutated body for prompt rebuilding:
// inline directives (e.g. "/think high") are stripped from the portion that
// was the ORIGINAL user text, while the rest of the updated body — typically
// extracted media content appended around it — is left untouched.
// Returns "" when there is no meaningful updated body.
function normalizeUpdatedBody(params: { originalBody?: string; updatedBody?: string }): string {
const updatedBody = params.updatedBody?.trim();
if (!updatedBody) {
return "";
}
const originalBody = params.originalBody?.trim();
if (!originalBody) {
// No original to locate — nothing to selectively strip.
return updatedBody;
}
const cleanedOriginalBody = stripInlineDirectives(originalBody);
if (!cleanedOriginalBody) {
// Original was directives only; keep the updated body as-is.
return updatedBody;
}
if (updatedBody === originalBody) {
// Understanding did not change the body: just return the cleaned text.
return cleanedOriginalBody;
}
// Replace only one occurrence of the original text inside the updated body
// (the last one — presumably to avoid touching earlier extracted content
// that happens to repeat the user text; TODO confirm intent). Falls back to
// the updated body unchanged when the original no longer appears in it.
return (
replaceLastOccurrence(updatedBody, originalBody, cleanedOriginalBody) ?? updatedBody
).trim();
}
function rebuildQueuedPromptWithMediaUnderstanding(params: {
prompt: string;
originalBody?: string;
@@ -98,7 +119,10 @@ function rebuildQueuedPromptWithMediaUnderstanding(params: {
stripped = stripLeadingMediaReplyHint(stripped);
}
const updatedBody = stripInlineDirectives(params.updatedBody);
const updatedBody = normalizeUpdatedBody({
originalBody: params.originalBody,
updatedBody: params.updatedBody,
});
if (!updatedBody) {
return [params.mediaNote?.trim(), stripped].filter(Boolean).join("\n").trim();
}
@@ -250,8 +274,11 @@ export function createFollowupRunner(params: {
if (queued.mediaContext && !queued.mediaContext.MediaUnderstanding?.length) {
const hasMedia = Boolean(
queued.mediaContext.MediaPath?.trim() ||
queued.mediaContext.MediaUrl?.trim() ||
(Array.isArray(queued.mediaContext.MediaPaths) &&
queued.mediaContext.MediaPaths.length > 0),
queued.mediaContext.MediaPaths.length > 0) ||
(Array.isArray(queued.mediaContext.MediaUrls) &&
queued.mediaContext.MediaUrls.length > 0),
);
if (hasMedia) {
try {
@@ -281,6 +308,9 @@ export function createFollowupRunner(params: {
});
const shouldRebuildPrompt =
muResult.outputs.length > 0 ||
muResult.appliedAudio ||
muResult.appliedImage ||
muResult.appliedVideo ||
(muResult.appliedFile && !bodyAlreadyHasFileBlock);
if (shouldRebuildPrompt) {
// Rebuild the queued prompt from the mutated media context so the

View File

@@ -172,6 +172,45 @@ describe("runPreparedReply media-only handling", () => {
expect(call?.followupRun.prompt).toContain("[User sent media without caption]");
});
it("snapshots URL-only attachments into followup mediaContext", async () => {
await runPreparedReply(
baseParams({
ctx: {
Body: "check this attachment",
RawBody: "check this attachment",
CommandBody: "check this attachment",
ThreadHistoryBody: "Earlier message in this thread",
OriginatingChannel: "slack",
OriginatingTo: "C123",
ChatType: "group",
MediaUrl: "https://cdn.example.com/input.png",
MediaUrls: ["https://cdn.example.com/input.png"],
MediaType: "image/png",
MediaTypes: ["image/png"],
},
sessionCtx: {
Body: "check this attachment",
BodyStripped: "check this attachment",
ThreadHistoryBody: "Earlier message in this thread",
Provider: "slack",
ChatType: "group",
OriginatingChannel: "slack",
OriginatingTo: "C123",
},
}),
);
const call = vi.mocked(runReplyAgent).mock.calls[0]?.[0];
expect(call?.followupRun.mediaContext).toEqual(
expect.objectContaining({
MediaUrl: "https://cdn.example.com/input.png",
MediaUrls: ["https://cdn.example.com/input.png"],
MediaType: "image/png",
MediaTypes: ["image/png"],
}),
);
});
it("keeps thread history context on follow-up turns", async () => {
const result = await runPreparedReply(
baseParams({
@@ -186,6 +225,41 @@ describe("runPreparedReply media-only handling", () => {
expect(call?.followupRun.prompt).toContain("Earlier message in this thread");
});
it("snapshots mediaContext for URL-only deferred attachments", async () => {
await runPreparedReply(
baseParams({
ctx: {
Body: "",
RawBody: "",
CommandBody: "",
MediaUrl: "https://cdn.example.com/audio.ogg",
MediaUrls: ["https://cdn.example.com/audio.ogg"],
MediaType: "audio/ogg",
MediaTypes: ["audio/ogg"],
ThreadHistoryBody: "Earlier message in this thread",
OriginatingChannel: "slack",
OriginatingTo: "C123",
ChatType: "group",
},
sessionCtx: {
Body: "",
BodyStripped: "",
ThreadHistoryBody: "Earlier message in this thread",
Provider: "slack",
ChatType: "group",
OriginatingChannel: "slack",
OriginatingTo: "C123",
},
}),
);
const call = vi.mocked(runReplyAgent).mock.calls[0]?.[0];
expect(call?.followupRun.mediaContext?.MediaUrl).toBe("https://cdn.example.com/audio.ogg");
expect(call?.followupRun.mediaContext?.MediaUrls).toEqual([
"https://cdn.example.com/audio.ogg",
]);
});
it("returns the empty-body reply when there is no text and no media", async () => {
const result = await runPreparedReply(
baseParams({

View File

@@ -310,7 +310,14 @@ export async function runPreparedReply(
: [inboundUserContext, baseBodyFinal].filter(Boolean).join("\n\n");
const baseBodyTrimmed = baseBodyForPrompt.trim();
const hasMediaAttachment = Boolean(
sessionCtx.MediaPath || (sessionCtx.MediaPaths && sessionCtx.MediaPaths.length > 0),
sessionCtx.MediaPath ||
sessionCtx.MediaUrl ||
(sessionCtx.MediaPaths && sessionCtx.MediaPaths.length > 0) ||
(sessionCtx.MediaUrls && sessionCtx.MediaUrls.length > 0) ||
ctx.MediaPath?.trim() ||
ctx.MediaUrl?.trim() ||
(Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0) ||
(Array.isArray(ctx.MediaUrls) && ctx.MediaUrls.length > 0),
);
if (!baseBodyTrimmed && !hasMediaAttachment) {
await typing.onReplyStart();
@@ -476,7 +483,10 @@ export async function runPreparedReply(
// followup runner. When MediaUnderstanding is already populated the runner
// knows transcription already succeeded and skips re-application.
const hasMediaAttachments = Boolean(
ctx.MediaPath?.trim() || (Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0),
ctx.MediaPath?.trim() ||
ctx.MediaUrl?.trim() ||
(Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0) ||
(Array.isArray(ctx.MediaUrls) && ctx.MediaUrls.length > 0),
);
const mediaContext = hasMediaAttachments
? {