Auto-reply: preserve deferred media understanding output
This commit is contained in:
parent
67e90527e1
commit
5e0330db6c
@ -818,6 +818,69 @@ describe("createFollowupRunner media understanding", () => {
|
||||
expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "Got it!" }));
|
||||
});
|
||||
|
||||
// The queued run carries only MediaUrl/MediaUrls (no MediaPath/MediaPaths).
// The mocked media-understanding step writes a transcript into the context,
// and the test checks that the transcript ends up in the agent prompt.
it("applies media understanding for URL-only attachments", async () => {
  const transcriptText = "URL-only transcript";
  const voiceUrl = "https://cdn.example.com/voice.ogg";
  const voiceMime = "audio/ogg";

  applyMediaUnderstandingMock.mockImplementationOnce(
    async ({ ctx }: { ctx: Record<string, unknown> }) => {
      // Mutate the context the same way a successful transcription would.
      ctx.MediaUnderstanding = [
        {
          kind: "audio.transcription",
          text: transcriptText,
          attachmentIndex: 0,
          provider: "whisper",
        },
      ];
      ctx.Transcript = transcriptText;
      ctx.Body = `[Audio]\nUser text:\nsome text\nTranscript:\n${transcriptText}`;

      return {
        outputs: [
          {
            kind: "audio.transcription",
            text: transcriptText,
            attachmentIndex: 0,
            provider: "whisper",
          },
        ],
        decisions: [],
        appliedImage: false,
        appliedAudio: true,
        appliedVideo: false,
        appliedFile: false,
      };
    },
  );
  runEmbeddedPiAgentMock.mockResolvedValueOnce({
    payloads: [{ text: "Got it!" }],
    meta: {},
  });

  const followup = createFollowupRunner({
    opts: { onBlockReply: vi.fn(async () => {}) },
    typing: createMockTypingController(),
    typingMode: "instant",
    defaultModel: "anthropic/claude-opus-4-5",
  });
  const queuedRun = createQueuedRun({
    prompt: "[media attached: https://cdn.example.com/voice.ogg (audio/ogg)]\nsome text",
    mediaContext: {
      Body: "some text",
      MediaUrl: voiceUrl,
      MediaUrls: [voiceUrl],
      MediaType: voiceMime,
      MediaTypes: [voiceMime],
    },
  });

  await followup(queuedRun);

  expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1);
  const lastAgentArgs = runEmbeddedPiAgentMock.mock.calls.at(-1);
  const agentCall = lastAgentArgs?.[0] as {
    prompt?: string;
  };
  expect(agentCall?.prompt).toContain(transcriptText);
});
|
||||
|
||||
it("strips the full media line when attachment paths or URLs contain brackets", async () => {
|
||||
const transcriptText = "Bracket-safe transcript";
|
||||
applyMediaUnderstandingMock.mockImplementationOnce(
|
||||
@ -1329,6 +1392,98 @@ describe("createFollowupRunner media understanding", () => {
|
||||
expect(agentCall?.prompt).not.toContain("/think high summarize this");
|
||||
});
|
||||
|
||||
// The user's own "/think high" directive must not reach the agent prompt,
// while lines inside the extracted <file> block that merely look like
// directives ("/model ...", "/queue ...") must be kept verbatim.
it("preserves directive-like tokens inside extracted media content", async () => {
  const fileBlock =
    '<file name="notes.txt" mime="text/plain">\n/model claude-opus should stay\n/queue followup should stay\n</file>';

  applyMediaUnderstandingMock.mockImplementationOnce(
    async ({ ctx }: { ctx: Record<string, unknown> }) => {
      // File extraction appends the file block after the user's original body.
      ctx.Body = `/think high summarize this\n\n${fileBlock}`;
      return {
        outputs: [],
        decisions: [],
        appliedImage: false,
        appliedAudio: false,
        appliedVideo: false,
        appliedFile: true,
      };
    },
  );
  runEmbeddedPiAgentMock.mockResolvedValueOnce({
    payloads: [{ text: "processed" }],
    meta: {},
  });

  const followup = createFollowupRunner({
    opts: { onBlockReply: vi.fn(async () => {}) },
    typing: createMockTypingController(),
    typingMode: "instant",
    defaultModel: "anthropic/claude-opus-4-5",
  });
  const queuedRun = createQueuedRun({
    prompt: `[media attached: /tmp/notes.txt]\n${MEDIA_REPLY_HINT}\nsummarize this`,
    mediaContext: {
      Body: "/think high summarize this",
      MediaPaths: ["/tmp/notes.txt"],
      MediaTypes: ["text/plain"],
    },
  });

  await followup(queuedRun);

  const lastAgentArgs = runEmbeddedPiAgentMock.mock.calls.at(-1);
  const agentCall = lastAgentArgs?.[0] as {
    prompt?: string;
  };
  const rebuiltPrompt = agentCall?.prompt;
  expect(rebuiltPrompt).toContain("summarize this");
  expect(rebuiltPrompt).not.toContain("/think high summarize this");
  expect(rebuiltPrompt).toContain("/model claude-opus should stay");
  expect(rebuiltPrompt).toContain("/queue followup should stay");
});
|
||||
|
||||
// Image understanding reports appliedImage=true with an empty outputs list and
// replaces ctx.Body with a description; the description must still appear in
// the prompt handed to the agent.
it("rebuilds the prompt when image understanding mutates the body without outputs", async () => {
  const description = "[Image]\nDescription:\na mountain at sunset";

  applyMediaUnderstandingMock.mockImplementationOnce(
    async ({ ctx }: { ctx: Record<string, unknown> }) => {
      ctx.Body = description;
      return {
        outputs: [],
        decisions: [],
        appliedImage: true,
        appliedAudio: false,
        appliedVideo: false,
        appliedFile: false,
      };
    },
  );
  runEmbeddedPiAgentMock.mockResolvedValueOnce({
    payloads: [{ text: "processed" }],
    meta: {},
  });

  const followup = createFollowupRunner({
    opts: { onBlockReply: vi.fn(async () => {}) },
    typing: createMockTypingController(),
    typingMode: "instant",
    defaultModel: "anthropic/claude-opus-4-5",
  });
  const queuedRun = createQueuedRun({
    prompt: "[media attached: /tmp/photo.jpg (image/jpeg)]\nsome text",
    mediaContext: {
      Body: "some text",
      MediaPaths: ["/tmp/photo.jpg"],
      MediaTypes: ["image/jpeg"],
    },
  });

  await followup(queuedRun);

  const lastAgentArgs = runEmbeddedPiAgentMock.mock.calls.at(-1);
  const agentCall = lastAgentArgs?.[0] as {
    prompt?: string;
  };
  expect(agentCall?.prompt).toContain("a mountain at sunset");
});
|
||||
|
||||
it("does not false-positive on user text containing literal '<file' when extracting files", async () => {
|
||||
const fileBlock = '<file name="data.csv" mime="text/csv">\ncol1,col2\n1,2\n</file>';
|
||||
applyMediaUnderstandingMock.mockImplementationOnce(
|
||||
@ -1360,8 +1515,7 @@ describe("createFollowupRunner media understanding", () => {
|
||||
// file extraction results from being embedded in the prompt.
|
||||
await runner(
|
||||
createQueuedRun({
|
||||
prompt:
|
||||
"[media attached: /tmp/data.csv (text/csv)]\ncheck my <file upload please",
|
||||
prompt: "[media attached: /tmp/data.csv (text/csv)]\ncheck my <file upload please",
|
||||
mediaContext: {
|
||||
Body: "check my <file upload please",
|
||||
CommandBody: "check my <file upload please",
|
||||
@ -1380,6 +1534,154 @@ describe("createFollowupRunner media understanding", () => {
|
||||
expect(agentCall?.prompt).toContain("check my <file upload please");
|
||||
});
|
||||
|
||||
// The extracted file mentions `/think high` as literal text. The rebuilt
// prompt must keep that sentence intact alongside the user's request.
it("preserves directive-like text that appears inside extracted file content", async () => {
  const fileBlock =
    '<file name="notes.txt" mime="text/plain">\nRun `/think high` literally in the shell example.\n</file>';

  applyMediaUnderstandingMock.mockImplementationOnce(
    async ({ ctx }: { ctx: Record<string, unknown> }) => {
      ctx.Body = `summarize this\n\n${fileBlock}`;
      return {
        outputs: [],
        decisions: [],
        appliedImage: false,
        appliedAudio: false,
        appliedVideo: false,
        appliedFile: true,
      };
    },
  );
  runEmbeddedPiAgentMock.mockResolvedValueOnce({
    payloads: [{ text: "processed" }],
    meta: {},
  });

  const followup = createFollowupRunner({
    opts: { onBlockReply: vi.fn(async () => {}) },
    typing: createMockTypingController(),
    typingMode: "instant",
    defaultModel: "anthropic/claude-opus-4-5",
  });
  const queuedRun = createQueuedRun({
    prompt: `[media attached: /tmp/notes.txt]\n${MEDIA_REPLY_HINT}\nsummarize this`,
    mediaContext: {
      // Body/RawBody still carry the directive; CommandBody is the stripped form.
      Body: "/think high summarize this",
      CommandBody: "summarize this",
      RawBody: "/think high summarize this",
      MediaPaths: ["/tmp/notes.txt"],
      MediaTypes: ["text/plain"],
    },
  });

  await followup(queuedRun);

  const lastAgentArgs = runEmbeddedPiAgentMock.mock.calls.at(-1);
  const agentCall = lastAgentArgs?.[0] as {
    prompt?: string;
  };
  const rebuiltPrompt = agentCall?.prompt;
  expect(rebuiltPrompt).toContain("summarize this");
  expect(rebuiltPrompt).toContain("Run `/think high` literally in the shell example.");
});
|
||||
|
||||
// Image understanding reports appliedImage=true with an empty outputs list and
// appends an "[Image summary]" section after the user's text. Both the marker
// and the summary sentence must appear in the prompt handed to the agent.
//
// The title is intentionally distinct from the sibling image test in this
// describe block: identical titles make failure reports and name-based test
// filtering ambiguous.
it("rebuilds the prompt when image understanding appends a summary to the body without outputs", async () => {
  applyMediaUnderstandingMock.mockImplementationOnce(
    async (params: { ctx: Record<string, unknown> }) => {
      params.ctx.Body = "some text\n\n[Image summary]\nA whiteboard with action items.";
      return {
        outputs: [],
        decisions: [],
        appliedImage: true,
        appliedAudio: false,
        appliedVideo: false,
        appliedFile: false,
      };
    },
  );
  runEmbeddedPiAgentMock.mockResolvedValueOnce({
    payloads: [{ text: "processed" }],
    meta: {},
  });

  const runner = createFollowupRunner({
    opts: { onBlockReply: vi.fn(async () => {}) },
    typing: createMockTypingController(),
    typingMode: "instant",
    defaultModel: "anthropic/claude-opus-4-5",
  });

  await runner(
    createQueuedRun({
      prompt: "[media attached: /tmp/board.jpg (image/jpeg)]\nsome text",
      mediaContext: {
        Body: "some text",
        CommandBody: "some text",
        RawBody: "some text",
        MediaPaths: ["/tmp/board.jpg"],
        MediaTypes: ["image/jpeg"],
      },
    }),
  );

  const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
    prompt?: string;
  };
  expect(agentCall?.prompt).toContain("[Image summary]");
  expect(agentCall?.prompt).toContain("A whiteboard with action items.");
});
|
||||
|
||||
// Caption-less audio that is referenced only by URL: the queued prompt is the
// media-only placeholder, and the mocked transcription must surface in the
// prompt passed to the agent.
it("applies media understanding for URL-only deferred attachments", async () => {
  const audioUrl = "https://cdn.example.com/audio.ogg";
  const audioMime = "audio/ogg";

  applyMediaUnderstandingMock.mockImplementationOnce(
    async ({ ctx }: { ctx: Record<string, unknown> }) => {
      ctx.Body = "[Audio]\nTranscript:\nremote transcript";
      ctx.Transcript = "remote transcript";
      return {
        outputs: [
          {
            kind: "audio.transcription",
            text: "remote transcript",
            attachmentIndex: 0,
            provider: "whisper",
          },
        ],
        decisions: [],
        appliedImage: false,
        appliedAudio: true,
        appliedVideo: false,
        appliedFile: false,
      };
    },
  );
  runEmbeddedPiAgentMock.mockResolvedValueOnce({
    payloads: [{ text: "processed" }],
    meta: {},
  });

  const followup = createFollowupRunner({
    opts: { onBlockReply: vi.fn(async () => {}) },
    typing: createMockTypingController(),
    typingMode: "instant",
    defaultModel: "anthropic/claude-opus-4-5",
  });
  const queuedRun = createQueuedRun({
    prompt: "[User sent media without caption]",
    mediaContext: {
      Body: "",
      MediaUrl: audioUrl,
      MediaUrls: [audioUrl],
      MediaType: audioMime,
      MediaTypes: [audioMime],
    },
  });

  await followup(queuedRun);

  expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1);
  const lastAgentArgs = runEmbeddedPiAgentMock.mock.calls.at(-1);
  const agentCall = lastAgentArgs?.[0] as {
    prompt?: string;
  };
  expect(agentCall?.prompt).toContain("remote transcript");
});
|
||||
|
||||
it("uses resolved body (CommandBody) as originalBody for accurate prompt replacement", async () => {
|
||||
const fileBlock = '<file name="report.pdf" mime="application/pdf">\nreport content\n</file>';
|
||||
applyMediaUnderstandingMock.mockImplementationOnce(
|
||||
|
||||
@ -43,8 +43,7 @@ import type { TypingController } from "./typing.js";
|
||||
|
||||
const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]";
|
||||
const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool";
|
||||
const LEADING_MEDIA_ATTACHED_LINE_RE =
|
||||
/^(?:\[media attached: \d+ files\]|\[media attached(?: \d+\/\d+)?: [^\r\n]*\])$/;
|
||||
const LEADING_MEDIA_ATTACHED_LINE_RE = /^\[media attached(?: \d+\/\d+)?: [^\r\n]*\]$/;
|
||||
const FILE_BLOCK_RE = /<file\s+name="/i;
|
||||
|
||||
function stripLeadingMediaAttachedLines(prompt: string): string {
|
||||
@ -87,6 +86,28 @@ function stripInlineDirectives(text: string | undefined): string {
|
||||
return parseInlineDirectives(text ?? "").cleaned.trim();
|
||||
}
|
||||
|
||||
/**
 * Computes the body text to use when rebuilding a queued prompt after media
 * understanding has run.
 *
 * Inline directives are stripped only from the caller's original body, never
 * from the updated body as a whole, so text added around the original body is
 * passed through unchanged.
 *
 * @param params.originalBody - Body text before media understanding ran.
 * @param params.updatedBody - Body text after media understanding ran.
 * @returns
 *   - `""` when the updated body is missing or blank;
 *   - the trimmed updated body when there is no original body, or when the
 *     original body is empty after directive stripping;
 *   - the directive-stripped original body when the two trimmed bodies are equal;
 *   - otherwise the result of `replaceLastOccurrence` swapping the original
 *     body for its stripped form, falling back to the updated body when that
 *     helper returns a nullish value (presumably "no match" — confirm against
 *     the helper), trimmed.
 */
function normalizeUpdatedBody(params: { originalBody?: string; updatedBody?: string }): string {
  const updated = params.updatedBody?.trim();
  if (!updated) {
    return "";
  }

  const original = params.originalBody?.trim();
  // Only run directive stripping when there is an original body to strip.
  const directiveFreeOriginal = original ? stripInlineDirectives(original) : "";
  if (!original || !directiveFreeOriginal) {
    return updated;
  }

  if (updated === original) {
    return directiveFreeOriginal;
  }

  const spliced = replaceLastOccurrence(updated, original, directiveFreeOriginal);
  return (spliced ?? updated).trim();
}
|
||||
|
||||
function rebuildQueuedPromptWithMediaUnderstanding(params: {
|
||||
prompt: string;
|
||||
originalBody?: string;
|
||||
@ -98,7 +119,10 @@ function rebuildQueuedPromptWithMediaUnderstanding(params: {
|
||||
stripped = stripLeadingMediaReplyHint(stripped);
|
||||
}
|
||||
|
||||
const updatedBody = stripInlineDirectives(params.updatedBody);
|
||||
const updatedBody = normalizeUpdatedBody({
|
||||
originalBody: params.originalBody,
|
||||
updatedBody: params.updatedBody,
|
||||
});
|
||||
if (!updatedBody) {
|
||||
return [params.mediaNote?.trim(), stripped].filter(Boolean).join("\n").trim();
|
||||
}
|
||||
@ -250,8 +274,11 @@ export function createFollowupRunner(params: {
|
||||
if (queued.mediaContext && !queued.mediaContext.MediaUnderstanding?.length) {
|
||||
const hasMedia = Boolean(
|
||||
queued.mediaContext.MediaPath?.trim() ||
|
||||
queued.mediaContext.MediaUrl?.trim() ||
|
||||
(Array.isArray(queued.mediaContext.MediaPaths) &&
|
||||
queued.mediaContext.MediaPaths.length > 0),
|
||||
queued.mediaContext.MediaPaths.length > 0) ||
|
||||
(Array.isArray(queued.mediaContext.MediaUrls) &&
|
||||
queued.mediaContext.MediaUrls.length > 0),
|
||||
);
|
||||
if (hasMedia) {
|
||||
try {
|
||||
@ -281,6 +308,9 @@ export function createFollowupRunner(params: {
|
||||
});
|
||||
const shouldRebuildPrompt =
|
||||
muResult.outputs.length > 0 ||
|
||||
muResult.appliedAudio ||
|
||||
muResult.appliedImage ||
|
||||
muResult.appliedVideo ||
|
||||
(muResult.appliedFile && !bodyAlreadyHasFileBlock);
|
||||
if (shouldRebuildPrompt) {
|
||||
// Rebuild the queued prompt from the mutated media context so the
|
||||
|
||||
@ -172,6 +172,45 @@ describe("runPreparedReply media-only handling", () => {
|
||||
expect(call?.followupRun.prompt).toContain("[User sent media without caption]");
|
||||
});
|
||||
|
||||
// An inbound message whose attachment is described only by MediaUrl/MediaUrls
// must have those fields (and the media types) copied into the followup run's
// mediaContext.
it("snapshots URL-only attachments into followup mediaContext", async () => {
  const imageUrl = "https://cdn.example.com/input.png";
  const imageMime = "image/png";

  const preparedParams = baseParams({
    ctx: {
      Body: "check this attachment",
      RawBody: "check this attachment",
      CommandBody: "check this attachment",
      ThreadHistoryBody: "Earlier message in this thread",
      OriginatingChannel: "slack",
      OriginatingTo: "C123",
      ChatType: "group",
      MediaUrl: imageUrl,
      MediaUrls: [imageUrl],
      MediaType: imageMime,
      MediaTypes: [imageMime],
    },
    sessionCtx: {
      Body: "check this attachment",
      BodyStripped: "check this attachment",
      ThreadHistoryBody: "Earlier message in this thread",
      Provider: "slack",
      ChatType: "group",
      OriginatingChannel: "slack",
      OriginatingTo: "C123",
    },
  });

  await runPreparedReply(preparedParams);

  const firstAgentArgs = vi.mocked(runReplyAgent).mock.calls[0];
  const call = firstAgentArgs?.[0];
  expect(call?.followupRun.mediaContext).toEqual(
    expect.objectContaining({
      MediaUrl: imageUrl,
      MediaUrls: [imageUrl],
      MediaType: imageMime,
      MediaTypes: [imageMime],
    }),
  );
});
|
||||
|
||||
it("keeps thread history context on follow-up turns", async () => {
|
||||
const result = await runPreparedReply(
|
||||
baseParams({
|
||||
@ -186,6 +225,41 @@ describe("runPreparedReply media-only handling", () => {
|
||||
expect(call?.followupRun.prompt).toContain("Earlier message in this thread");
|
||||
});
|
||||
|
||||
// Empty-bodied message with a URL-only audio attachment: the followup run must
// still receive a mediaContext carrying the attachment URL(s).
it("snapshots mediaContext for URL-only deferred attachments", async () => {
  const audioUrl = "https://cdn.example.com/audio.ogg";
  const audioMime = "audio/ogg";

  const preparedParams = baseParams({
    ctx: {
      Body: "",
      RawBody: "",
      CommandBody: "",
      MediaUrl: audioUrl,
      MediaUrls: [audioUrl],
      MediaType: audioMime,
      MediaTypes: [audioMime],
      ThreadHistoryBody: "Earlier message in this thread",
      OriginatingChannel: "slack",
      OriginatingTo: "C123",
      ChatType: "group",
    },
    sessionCtx: {
      Body: "",
      BodyStripped: "",
      ThreadHistoryBody: "Earlier message in this thread",
      Provider: "slack",
      ChatType: "group",
      OriginatingChannel: "slack",
      OriginatingTo: "C123",
    },
  });

  await runPreparedReply(preparedParams);

  const firstAgentArgs = vi.mocked(runReplyAgent).mock.calls[0];
  const call = firstAgentArgs?.[0];
  const snapshot = call?.followupRun.mediaContext;
  expect(snapshot?.MediaUrl).toBe("https://cdn.example.com/audio.ogg");
  expect(snapshot?.MediaUrls).toEqual([
    "https://cdn.example.com/audio.ogg",
  ]);
});
|
||||
|
||||
it("returns the empty-body reply when there is no text and no media", async () => {
|
||||
const result = await runPreparedReply(
|
||||
baseParams({
|
||||
|
||||
@ -310,7 +310,14 @@ export async function runPreparedReply(
|
||||
: [inboundUserContext, baseBodyFinal].filter(Boolean).join("\n\n");
|
||||
const baseBodyTrimmed = baseBodyForPrompt.trim();
|
||||
const hasMediaAttachment = Boolean(
|
||||
sessionCtx.MediaPath || (sessionCtx.MediaPaths && sessionCtx.MediaPaths.length > 0),
|
||||
sessionCtx.MediaPath ||
|
||||
sessionCtx.MediaUrl ||
|
||||
(sessionCtx.MediaPaths && sessionCtx.MediaPaths.length > 0) ||
|
||||
(sessionCtx.MediaUrls && sessionCtx.MediaUrls.length > 0) ||
|
||||
ctx.MediaPath?.trim() ||
|
||||
ctx.MediaUrl?.trim() ||
|
||||
(Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0) ||
|
||||
(Array.isArray(ctx.MediaUrls) && ctx.MediaUrls.length > 0),
|
||||
);
|
||||
if (!baseBodyTrimmed && !hasMediaAttachment) {
|
||||
await typing.onReplyStart();
|
||||
@ -476,7 +483,10 @@ export async function runPreparedReply(
|
||||
// followup runner. When MediaUnderstanding is already populated the runner
|
||||
// knows transcription already succeeded and skips re-application.
|
||||
const hasMediaAttachments = Boolean(
|
||||
ctx.MediaPath?.trim() || (Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0),
|
||||
ctx.MediaPath?.trim() ||
|
||||
ctx.MediaUrl?.trim() ||
|
||||
(Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0) ||
|
||||
(Array.isArray(ctx.MediaUrls) && ctx.MediaUrls.length > 0),
|
||||
);
|
||||
const mediaContext = hasMediaAttachments
|
||||
? {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user