fix: rebuild queued followup media prompts

This commit is contained in:
Joey Krug 2026-03-14 15:45:17 -04:00
parent 6edb3b7e34
commit be3eec46e2
2 changed files with 196 additions and 5 deletions

View File

@ -818,6 +818,138 @@ describe("createFollowupRunner media understanding", () => {
expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "Got it!" }));
});
it("strips the full media line when attachment paths or URLs contain brackets", async () => {
  // Deferred (queued) path: attachment paths/URLs that themselves contain
  // brackets must not confuse the synthetic-media-line stripping.
  const transcriptText = "Bracket-safe transcript";
  const transcription = {
    kind: "audio.transcription",
    text: transcriptText,
    attachmentIndex: 0,
    provider: "whisper",
  };
  applyMediaUnderstandingMock.mockImplementationOnce(
    async (params: { ctx: Record<string, unknown> }) => {
      // Simulate a successful audio transcription mutating the media context.
      params.ctx.MediaUnderstanding = [{ ...transcription }];
      params.ctx.Transcript = transcriptText;
      params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}`;
      return {
        outputs: [{ ...transcription }],
        decisions: [],
        appliedImage: false,
        appliedAudio: true,
        appliedVideo: false,
        appliedFile: false,
      };
    },
  );
  runEmbeddedPiAgentMock.mockResolvedValueOnce({
    payloads: [{ text: "done" }],
    meta: {},
  });
  const runner = createFollowupRunner({
    opts: { onBlockReply: vi.fn(async () => {}) },
    typing: createMockTypingController(),
    typingMode: "instant",
    defaultModel: "anthropic/claude-opus-4-5",
  });
  // Both the local path and the URL embed "[0]" to exercise bracket handling.
  await runner(
    createQueuedRun({
      prompt:
        "[media attached: /tmp/voice[0].ogg (audio/ogg) | https://cdn.example.com/files[0].ogg]\nsome text",
      mediaContext: {
        Body: "some text",
        CommandBody: "some text",
        RawBody: "some text",
        MediaPaths: ["/tmp/voice[0].ogg"],
        MediaUrls: ["https://cdn.example.com/files[0].ogg"],
        MediaTypes: ["audio/ogg"],
      },
    }),
  );
  const lastAgentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
    prompt?: string;
  };
  // The rebuilt prompt carries the transcript and drops the entire media line.
  expect(lastAgentCall?.prompt).toContain(transcriptText);
  expect(lastAgentCall?.prompt).not.toContain("[media attached:");
  expect(lastAgentCall?.prompt).not.toContain("files[0].ogg]");
});
it("only strips leading synthetic media lines and preserves literal user text later in the prompt", async () => {
  const transcriptText = "Transcript with literal token";
  // The user's message deliberately contains the "[media attached: …]" token
  // so we can verify only the leading synthetic line is removed.
  const literalBody = "I literally typed [media attached: keep me] in this message.";
  const transcription = {
    kind: "audio.transcription",
    text: transcriptText,
    attachmentIndex: 0,
    provider: "whisper",
  };
  applyMediaUnderstandingMock.mockImplementationOnce(
    async (params: { ctx: Record<string, unknown> }) => {
      params.ctx.MediaUnderstanding = [{ ...transcription }];
      params.ctx.Transcript = transcriptText;
      params.ctx.Body = literalBody;
      return {
        outputs: [{ ...transcription }],
        decisions: [],
        appliedImage: false,
        appliedAudio: true,
        appliedVideo: false,
        appliedFile: false,
      };
    },
  );
  runEmbeddedPiAgentMock.mockResolvedValueOnce({
    payloads: [{ text: "done" }],
    meta: {},
  });
  const runner = createFollowupRunner({
    opts: { onBlockReply: vi.fn(async () => {}) },
    typing: createMockTypingController(),
    typingMode: "instant",
    defaultModel: "anthropic/claude-opus-4-5",
  });
  await runner(
    createQueuedRun({
      prompt: `[media attached: /tmp/voice.ogg (audio/ogg)]\n${literalBody}`,
      mediaContext: {
        Body: literalBody,
        CommandBody: literalBody,
        RawBody: literalBody,
        MediaPaths: ["/tmp/voice.ogg"],
        MediaTypes: ["audio/ogg"],
      },
    }),
  );
  const lastAgentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
    prompt?: string;
  };
  // The literal user text survives; only the leading synthetic line is gone.
  expect(lastAgentCall?.prompt).toContain(literalBody);
  expect(lastAgentCall?.prompt).not.toContain("[media attached: /tmp/voice.ogg (audio/ogg)]");
});
it("skips media understanding when MediaUnderstanding is already populated", async () => {
runEmbeddedPiAgentMock.mockResolvedValueOnce({
payloads: [{ text: "reply" }],
@ -920,6 +1052,54 @@ describe("createFollowupRunner media understanding", () => {
expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "fallback reply" }));
});
it("rebuilds the prompt when file extraction succeeds without media outputs", async () => {
  const fileBlock = '<file name="notes.txt" mime="text/plain">\nline one\n</file>';
  applyMediaUnderstandingMock.mockImplementationOnce(
    async (params: { ctx: Record<string, unknown> }) => {
      // File extraction mutates the body with an inlined <file> block while
      // producing no media-understanding outputs (appliedFile only).
      params.ctx.Body = `some text\n\n${fileBlock}`;
      return {
        outputs: [],
        decisions: [],
        appliedImage: false,
        appliedAudio: false,
        appliedVideo: false,
        appliedFile: true,
      };
    },
  );
  runEmbeddedPiAgentMock.mockResolvedValueOnce({
    payloads: [{ text: "file processed" }],
    meta: {},
  });
  const runner = createFollowupRunner({
    opts: { onBlockReply: vi.fn(async () => {}) },
    typing: createMockTypingController(),
    typingMode: "instant",
    defaultModel: "anthropic/claude-opus-4-5",
  });
  await runner(
    createQueuedRun({
      prompt: "[media attached: /tmp/notes.txt (text/plain)]\nsome text",
      mediaContext: {
        Body: "some text",
        CommandBody: "some text",
        RawBody: "some text",
        MediaPaths: ["/tmp/notes.txt"],
        MediaTypes: ["text/plain"],
      },
    }),
  );
  const lastAgentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
    prompt?: string;
  };
  // The rebuilt prompt keeps the media note, includes the extracted file
  // block exactly once, and never duplicates it.
  expect(lastAgentCall?.prompt).toContain("[media attached: /tmp/notes.txt (text/plain)]");
  expect(lastAgentCall?.prompt).toContain(fileBlock);
  expect(lastAgentCall?.prompt?.match(/<file\b/g)).toHaveLength(1);
});
it("preserves non-audio media lines when only audio is transcribed", async () => {
applyMediaUnderstandingMock.mockImplementationOnce(
async (params: { ctx: Record<string, unknown> }) => {

View File

@ -43,13 +43,16 @@ import type { TypingController } from "./typing.js";
// Body text substituted when a message carries media but no caption.
// NOTE(review): substitution site is outside this chunk — confirm at callers.
const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]";
// Prefix of the hint appended to prompts about replying with images.
const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool";
// Matches an ENTIRE synthetic "[media attached: …]" line: either the
// count-only form ("[media attached: N files]") or a detail form with an
// optional "i/N" index. Anchored with ^…$ and restricted to a single line
// ([^\r\n]*) so literal user text merely containing the token is untouched.
const LEADING_MEDIA_ATTACHED_LINE_RE =
/^(?:\[media attached: \d+ files\]|\[media attached(?: \d+\/\d+)?: [^\r\n]*\])$/;
// Detects an already-inlined "<file" extraction block (case-insensitive),
// used to avoid rebuilding a prompt that already embeds the file contents.
const FILE_BLOCK_RE = /<file\b/i;
function stripLeadingMediaAttachedLines(prompt: string): string {
const lines = prompt.split("\n");
let index = 0;
while (index < lines.length) {
const trimmed = lines[index]?.trim() ?? "";
if (!trimmed.startsWith("[media attached") || !trimmed.endsWith("]")) {
if (!LEADING_MEDIA_ATTACHED_LINE_RE.test(trimmed)) {
break;
}
index += 1;
@ -252,8 +255,14 @@ export function createFollowupRunner(params: {
);
if (hasMedia) {
try {
const mediaCtx = { ...queued.mediaContext } as MsgContext;
const originalBody = mediaCtx.Body;
const mediaCtx = {
...queued.mediaContext,
Body:
queued.mediaContext.CommandBody ??
queued.mediaContext.RawBody ??
queued.mediaContext.Body,
} as MsgContext;
const originalBody = queued.mediaContext.Body;
const muResult = await applyMediaUnderstanding({
ctx: mediaCtx,
cfg: queued.run.config,
@ -263,7 +272,10 @@ export function createFollowupRunner(params: {
model: queued.run.model,
},
});
if (muResult.outputs.length > 0 || muResult.appliedFile) {
const shouldRebuildPrompt =
muResult.outputs.length > 0 ||
(muResult.appliedFile && !FILE_BLOCK_RE.test(queued.prompt));
if (shouldRebuildPrompt) {
// Rebuild the queued prompt from the mutated media context so the
// deferred path matches the primary path's prompt shape.
const newMediaNote = buildInboundMediaNote(mediaCtx);
@ -273,7 +285,6 @@ export function createFollowupRunner(params: {
updatedBody: mediaCtx.Body,
mediaNote: newMediaNote,
});
logVerbose(
`followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage}, video=${muResult.appliedVideo}, file=${muResult.appliedFile})`,
);