fix queued media-understanding prompt rebuild
This commit is contained in:
parent
3bf55561cb
commit
6edb3b7e34
@ -67,6 +67,9 @@ beforeEach(() => {
|
||||
// Builds a minimal queued follow-up run for a given message provider
// (defaults to WhatsApp) on top of the shared mock factory.
const baseQueuedRun = (messageProvider = "whatsapp"): FollowupRun =>
  createMockFollowupRun({ run: { messageProvider } });

// Mirror of the hint line the runner appends after "[media attached ...]"
// lines; tests assert it is preserved or stripped during prompt rebuilds.
const MEDIA_REPLY_HINT =
  "To send an image back, prefer the message tool (media/path/filePath). If you must inline, use MEDIA:https://example.com/image.jpg (spaces ok, quote if needed) or a safe relative path like MEDIA:./image.jpg. Avoid absolute paths (MEDIA:/...) and ~ paths — they are blocked for security. Keep caption in the text body.";
|
||||
|
||||
function createQueuedRun(
|
||||
overrides: Partial<Omit<FollowupRun, "run">> & {
|
||||
run?: Partial<FollowupRun["run"]>;
|
||||
@ -741,7 +744,7 @@ describe("createFollowupRunner media understanding", () => {
|
||||
it("applies audio transcription when mediaContext has untranscribed audio", async () => {
|
||||
const transcriptText = "Hello, this is a voice note.";
|
||||
// The real applyMediaUnderstanding mutates the ctx; the mock must do the same
|
||||
// so buildInboundMediaNote sees MediaUnderstanding and suppresses the audio line.
|
||||
// so buildInboundMediaNote and queued prompt rebuilding see the transcribed body.
|
||||
applyMediaUnderstandingMock.mockImplementationOnce(
|
||||
async (params: { ctx: Record<string, unknown> }) => {
|
||||
params.ctx.MediaUnderstanding = [
|
||||
@ -753,6 +756,7 @@ describe("createFollowupRunner media understanding", () => {
|
||||
},
|
||||
];
|
||||
params.ctx.Transcript = transcriptText;
|
||||
params.ctx.Body = `[Audio]\nUser text:\nsome text\nTranscript:\n${transcriptText}`;
|
||||
return {
|
||||
outputs: [
|
||||
{
|
||||
@ -929,6 +933,7 @@ describe("createFollowupRunner media understanding", () => {
|
||||
},
|
||||
];
|
||||
params.ctx.Transcript = "voice transcript";
|
||||
params.ctx.Body = "[Audio]\nUser text:\nsome text\nTranscript:\nvoice transcript";
|
||||
return {
|
||||
outputs: [
|
||||
{
|
||||
@ -980,4 +985,167 @@ describe("createFollowupRunner media understanding", () => {
|
||||
// The transcript should be present
|
||||
expect(agentCall?.prompt).toContain("voice transcript");
|
||||
});
|
||||
|
||||
  // Regression test: attachment paths/URLs that contain a literal "]" must not
  // defeat the stripping of leading "[media attached ...]" lines (or the reply
  // hint) when the queued prompt is rebuilt after audio transcription.
  it("strips queued media lines when attachment paths or URLs contain a literal closing bracket", async () => {
    const transcriptText = "Bracket-safe transcript";
    // The mock mutates ctx the way the real applyMediaUnderstanding does, so
    // the queued prompt rebuild sees the transcribed body.
    applyMediaUnderstandingMock.mockImplementationOnce(
      async (params: { ctx: Record<string, unknown> }) => {
        params.ctx.MediaUnderstanding = [
          {
            kind: "audio.transcription",
            text: transcriptText,
            attachmentIndex: 0,
            provider: "whisper",
          },
        ];
        params.ctx.Transcript = transcriptText;
        params.ctx.Body = `[Audio]\nUser text:\nsome text\nTranscript:\n${transcriptText}`;
        return {
          outputs: [
            {
              kind: "audio.transcription",
              text: transcriptText,
              attachmentIndex: 0,
              provider: "whisper",
            },
          ],
          decisions: [],
          appliedImage: false,
          appliedAudio: true,
          appliedVideo: false,
          appliedFile: false,
        };
      },
    );
    runEmbeddedPiAgentMock.mockResolvedValueOnce({
      payloads: [{ text: "ok" }],
      meta: {},
    });

    const runner = createFollowupRunner({
      opts: { onBlockReply: vi.fn(async () => {}) },
      typing: createMockTypingController(),
      typingMode: "instant",
      defaultModel: "anthropic/claude-opus-4-5",
    });

    // Both the path and the URL deliberately contain "[0]" / "]" characters.
    await runner(
      createQueuedRun({
        prompt:
          "[media attached: /tmp/voice[0].ogg (audio/ogg) | https://cdn.example.com/files[0].ogg?sig=abc]123]\n" +
          MEDIA_REPLY_HINT +
          "\n" +
          "some text",
        mediaContext: {
          Body: "some text",
          MediaPaths: ["/tmp/voice[0].ogg"],
          MediaUrls: ["https://cdn.example.com/files[0].ogg?sig=abc]123"],
          MediaTypes: ["audio/ogg"],
        },
      }),
    );

    // The rebuilt prompt must carry the transcript and drop the raw
    // attachment references plus the reply hint.
    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
      prompt?: string;
    };
    expect(agentCall?.prompt).toContain(transcriptText);
    expect(agentCall?.prompt).not.toContain("/tmp/voice[0].ogg");
    expect(agentCall?.prompt).not.toContain("https://cdn.example.com/files[0].ogg?sig=abc]123");
    expect(agentCall?.prompt).not.toContain(MEDIA_REPLY_HINT);
  });
|
||||
|
||||
  // File-only understanding produces no `outputs` but sets appliedFile; the
  // queued rebuild must still run: keep the media note and reply hint, and
  // swap the caption placeholder for the extracted <file> body.
  it("preserves file-only media understanding when outputs are empty", async () => {
    applyMediaUnderstandingMock.mockImplementationOnce(
      async (params: { ctx: Record<string, unknown> }) => {
        // Simulates file extraction writing the document body into ctx.Body.
        params.ctx.Body =
          '<file name="report.pdf" mime="application/pdf">\nQuarterly report body\n</file>';
        return {
          outputs: [],
          decisions: [],
          appliedImage: false,
          appliedAudio: false,
          appliedVideo: false,
          appliedFile: true,
        };
      },
    );
    runEmbeddedPiAgentMock.mockResolvedValueOnce({
      payloads: [{ text: "processed" }],
      meta: {},
    });

    const runner = createFollowupRunner({
      opts: { onBlockReply: vi.fn(async () => {}) },
      typing: createMockTypingController(),
      typingMode: "instant",
      defaultModel: "anthropic/claude-opus-4-5",
    });

    // Caption-less media: Body is empty and the prompt ends with the
    // "[User sent media without caption]" placeholder.
    await runner(
      createQueuedRun({
        prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\n[User sent media without caption]`,
        mediaContext: {
          Body: "",
          MediaPaths: ["/tmp/report.pdf"],
          MediaTypes: ["application/pdf"],
        },
      }),
    );

    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
      prompt?: string;
    };
    // Note and hint stay; the placeholder is replaced by the file body.
    expect(agentCall?.prompt).toContain("[media attached: /tmp/report.pdf (application/pdf)]");
    expect(agentCall?.prompt).toContain(MEDIA_REPLY_HINT);
    expect(agentCall?.prompt).toContain('<file name="report.pdf" mime="application/pdf">');
    expect(agentCall?.prompt).toContain("Quarterly report body");
    expect(agentCall?.prompt).not.toContain("[User sent media without caption]");
  });
|
||||
|
||||
  // The queued prompt stores the caption with inline directives already
  // stripped ("summarize this"), while the media context Body still carries
  // "/think high ...". The rebuild must match on the directive-stripped form
  // and must not leak the directive back into the prompt.
  it("replaces the queued body when inline directives were already stripped from the prompt", async () => {
    applyMediaUnderstandingMock.mockImplementationOnce(
      async (params: { ctx: Record<string, unknown> }) => {
        // Updated body keeps the raw directive plus the extracted file body.
        params.ctx.Body =
          '/think high summarize this\n\n<file name="report.pdf" mime="application/pdf">\nreport\n</file>';
        return {
          outputs: [],
          decisions: [],
          appliedImage: false,
          appliedAudio: false,
          appliedVideo: false,
          appliedFile: true,
        };
      },
    );
    runEmbeddedPiAgentMock.mockResolvedValueOnce({
      payloads: [{ text: "processed" }],
      meta: {},
    });

    const runner = createFollowupRunner({
      opts: { onBlockReply: vi.fn(async () => {}) },
      typing: createMockTypingController(),
      typingMode: "instant",
      defaultModel: "anthropic/claude-opus-4-5",
    });

    await runner(
      createQueuedRun({
        prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nsummarize this`,
        mediaContext: {
          Body: "/think high summarize this",
          MediaPaths: ["/tmp/report.pdf"],
          MediaTypes: ["application/pdf"],
        },
      }),
    );

    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
      prompt?: string;
    };
    // Caption and file body are present; no duplicated caption and no
    // resurfaced "/think high" directive.
    expect(agentCall?.prompt).toContain("summarize this");
    expect(agentCall?.prompt).toContain('<file name="report.pdf" mime="application/pdf">');
    expect(agentCall?.prompt).not.toContain("summarize this\n\n/think high summarize this");
    expect(agentCall?.prompt).not.toContain("/think high summarize this");
  });
|
||||
});
|
||||
|
||||
@ -14,7 +14,6 @@ import type { TypingMode } from "../../config/types.js";
|
||||
import { logVerbose } from "../../globals.js";
|
||||
import { registerAgentRunContext } from "../../infra/agent-events.js";
|
||||
import { applyMediaUnderstanding } from "../../media-understanding/apply.js";
|
||||
import { formatMediaUnderstandingBody } from "../../media-understanding/format.js";
|
||||
import { defaultRuntime } from "../../runtime.js";
|
||||
import { isInternalMessageChannel } from "../../utils/message-channel.js";
|
||||
import { stripHeartbeatToken } from "../heartbeat.js";
|
||||
@ -23,6 +22,7 @@ import type { MsgContext, OriginatingChannelType } from "../templating.js";
|
||||
import { isSilentReplyText, SILENT_REPLY_TOKEN } from "../tokens.js";
|
||||
import type { GetReplyOptions, ReplyPayload } from "../types.js";
|
||||
import { resolveRunAuthProfile } from "./agent-runner-utils.js";
|
||||
import { parseInlineDirectives } from "./directive-handling.js";
|
||||
import {
|
||||
resolveOriginAccountId,
|
||||
resolveOriginMessageProvider,
|
||||
@ -41,6 +41,86 @@ import { incrementRunCompactionCount, persistRunSessionUsage } from "./session-r
|
||||
import { createTypingSignaler } from "./typing-mode.js";
|
||||
import type { TypingController } from "./typing.js";
|
||||
|
||||
// Placeholder body used when an inbound message carried media but no caption;
// the queued prompt rebuild treats it as a replacement target.
const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]";
// First-line prefix used to recognize (and strip) the media reply hint when a
// rebuilt prompt no longer carries a media note.
const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool";
|
||||
|
||||
function stripLeadingMediaAttachedLines(prompt: string): string {
|
||||
const lines = prompt.split("\n");
|
||||
let index = 0;
|
||||
while (index < lines.length) {
|
||||
const trimmed = lines[index]?.trim() ?? "";
|
||||
if (!trimmed.startsWith("[media attached") || !trimmed.endsWith("]")) {
|
||||
break;
|
||||
}
|
||||
index += 1;
|
||||
}
|
||||
return lines.slice(index).join("\n").trim();
|
||||
}
|
||||
|
||||
function stripLeadingMediaReplyHint(prompt: string): string {
|
||||
const lines = prompt.split("\n");
|
||||
if ((lines[0] ?? "").startsWith(MEDIA_REPLY_HINT_PREFIX)) {
|
||||
return lines.slice(1).join("\n").trim();
|
||||
}
|
||||
return prompt.trim();
|
||||
}
|
||||
|
||||
function replaceLastOccurrence(
|
||||
value: string,
|
||||
search: string,
|
||||
replacement: string,
|
||||
): string | undefined {
|
||||
if (!search) {
|
||||
return undefined;
|
||||
}
|
||||
const index = value.lastIndexOf(search);
|
||||
if (index < 0) {
|
||||
return undefined;
|
||||
}
|
||||
return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`;
|
||||
}
|
||||
|
||||
function stripInlineDirectives(text: string | undefined): string {
|
||||
return parseInlineDirectives(text ?? "").cleaned.trim();
|
||||
}
|
||||
|
||||
function rebuildQueuedPromptWithMediaUnderstanding(params: {
|
||||
prompt: string;
|
||||
originalBody?: string;
|
||||
updatedBody?: string;
|
||||
mediaNote?: string;
|
||||
}): string {
|
||||
let stripped = stripLeadingMediaAttachedLines(params.prompt);
|
||||
if (!params.mediaNote) {
|
||||
stripped = stripLeadingMediaReplyHint(stripped);
|
||||
}
|
||||
|
||||
const updatedBody = stripInlineDirectives(params.updatedBody);
|
||||
if (!updatedBody) {
|
||||
return [params.mediaNote?.trim(), stripped].filter(Boolean).join("\n").trim();
|
||||
}
|
||||
|
||||
const replacementTargets = [
|
||||
params.originalBody?.trim(),
|
||||
stripInlineDirectives(params.originalBody),
|
||||
MEDIA_ONLY_PLACEHOLDER,
|
||||
].filter(
|
||||
(value, index, list): value is string => Boolean(value) && list.indexOf(value) === index,
|
||||
);
|
||||
|
||||
let rebuilt = stripped;
|
||||
for (const target of replacementTargets) {
|
||||
const replaced = replaceLastOccurrence(rebuilt, target, updatedBody);
|
||||
if (replaced !== undefined) {
|
||||
rebuilt = replaced;
|
||||
return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim();
|
||||
}
|
||||
}
|
||||
|
||||
rebuilt = [rebuilt, updatedBody].filter(Boolean).join("\n\n");
|
||||
return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim();
|
||||
}
|
||||
|
||||
export function createFollowupRunner(params: {
|
||||
opts?: GetReplyOptions;
|
||||
typing: TypingController;
|
||||
@ -173,6 +253,7 @@ export function createFollowupRunner(params: {
|
||||
if (hasMedia) {
|
||||
try {
|
||||
const mediaCtx = { ...queued.mediaContext } as MsgContext;
|
||||
const originalBody = mediaCtx.Body;
|
||||
const muResult = await applyMediaUnderstanding({
|
||||
ctx: mediaCtx,
|
||||
cfg: queued.run.config,
|
||||
@ -182,34 +263,19 @@ export function createFollowupRunner(params: {
|
||||
model: queued.run.model,
|
||||
},
|
||||
});
|
||||
if (muResult.outputs.length > 0) {
|
||||
// Rebuild the prompt with media understanding results baked in,
|
||||
// matching the primary path's formatting.
|
||||
if (muResult.outputs.length > 0 || muResult.appliedFile) {
|
||||
// Rebuild the queued prompt from the mutated media context so the
|
||||
// deferred path matches the primary path's prompt shape.
|
||||
const newMediaNote = buildInboundMediaNote(mediaCtx);
|
||||
const transcriptBody = formatMediaUnderstandingBody({
|
||||
body: undefined,
|
||||
outputs: muResult.outputs,
|
||||
queued.prompt = rebuildQueuedPromptWithMediaUnderstanding({
|
||||
prompt: queued.prompt,
|
||||
originalBody,
|
||||
updatedBody: mediaCtx.Body,
|
||||
mediaNote: newMediaNote,
|
||||
});
|
||||
|
||||
// Strip existing [media attached ...] lines from the prompt so
|
||||
// they can be replaced by the updated media note (which excludes
|
||||
// successfully-understood attachments like transcribed audio).
|
||||
const stripped = queued.prompt
|
||||
.replace(/\[media attached: \d+ files\]\n?/g, "")
|
||||
.replace(/\[media attached[^\]]*\]\n?/g, "");
|
||||
|
||||
const parts: string[] = [];
|
||||
if (newMediaNote) {
|
||||
parts.push(newMediaNote);
|
||||
}
|
||||
if (transcriptBody) {
|
||||
parts.push(transcriptBody);
|
||||
}
|
||||
parts.push(stripped.trim());
|
||||
queued.prompt = parts.filter(Boolean).join("\n\n");
|
||||
|
||||
logVerbose(
|
||||
`followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage})`,
|
||||
`followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage}, video=${muResult.appliedVideo}, file=${muResult.appliedFile})`,
|
||||
);
|
||||
}
|
||||
} catch (err) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user