Compare commits

...

2 Commits

Author SHA1 Message Date
Tyler Yust
b36f8d23e9 Agents: harden CLI prompt image hydration (#51373) 2026-03-20 19:29:35 -07:00
Tyler Yust
315713fc40 fix(cli): hydrate prompt image refs for inbound media 2026-03-20 18:58:15 -07:00
5 changed files with 297 additions and 2 deletions

View File

@ -205,6 +205,7 @@ Docs: https://docs.openclaw.ai
- Plugins/Matrix: add a new Matrix plugin backed by the official `matrix-js-sdk`. If you are upgrading from the previous public Matrix plugin, follow the migration guide: https://docs.openclaw.ai/install/migrating-matrix Thanks @gumadeiras. - Plugins/Matrix: add a new Matrix plugin backed by the official `matrix-js-sdk`. If you are upgrading from the previous public Matrix plugin, follow the migration guide: https://docs.openclaw.ai/install/migrating-matrix Thanks @gumadeiras.
- Discord/commands: switch native command deployment to Carbon reconcile by default so Discord restarts stop churning slash commands through OpenClaw's local deploy path. (#46597) Thanks @huntharo and @thewilloftheshadow. - Discord/commands: switch native command deployment to Carbon reconcile by default so Discord restarts stop churning slash commands through OpenClaw's local deploy path. (#46597) Thanks @huntharo and @thewilloftheshadow.
- Plugins/Matrix: durably dedupe inbound room events across gateway restarts so previously handled Matrix messages are not replayed as new, while preserving clean-restart backlog delivery for unseen events. (#50922) Thanks @gumadeiras. - Plugins/Matrix: durably dedupe inbound room events across gateway restarts so previously handled Matrix messages are not replayed as new, while preserving clean-restart backlog delivery for unseen events. (#50922) Thanks @gumadeiras.
- BlueBubbles/CLI agents: restore inbound prompt image refs for CLI routed turns, reapply embedded runner image size guardrails, and cover both CLI image transport paths with regression tests. (#51373)
## 2026.3.13 ## 2026.3.13

View File

@ -0,0 +1,116 @@
import type { ImageContent } from "@mariozechner/pi-ai";
import { beforeEach, describe, expect, it, vi } from "vitest";
import { MAX_IMAGE_BYTES } from "../media/constants.js";
import type { SandboxFsBridge } from "./sandbox/fs-bridge.js";
// Spies are created through vi.hoisted so the vi.mock factories below can
// reference them.
const mocks = vi.hoisted(() => ({
loadImageFromRef: vi.fn(),
sanitizeImageBlocks: vi.fn(),
}));
// Keep every real export of the images module and replace only
// loadImageFromRef with a thin wrapper that forwards to the spy.
vi.mock("./pi-embedded-runner/run/images.js", async (importOriginal) => {
const actual = await importOriginal<typeof import("./pi-embedded-runner/run/images.js")>();
return {
...actual,
loadImageFromRef: (...args: unknown[]) => mocks.loadImageFromRef(...args),
};
});
// Same partial-mock approach for tool-images: only sanitizeImageBlocks is
// redirected to the spy; the remaining exports stay real.
vi.mock("./tool-images.js", async (importOriginal) => {
const actual = await importOriginal<typeof import("./tool-images.js")>();
return {
...actual,
sanitizeImageBlocks: (...args: unknown[]) => mocks.sanitizeImageBlocks(...args),
};
});
import { loadPromptRefImages } from "./cli-runner/helpers.js";
describe("loadPromptRefImages", () => {
  beforeEach(() => {
    mocks.loadImageFromRef.mockReset();
    mocks.sanitizeImageBlocks.mockReset();
    // Default sanitizer is a pass-through; individual cases override it when
    // they need a distinct sanitized result.
    mocks.sanitizeImageBlocks.mockImplementation(async (images: ImageContent[]) => {
      return { images, dropped: 0 };
    });
  });

  it("returns empty results when the prompt has no image refs", async () => {
    const pending = loadPromptRefImages({
      prompt: "just text",
      workspaceDir: "/workspace",
    });

    await expect(pending).resolves.toEqual([]);
    // Neither the loader nor the sanitizer should run without refs.
    expect(mocks.loadImageFromRef).not.toHaveBeenCalled();
    expect(mocks.sanitizeImageBlocks).not.toHaveBeenCalled();
  });

  it("passes the max-byte guardrail through load and sanitize", async () => {
    const rawImage: ImageContent = {
      type: "image",
      data: "c29tZS1pbWFnZQ==",
      mimeType: "image/png",
    };
    const cleanedImage: ImageContent = {
      type: "image",
      data: "c2FuaXRpemVkLWltYWdl",
      mimeType: "image/jpeg",
    };
    const sandbox = {
      root: "/sandbox",
      bridge: {} as SandboxFsBridge,
    };
    mocks.loadImageFromRef.mockResolvedValueOnce(rawImage);
    mocks.sanitizeImageBlocks.mockResolvedValueOnce({ images: [cleanedImage], dropped: 0 });

    const hydrated = await loadPromptRefImages({
      prompt: "Look at /tmp/photo.png",
      workspaceDir: "/workspace",
      workspaceOnly: true,
      sandbox,
    });

    // Inspect the single loader call argument by argument.
    const firstLoadCall = mocks.loadImageFromRef.mock.calls[0] ?? [];
    expect(firstLoadCall[0]).toMatchObject({ resolved: "/tmp/photo.png", type: "path" });
    expect(firstLoadCall[1]).toBe("/workspace");
    expect(firstLoadCall[2]).toEqual({
      maxBytes: MAX_IMAGE_BYTES,
      workspaceOnly: true,
      sandbox,
    });

    const expectedSanitizeOptions = { maxBytes: MAX_IMAGE_BYTES };
    expect(mocks.sanitizeImageBlocks).toHaveBeenCalledWith(
      [rawImage],
      "prompt:images",
      expectedSanitizeOptions,
    );
    expect(hydrated).toEqual([cleanedImage]);
  });

  it("dedupes repeated refs and skips failed loads before sanitizing", async () => {
    const onlyImage: ImageContent = {
      type: "image",
      data: "b25lLWltYWdl",
      mimeType: "image/png",
    };
    // First unique ref loads; the second resolves to null (a failed load).
    mocks.loadImageFromRef.mockResolvedValueOnce(onlyImage);
    mocks.loadImageFromRef.mockResolvedValueOnce(null);

    const hydrated = await loadPromptRefImages({
      prompt: "Compare /tmp/a.png with /tmp/a.png and /tmp/b.png",
      workspaceDir: "/workspace",
    });

    expect(mocks.loadImageFromRef).toHaveBeenCalledTimes(2);
    const resolvedTargets = mocks.loadImageFromRef.mock.calls.map((call: unknown[]) => {
      const refArg = call[0] as { resolved?: string } | undefined;
      return refArg?.resolved;
    });
    expect(resolvedTargets).toEqual(["/tmp/a.png", "/tmp/b.png"]);
    expect(mocks.sanitizeImageBlocks).toHaveBeenCalledWith([onlyImage], "prompt:images", {
      maxBytes: MAX_IMAGE_BYTES,
    });
    expect(hydrated).toEqual([onlyImage]);
  });
});

View File

@ -3,6 +3,7 @@ import os from "node:os";
import path from "node:path"; import path from "node:path";
import { beforeEach, describe, expect, it, vi } from "vitest"; import { beforeEach, describe, expect, it, vi } from "vitest";
import type { OpenClawConfig } from "../config/config.js"; import type { OpenClawConfig } from "../config/config.js";
import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
import { runCliAgent } from "./cli-runner.js"; import { runCliAgent } from "./cli-runner.js";
import { resolveCliNoOutputTimeoutMs } from "./cli-runner/helpers.js"; import { resolveCliNoOutputTimeoutMs } from "./cli-runner/helpers.js";
import type { EmbeddedContextFile } from "./pi-embedded-helpers.js"; import type { EmbeddedContextFile } from "./pi-embedded-helpers.js";
@ -11,6 +12,8 @@ import type { WorkspaceBootstrapFile } from "./workspace.js";
const supervisorSpawnMock = vi.fn(); const supervisorSpawnMock = vi.fn();
const enqueueSystemEventMock = vi.fn(); const enqueueSystemEventMock = vi.fn();
const requestHeartbeatNowMock = vi.fn(); const requestHeartbeatNowMock = vi.fn();
// Base64-encoded PNG fixture. The tests below either decode it to a file on
// disk or pass it directly as inline image data.
// NOTE(review): looks like a minimal 1x1 PNG — confirm if the size matters.
const SMALL_PNG_BASE64 =
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/woAAn8B9FD5fHAAAAAASUVORK5CYII=";
const hoisted = vi.hoisted(() => { const hoisted = vi.hoisted(() => {
type BootstrapContext = { type BootstrapContext = {
bootstrapFiles: WorkspaceBootstrapFile[]; bootstrapFiles: WorkspaceBootstrapFile[];
@ -187,6 +190,135 @@ describe("runCliAgent with process supervisor", () => {
expect(promptCarrier).toContain("hi"); expect(promptCarrier).toContain("hi");
}); });
it("hydrates prompt media refs into CLI image args", async () => {
  // The supervisor reports a clean exit; the assertions below only look at
  // the argv handed to the spawn call.
  const cleanExit = createManagedRun({
    reason: "exit",
    exitCode: 0,
    exitSignal: null,
    durationMs: 50,
    stdout: "ok",
    stderr: "",
    timedOut: false,
    noOutputTimedOut: false,
  });
  supervisorSpawnMock.mockResolvedValueOnce(cleanExit);

  const tmpRoot = resolvePreferredOpenClawTmpDir();
  const workspace = await fs.mkdtemp(path.join(tmpRoot, "openclaw-cli-prompt-image-"));
  const pngPath = path.join(workspace, "bb-image.png");
  await fs.writeFile(pngPath, Buffer.from(SMALL_PNG_BASE64, "base64"));

  try {
    await runCliAgent({
      sessionId: "s1",
      sessionFile: "/tmp/session.jsonl",
      workspaceDir: workspace,
      prompt: `[media attached: ${pngPath} (image/png)]\n\n<media:image>`,
      provider: "codex-cli",
      model: "gpt-5.2-codex",
      timeoutMs: 1_000,
      runId: "run-prompt-image",
    });
  } finally {
    // Always remove the temp workspace, even when the run throws.
    await fs.rm(workspace, { recursive: true, force: true });
  }

  const spawnInput = supervisorSpawnMock.mock.calls[0]?.[0] as { argv?: string[] };
  const spawnArgv = spawnInput.argv ?? [];
  const flagAt = spawnArgv.indexOf("--image");
  expect(flagAt).toBeGreaterThanOrEqual(0);
  // The value after the flag must be the staged copy, not the source file.
  const stagedPath = spawnArgv[flagAt + 1];
  expect(stagedPath).toContain("openclaw-cli-images-");
  expect(stagedPath).not.toBe(pngPath);
});
it("appends hydrated prompt media refs to generic backend prompts", async () => {
  supervisorSpawnMock.mockResolvedValueOnce(
    createManagedRun({
      reason: "exit",
      exitCode: 0,
      exitSignal: null,
      durationMs: 50,
      stdout: "ok",
      stderr: "",
      timedOut: false,
      noOutputTimedOut: false,
    }),
  );
  const tempDir = await fs.mkdtemp(
    path.join(resolvePreferredOpenClawTmpDir(), "openclaw-cli-prompt-image-generic-"),
  );
  const sourceImage = path.join(tempDir, "claude-image.png");
  await fs.writeFile(sourceImage, Buffer.from(SMALL_PNG_BASE64, "base64"));
  try {
    await runCliAgent({
      sessionId: "s1",
      sessionFile: "/tmp/session.jsonl",
      workspaceDir: tempDir,
      prompt: `[media attached: ${sourceImage} (image/png)]\n\n<media:image>`,
      provider: "claude-cli",
      model: "claude-opus-4-1",
      timeoutMs: 1_000,
      runId: "run-prompt-image-generic",
    });
  } finally {
    // Always remove the temp workspace, even when the run throws.
    await fs.rm(tempDir, { recursive: true, force: true });
  }
  const input = supervisorSpawnMock.mock.calls[0]?.[0] as { argv?: string[]; input?: string };
  const argv = input.argv ?? [];
  // This test expects no --image flag: the staged path travels in the prompt.
  expect(argv).not.toContain("--image");
  // The prompt may reach the CLI through argv or through stdin, so search the
  // combined carrier line by line. Searching argv alone would miss a
  // stdin-carried prompt and would match a whole argv entry instead of the
  // path, which made the follow-up containment assertion tautological.
  const promptCarrier = [input.input ?? "", ...argv].join("\n");
  const appendedPath = promptCarrier
    .split("\n")
    .find((line) => line.includes("openclaw-cli-images-"));
  expect(appendedPath).toBeDefined();
  expect(appendedPath).not.toBe(sourceImage);
});
it("prefers explicit images over prompt refs", async () => {
  const cleanExit = createManagedRun({
    reason: "exit",
    exitCode: 0,
    exitSignal: null,
    durationMs: 50,
    stdout: "ok",
    stderr: "",
    timedOut: false,
    noOutputTimedOut: false,
  });
  supervisorSpawnMock.mockResolvedValueOnce(cleanExit);

  const tmpRoot = resolvePreferredOpenClawTmpDir();
  const workspace = await fs.mkdtemp(path.join(tmpRoot, "openclaw-cli-explicit-images-"));
  // The prompt references this file, but an explicit image is also supplied.
  const promptOnlyImage = path.join(workspace, "ignored-prompt-image.png");
  await fs.writeFile(promptOnlyImage, Buffer.from(SMALL_PNG_BASE64, "base64"));

  try {
    await runCliAgent({
      sessionId: "s1",
      sessionFile: "/tmp/session.jsonl",
      workspaceDir: workspace,
      prompt: `[media attached: ${promptOnlyImage} (image/png)]\n\n<media:image>`,
      images: [{ type: "image", data: SMALL_PNG_BASE64, mimeType: "image/png" }],
      provider: "codex-cli",
      model: "gpt-5.2-codex",
      timeoutMs: 1_000,
      runId: "run-explicit-image-precedence",
    });
  } finally {
    // Always remove the temp workspace, even when the run throws.
    await fs.rm(workspace, { recursive: true, force: true });
  }

  const spawnInput = supervisorSpawnMock.mock.calls[0]?.[0] as { argv?: string[] };
  const spawnArgv = spawnInput.argv ?? [];
  // Exactly one --image flag: the explicit image, with no extra flag for the
  // prompt ref.
  const imageFlags = spawnArgv.filter((arg) => arg === "--image");
  expect(imageFlags).toHaveLength(1);
});
it("fails with timeout when no-output watchdog trips", async () => { it("fails with timeout when no-output watchdog trips", async () => {
supervisorSpawnMock.mockResolvedValueOnce( supervisorSpawnMock.mockResolvedValueOnce(
createManagedRun({ createManagedRun({

View File

@ -26,6 +26,7 @@ import {
buildCliArgs, buildCliArgs,
buildSystemPrompt, buildSystemPrompt,
enqueueCliRun, enqueueCliRun,
loadPromptRefImages,
normalizeCliModel, normalizeCliModel,
parseCliJson, parseCliJson,
parseCliJsonl, parseCliJsonl,
@ -221,8 +222,12 @@ export async function runCliAgent(params: {
let prompt = prependBootstrapPromptWarning(params.prompt, bootstrapPromptWarning.lines, { let prompt = prependBootstrapPromptWarning(params.prompt, bootstrapPromptWarning.lines, {
preserveExactPrompt: heartbeatPrompt, preserveExactPrompt: heartbeatPrompt,
}); });
if (params.images && params.images.length > 0) { const resolvedImages =
const imagePayload = await writeCliImages(params.images); params.images && params.images.length > 0
? params.images
: await loadPromptRefImages({ prompt, workspaceDir });
if (resolvedImages.length > 0) {
const imagePayload = await writeCliImages(resolvedImages);
imagePaths = imagePayload.paths; imagePaths = imagePayload.paths;
cleanupImages = imagePayload.cleanup; cleanupImages = imagePayload.cleanup;
if (!backend.imageArg) { if (!backend.imageArg) {

View File

@ -8,15 +8,19 @@ import { KeyedAsyncQueue } from "openclaw/plugin-sdk/keyed-async-queue";
import type { ThinkLevel } from "../../auto-reply/thinking.js"; import type { ThinkLevel } from "../../auto-reply/thinking.js";
import type { OpenClawConfig } from "../../config/config.js"; import type { OpenClawConfig } from "../../config/config.js";
import type { CliBackendConfig } from "../../config/types.js"; import type { CliBackendConfig } from "../../config/types.js";
import { MAX_IMAGE_BYTES } from "../../media/constants.js";
import { buildTtsSystemPromptHint } from "../../tts/tts.js"; import { buildTtsSystemPromptHint } from "../../tts/tts.js";
import { isRecord } from "../../utils.js"; import { isRecord } from "../../utils.js";
import { buildModelAliasLines } from "../model-alias-lines.js"; import { buildModelAliasLines } from "../model-alias-lines.js";
import { resolveDefaultModelForAgent } from "../model-selection.js"; import { resolveDefaultModelForAgent } from "../model-selection.js";
import { resolveOwnerDisplaySetting } from "../owner-display.js"; import { resolveOwnerDisplaySetting } from "../owner-display.js";
import type { EmbeddedContextFile } from "../pi-embedded-helpers.js"; import type { EmbeddedContextFile } from "../pi-embedded-helpers.js";
import { detectImageReferences, loadImageFromRef } from "../pi-embedded-runner/run/images.js";
import type { SandboxFsBridge } from "../sandbox/fs-bridge.js";
import { detectRuntimeShell } from "../shell-utils.js"; import { detectRuntimeShell } from "../shell-utils.js";
import { buildSystemPromptParams } from "../system-prompt-params.js"; import { buildSystemPromptParams } from "../system-prompt-params.js";
import { buildAgentSystemPrompt } from "../system-prompt.js"; import { buildAgentSystemPrompt } from "../system-prompt.js";
import { sanitizeImageBlocks } from "../tool-images.js";
export { buildCliSupervisorScopeKey, resolveCliNoOutputTimeoutMs } from "./reliability.js"; export { buildCliSupervisorScopeKey, resolveCliNoOutputTimeoutMs } from "./reliability.js";
const CLI_RUN_QUEUE = new KeyedAsyncQueue(); const CLI_RUN_QUEUE = new KeyedAsyncQueue();
@ -324,6 +328,43 @@ export function appendImagePathsToPrompt(prompt: string, paths: string[]): strin
return `${trimmed}${separator}${paths.join("\n")}`; return `${trimmed}${separator}${paths.join("\n")}`;
} }
/**
 * Loads the images referenced inside a prompt and returns them as sanitized
 * image content blocks.
 *
 * References are detected with `detectImageReferences`, de-duplicated by
 * `type` + `resolved`, and loaded one at a time in first-seen order. Refs for
 * which the loader returns a falsy value are skipped. Loaded images are then
 * passed through `sanitizeImageBlocks` under the `"prompt:images"` label.
 *
 * @param params.prompt - Prompt text to scan for image references.
 * @param params.workspaceDir - Workspace directory forwarded to the loader.
 * @param params.maxBytes - Byte limit forwarded to the loader and sanitizer;
 *   defaults to `MAX_IMAGE_BYTES`.
 * @param params.workspaceOnly - Forwarded unchanged to the loader.
 * @param params.sandbox - Forwarded unchanged to the loader.
 * @returns The sanitized images, or an empty array when the prompt contains
 *   no image references.
 */
export async function loadPromptRefImages(params: {
  prompt: string;
  workspaceDir: string;
  maxBytes?: number;
  workspaceOnly?: boolean;
  sandbox?: { root: string; bridge: SandboxFsBridge };
}): Promise<ImageContent[]> {
  const detected = detectImageReferences(params.prompt);
  if (detected.length === 0) {
    return [];
  }

  const byteLimit = params.maxBytes ?? MAX_IMAGE_BYTES;

  // A Map keeps first-seen order while collapsing repeated references.
  const uniqueRefs = new Map<string, (typeof detected)[number]>();
  for (const candidate of detected) {
    const dedupeKey = `${candidate.type}:${candidate.resolved}`;
    if (!uniqueRefs.has(dedupeKey)) {
      uniqueRefs.set(dedupeKey, candidate);
    }
  }

  const loaded: ImageContent[] = [];
  for (const uniqueRef of uniqueRefs.values()) {
    const loadedImage = await loadImageFromRef(uniqueRef, params.workspaceDir, {
      maxBytes: byteLimit,
      workspaceOnly: params.workspaceOnly,
      sandbox: params.sandbox,
    });
    if (loadedImage) {
      loaded.push(loadedImage);
    }
  }

  const sanitized = await sanitizeImageBlocks(loaded, "prompt:images", {
    maxBytes: byteLimit,
  });
  return sanitized.images;
}
export async function writeCliImages( export async function writeCliImages(
images: ImageContent[], images: ImageContent[],
): Promise<{ paths: string[]; cleanup: () => Promise<void> }> { ): Promise<{ paths: string[]; cleanup: () => Promise<void> }> {