// openclaw/src/agents/tools/image-tool.ts

import fs from "node:fs/promises";
import path from "node:path";
import {
type Api,
type AssistantMessage,
type Context,
complete,
type Model,
} from "@mariozechner/pi-ai";
import { discoverAuthStorage, discoverModels } from "../pi-model-discovery.js";
import { Type } from "@sinclair/typebox";
import type { OpenClawConfig } from "../../config/config.js";
import { resolveUserPath } from "../../utils.js";
import { loadWebMedia } from "../../web/media.js";
import { ensureAuthProfileStore, listProfilesForProvider } from "../auth-profiles.js";
import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "../defaults.js";
import { minimaxUnderstandImage } from "../minimax-vlm.js";
import { getApiKeyForModel, requireApiKey, resolveEnvApiKey } from "../model-auth.js";
import { runWithImageModelFallback } from "../model-fallback.js";
import { resolveConfiguredModelRef } from "../model-selection.js";
import { ensureOpenClawModelsJson } from "../models-config.js";
import { assertSandboxPath } from "../sandbox-paths.js";
import type { AnyAgentTool } from "./common.js";
import {
coerceImageAssistantText,
coerceImageModelConfig,
decodeDataUrl,
type ImageModelConfig,
resolveProviderVisionModelFromConfig,
} from "./image-tool.helpers.js";
const DEFAULT_PROMPT = "Describe the image.";
2026-01-12 18:17:39 +00:00
export const __testing = {
decodeDataUrl,
2026-01-12 18:45:46 +00:00
coerceImageAssistantText,
2026-01-12 18:17:39 +00:00
} as const;
function resolveDefaultModelRef(cfg?: OpenClawConfig): {
2026-01-12 17:50:44 +00:00
provider: string;
model: string;
} {
if (cfg) {
const resolved = resolveConfiguredModelRef({
cfg,
defaultProvider: DEFAULT_PROVIDER,
defaultModel: DEFAULT_MODEL,
});
return { provider: resolved.provider, model: resolved.model };
}
return { provider: DEFAULT_PROVIDER, model: DEFAULT_MODEL };
2026-01-12 17:50:44 +00:00
}
function hasAuthForProvider(params: { provider: string; agentDir: string }): boolean {
2026-01-12 17:50:44 +00:00
if (resolveEnvApiKey(params.provider)?.apiKey) return true;
const store = ensureAuthProfileStore(params.agentDir, {
allowKeychainPrompt: false,
});
return listProfilesForProvider(store, params.provider).length > 0;
}
/**
* Resolve the effective image model config for the `image` tool.
*
* - Prefer explicit config (`agents.defaults.imageModel`).
* - Otherwise, try to "pair" the primary model with an image-capable model:
* - same provider (best effort)
* - fall back to OpenAI/Anthropic when available
*/
export function resolveImageModelConfigForTool(params: {
2026-01-30 03:15:10 +01:00
cfg?: OpenClawConfig;
2026-01-12 17:50:44 +00:00
agentDir: string;
}): ImageModelConfig | null {
// Note: We intentionally do NOT gate based on primarySupportsImages here.
// Even when the primary model supports images, we keep the tool available
// because images are auto-injected into prompts (see attempt.ts detectAndLoadPromptImages).
// The tool description is adjusted via modelHasVision to discourage redundant usage.
2026-01-12 17:50:44 +00:00
const explicit = coerceImageModelConfig(params.cfg);
if (explicit.primary?.trim() || (explicit.fallbacks?.length ?? 0) > 0) {
return explicit;
}
const primary = resolveDefaultModelRef(params.cfg);
const openaiOk = hasAuthForProvider({
provider: "openai",
agentDir: params.agentDir,
});
const anthropicOk = hasAuthForProvider({
provider: "anthropic",
agentDir: params.agentDir,
});
const fallbacks: string[] = [];
const addFallback = (modelRef: string | null) => {
const ref = (modelRef ?? "").trim();
if (!ref) return;
if (fallbacks.includes(ref)) return;
fallbacks.push(ref);
};
const providerVisionFromConfig = resolveProviderVisionModelFromConfig({
cfg: params.cfg,
provider: primary.provider,
});
const providerOk = hasAuthForProvider({
provider: primary.provider,
agentDir: params.agentDir,
});
let preferred: string | null = null;
// MiniMax users: always try the canonical vision model first when auth exists.
if (primary.provider === "minimax" && providerOk) {
preferred = "minimax/MiniMax-VL-01";
} else if (providerOk && providerVisionFromConfig) {
preferred = providerVisionFromConfig;
} else if (primary.provider === "openai" && openaiOk) {
preferred = "openai/gpt-5-mini";
} else if (primary.provider === "anthropic" && anthropicOk) {
preferred = "anthropic/claude-opus-4-5";
}
if (preferred?.trim()) {
if (openaiOk) addFallback("openai/gpt-5-mini");
if (anthropicOk) addFallback("anthropic/claude-opus-4-5");
// Don't duplicate primary in fallbacks.
const pruned = fallbacks.filter((ref) => ref !== preferred);
return {
primary: preferred,
...(pruned.length > 0 ? { fallbacks: pruned } : {}),
};
}
// Cross-provider fallback when we can't pair with the primary provider.
if (openaiOk) {
if (anthropicOk) addFallback("anthropic/claude-opus-4-5");
return {
primary: "openai/gpt-5-mini",
...(fallbacks.length ? { fallbacks } : {}),
};
}
if (anthropicOk) {
return { primary: "anthropic/claude-opus-4-5" };
}
return null;
2026-01-04 19:35:00 +01:00
}
function pickMaxBytes(cfg?: OpenClawConfig, maxBytesMb?: number): number | undefined {
if (typeof maxBytesMb === "number" && Number.isFinite(maxBytesMb) && maxBytesMb > 0) {
2026-01-04 19:35:00 +01:00
return Math.floor(maxBytesMb * 1024 * 1024);
}
const configured = cfg?.agents?.defaults?.mediaMaxMb;
if (typeof configured === "number" && Number.isFinite(configured) && configured > 0) {
2026-01-04 19:35:00 +01:00
return Math.floor(configured * 1024 * 1024);
}
return undefined;
}
function buildImageContext(prompt: string, base64: string, mimeType: string): Context {
2026-01-04 19:35:00 +01:00
return {
messages: [
{
role: "user",
content: [
{ type: "text", text: prompt },
{ type: "image", data: base64, mimeType },
],
timestamp: Date.now(),
},
],
};
}
/**
 * Resolve an image path against the sandbox root.
 *
 * If the path fails the sandbox check, retry against the conventional
 * inbound location (`media/inbound/<basename>`) under the sandbox root.
 * When that fallback file exists, return its resolved path and report the
 * original path via `rewrittenFrom`; otherwise rethrow the original
 * sandbox error.
 */
async function resolveSandboxedImagePath(params: {
  sandboxRoot: string;
  imagePath: string;
}): Promise<{ resolved: string; rewrittenFrom?: string }> {
  const stripFileScheme = (p: string) =>
    p.startsWith("file://") ? p.slice("file://".length) : p;
  const filePath = stripFileScheme(params.imagePath);
  try {
    const ok = await assertSandboxPath({
      filePath,
      cwd: params.sandboxRoot,
      root: params.sandboxRoot,
    });
    return { resolved: ok.resolved };
  } catch (err) {
    // Only swallow the original error when the fallback file actually exists.
    const fallbackRel = path.join("media", "inbound", path.basename(filePath));
    const fallbackAbs = path.join(params.sandboxRoot, fallbackRel);
    try {
      await fs.stat(fallbackAbs);
    } catch {
      throw err;
    }
    const ok = await assertSandboxPath({
      filePath: fallbackRel,
      cwd: params.sandboxRoot,
      root: params.sandboxRoot,
    });
    return { resolved: ok.resolved, rewrittenFrom: filePath };
  }
}
async function runImagePrompt(params: {
2026-01-30 03:15:10 +01:00
cfg?: OpenClawConfig;
agentDir: string;
2026-01-12 17:50:44 +00:00
imageModelConfig: ImageModelConfig;
2026-01-04 19:35:00 +01:00
modelOverride?: string;
prompt: string;
base64: string;
mimeType: string;
2026-01-12 18:45:46 +00:00
}): Promise<{
text: string;
provider: string;
model: string;
attempts: Array<{ provider: string; model: string; error: string }>;
}> {
2026-01-30 03:15:10 +01:00
const effectiveCfg: OpenClawConfig | undefined = params.cfg
2026-01-12 17:50:44 +00:00
? {
...params.cfg,
agents: {
...params.cfg.agents,
defaults: {
...params.cfg.agents?.defaults,
imageModel: params.imageModelConfig,
},
},
}
: undefined;
2026-01-30 03:15:10 +01:00
await ensureOpenClawModelsJson(effectiveCfg, params.agentDir);
const authStorage = discoverAuthStorage(params.agentDir);
const modelRegistry = discoverModels(authStorage, params.agentDir);
2026-01-04 19:35:00 +01:00
const result = await runWithImageModelFallback({
2026-01-12 17:50:44 +00:00
cfg: effectiveCfg,
2026-01-04 19:35:00 +01:00
modelOverride: params.modelOverride,
run: async (provider, modelId) => {
const model = modelRegistry.find(provider, modelId) as Model<Api> | null;
if (!model) {
throw new Error(`Unknown model: ${provider}/${modelId}`);
}
if (!model.input?.includes("image")) {
throw new Error(`Model does not support images: ${provider}/${modelId}`);
2026-01-04 19:35:00 +01:00
}
const apiKeyInfo = await getApiKeyForModel({
model,
2026-01-12 17:50:44 +00:00
cfg: effectiveCfg,
agentDir: params.agentDir,
});
const apiKey = requireApiKey(apiKeyInfo, model.provider);
authStorage.setRuntimeApiKey(model.provider, apiKey);
const imageDataUrl = `data:${params.mimeType};base64,${params.base64}`;
if (model.provider === "minimax") {
const text = await minimaxUnderstandImage({
apiKey,
prompt: params.prompt,
imageDataUrl,
modelBaseUrl: model.baseUrl,
});
return { text, provider: model.provider, model: model.id };
}
const context = buildImageContext(params.prompt, params.base64, params.mimeType);
2026-01-04 19:35:00 +01:00
const message = (await complete(model, context, {
apiKey,
2026-01-04 19:35:00 +01:00
maxTokens: 512,
})) as AssistantMessage;
const text = coerceImageAssistantText({
2026-01-12 18:45:46 +00:00
message,
provider: model.provider,
model: model.id,
});
return { text, provider: model.provider, model: model.id };
2026-01-04 19:35:00 +01:00
},
});
return {
text: result.result.text,
2026-01-12 18:45:46 +00:00
provider: result.result.provider,
model: result.result.model,
attempts: result.attempts.map((attempt) => ({
provider: attempt.provider,
model: attempt.model,
error: attempt.error,
})),
2026-01-04 19:35:00 +01:00
};
}
export function createImageTool(options?: {
2026-01-30 03:15:10 +01:00
config?: OpenClawConfig;
agentDir?: string;
2026-01-12 17:56:48 +00:00
sandboxRoot?: string;
/** If true, the model has native vision capability and images in the prompt are auto-injected */
modelHasVision?: boolean;
2026-01-04 19:35:00 +01:00
}): AnyAgentTool | null {
2026-01-12 18:12:51 +00:00
const agentDir = options?.agentDir?.trim();
if (!agentDir) {
const explicit = coerceImageModelConfig(options?.config);
if (explicit.primary?.trim() || (explicit.fallbacks?.length ?? 0) > 0) {
throw new Error("createImageTool requires agentDir when enabled");
}
return null;
}
2026-01-12 17:50:44 +00:00
const imageModelConfig = resolveImageModelConfigForTool({
cfg: options?.config,
agentDir,
});
if (!imageModelConfig) return null;
// If model has native vision, images in the prompt are auto-injected
// so this tool is only needed when image wasn't provided in the prompt
const description = options?.modelHasVision
? "Analyze an image with a vision model. Only use this tool when the image was NOT already provided in the user's message. Images mentioned in the prompt are automatically visible to you."
: "Analyze an image with the configured image model (agents.defaults.imageModel). Provide a prompt and image path or URL.";
2026-01-04 19:35:00 +01:00
return {
label: "Image",
name: "image",
description,
2026-01-04 19:35:00 +01:00
parameters: Type.Object({
prompt: Type.Optional(Type.String()),
image: Type.String(),
model: Type.Optional(Type.String()),
maxBytesMb: Type.Optional(Type.Number()),
}),
execute: async (_toolCallId, args) => {
const record = args && typeof args === "object" ? (args as Record<string, unknown>) : {};
const imageRawInput = typeof record.image === "string" ? record.image.trim() : "";
const imageRaw = imageRawInput.startsWith("@")
? imageRawInput.slice(1).trim()
: imageRawInput;
2026-01-04 19:35:00 +01:00
if (!imageRaw) throw new Error("image required");
2026-01-14 01:08:15 +00:00
// The tool accepts file paths, file/data URLs, or http(s) URLs. In some
// agent/model contexts, images can be referenced as pseudo-URIs like
// `image:0` (e.g. "first image in the prompt"). We don't have access to a
// shared image registry here, so fail gracefully instead of attempting to
// `fs.readFile("image:0")` and producing a noisy ENOENT.
const looksLikeWindowsDrivePath = /^[a-zA-Z]:[\\/]/.test(imageRaw);
const hasScheme = /^[a-z][a-z0-9+.-]*:/i.test(imageRaw);
const isFileUrl = /^file:/i.test(imageRaw);
const isHttpUrl = /^https?:\/\//i.test(imageRaw);
const isDataUrl = /^data:/i.test(imageRaw);
if (hasScheme && !looksLikeWindowsDrivePath && !isFileUrl && !isHttpUrl && !isDataUrl) {
2026-01-14 01:08:15 +00:00
return {
content: [
{
type: "text",
text: `Unsupported image reference: ${imageRawInput}. Use a file path, a file:// URL, a data: URL, or an http(s) URL.`,
},
],
details: {
error: "unsupported_image_reference",
image: imageRawInput,
},
};
}
2026-01-04 19:35:00 +01:00
const promptRaw =
typeof record.prompt === "string" && record.prompt.trim()
? record.prompt.trim()
: DEFAULT_PROMPT;
const modelOverride =
typeof record.model === "string" && record.model.trim() ? record.model.trim() : undefined;
const maxBytesMb = typeof record.maxBytesMb === "number" ? record.maxBytesMb : undefined;
2026-01-04 19:35:00 +01:00
const maxBytes = pickMaxBytes(options?.config, maxBytesMb);
2026-01-12 17:56:48 +00:00
const sandboxRoot = options?.sandboxRoot?.trim();
2026-01-14 01:08:15 +00:00
const isUrl = isHttpUrl;
2026-01-12 17:56:48 +00:00
if (sandboxRoot && isUrl) {
throw new Error("Sandboxed image tool does not allow remote URLs.");
}
2026-01-12 18:17:39 +00:00
const resolvedImage = (() => {
if (sandboxRoot) return imageRaw;
if (imageRaw.startsWith("~")) return resolveUserPath(imageRaw);
return imageRaw;
})();
const resolvedPathInfo: { resolved: string; rewrittenFrom?: string } = isDataUrl
? { resolved: "" }
: sandboxRoot
? await resolveSandboxedImagePath({
sandboxRoot,
imagePath: resolvedImage,
})
: {
resolved: resolvedImage.startsWith("file://")
? resolvedImage.slice("file://".length)
: resolvedImage,
};
const resolvedPath = isDataUrl ? null : resolvedPathInfo.resolved;
2026-01-12 18:17:39 +00:00
const media = isDataUrl
? decodeDataUrl(resolvedImage)
: await loadWebMedia(resolvedPath ?? resolvedImage, maxBytes);
2026-01-04 19:35:00 +01:00
if (media.kind !== "image") {
throw new Error(`Unsupported media type: ${media.kind}`);
}
const mimeType =
("contentType" in media && media.contentType) ||
("mimeType" in media && media.mimeType) ||
"image/png";
2026-01-04 19:35:00 +01:00
const base64 = media.buffer.toString("base64");
const result = await runImagePrompt({
cfg: options?.config,
agentDir,
2026-01-12 17:50:44 +00:00
imageModelConfig,
2026-01-04 19:35:00 +01:00
modelOverride,
prompt: promptRaw,
base64,
mimeType,
});
return {
content: [{ type: "text", text: result.text }],
details: {
model: `${result.provider}/${result.model}`,
image: resolvedImage,
...(resolvedPathInfo.rewrittenFrom
? { rewrittenFrom: resolvedPathInfo.rewrittenFrom }
: {}),
2026-01-12 18:45:46 +00:00
attempts: result.attempts,
2026-01-04 19:35:00 +01:00
},
};
},
};
}