2026-01-12 20:38:11 +00:00
import fs from "node:fs/promises" ;
import path from "node:path" ;
2026-01-04 21:56:16 +01:00
import {
type Api ,
type AssistantMessage ,
type Context ,
complete ,
type Model ,
} from "@mariozechner/pi-ai" ;
2026-01-31 06:40:45 +01:00
import { discoverAuthStorage , discoverModels } from "../pi-model-discovery.js" ;
2026-01-04 19:35:00 +01:00
import { Type } from "@sinclair/typebox" ;
2026-01-30 03:15:10 +01:00
import type { OpenClawConfig } from "../../config/config.js" ;
2026-01-04 21:56:16 +01:00
import { resolveUserPath } from "../../utils.js" ;
2026-01-04 19:35:00 +01:00
import { loadWebMedia } from "../../web/media.js" ;
2026-01-14 14:31:43 +00:00
import { ensureAuthProfileStore , listProfilesForProvider } from "../auth-profiles.js" ;
2026-01-12 17:50:44 +00:00
import { DEFAULT_MODEL , DEFAULT_PROVIDER } from "../defaults.js" ;
2026-01-12 20:38:11 +00:00
import { minimaxUnderstandImage } from "../minimax-vlm.js" ;
2026-01-20 07:53:25 +00:00
import { getApiKeyForModel , requireApiKey , resolveEnvApiKey } from "../model-auth.js" ;
2026-01-04 19:35:00 +01:00
import { runWithImageModelFallback } from "../model-fallback.js" ;
2026-01-18 08:06:50 +00:00
import { resolveConfiguredModelRef } from "../model-selection.js" ;
2026-01-30 03:15:10 +01:00
import { ensureOpenClawModelsJson } from "../models-config.js" ;
2026-01-12 17:56:48 +00:00
import { assertSandboxPath } from "../sandbox-paths.js" ;
2026-01-04 19:35:00 +01:00
import type { AnyAgentTool } from "./common.js" ;
2026-01-14 01:08:15 +00:00
import {
coerceImageAssistantText ,
coerceImageModelConfig ,
decodeDataUrl ,
type ImageModelConfig ,
resolveProviderVisionModelFromConfig ,
} from "./image-tool.helpers.js" ;
2026-01-04 19:35:00 +01:00
/** Prompt used when the tool call supplies no (or only whitespace) `prompt`. */
const DEFAULT_PROMPT = "Describe the image.";

/** Helpers exposed under `__testing` (presumably so unit tests can reach them). */
export const __testing = {
  decodeDataUrl,
  coerceImageAssistantText,
} as const;
2026-01-30 03:15:10 +01:00
/**
 * Resolve the provider/model pair treated as the primary model.
 *
 * With a config, delegates to `resolveConfiguredModelRef`, passing the
 * built-in defaults as its fallback values. Without a config, the built-in
 * defaults are returned directly.
 *
 * @param cfg Optional application config.
 * @returns The `{ provider, model }` pair.
 */
function resolveDefaultModelRef(cfg?: OpenClawConfig): {
  provider: string;
  model: string;
} {
  if (!cfg) {
    return { provider: DEFAULT_PROVIDER, model: DEFAULT_MODEL };
  }
  const resolved = resolveConfiguredModelRef({
    cfg,
    defaultProvider: DEFAULT_PROVIDER,
    defaultModel: DEFAULT_MODEL,
  });
  // Copy only the two fields callers need.
  return { provider: resolved.provider, model: resolved.model };
}
2026-01-14 14:31:43 +00:00
/**
 * Report whether any credential is available for `provider`.
 *
 * An API key resolved from the environment is checked first; otherwise the
 * auth profile store for `agentDir` is consulted (opened with
 * `allowKeychainPrompt: false`) and the provider counts as authenticated when
 * at least one profile is listed for it.
 */
function hasAuthForProvider(params: { provider: string; agentDir: string }): boolean {
  const { provider, agentDir } = params;
  if (resolveEnvApiKey(provider)?.apiKey) {
    return true;
  }
  const store = ensureAuthProfileStore(agentDir, { allowKeychainPrompt: false });
  return listProfilesForProvider(store, provider).length > 0;
}
/**
 * Resolve the effective image model config for the `image` tool.
 *
 * - Prefer explicit config (`agents.defaults.imageModel`).
 * - Otherwise, try to "pair" the primary model with an image-capable model:
 *   - same provider (best effort)
 *   - fall back to OpenAI/Anthropic when available
 *
 * @returns The config to use, or `null` when nothing is configured and no
 *   usable provider credentials were found.
 */
export function resolveImageModelConfigForTool(params: {
  cfg?: OpenClawConfig;
  agentDir: string;
}): ImageModelConfig | null {
  // Note: We intentionally do NOT gate based on primarySupportsImages here.
  // Even when the primary model supports images, we keep the tool available
  // because images are auto-injected into prompts (see attempt.ts detectAndLoadPromptImages).
  // The tool description is adjusted via modelHasVision to discourage redundant usage.
  const explicit = coerceImageModelConfig(params.cfg);
  if (explicit.primary?.trim() || (explicit.fallbacks?.length ?? 0) > 0) {
    return explicit;
  }

  const OPENAI_VISION_REF = "openai/gpt-5-mini";
  const ANTHROPIC_VISION_REF = "anthropic/claude-opus-4-5";
  const MINIMAX_VISION_REF = "minimax/MiniMax-VL-01";

  const hasAuth = (provider: string): boolean =>
    hasAuthForProvider({ provider, agentDir: params.agentDir });

  const primary = resolveDefaultModelRef(params.cfg);
  const openaiOk = hasAuth("openai");
  const anthropicOk = hasAuth("anthropic");
  const providerVisionFromConfig = resolveProviderVisionModelFromConfig({
    cfg: params.cfg,
    provider: primary.provider,
  });
  const providerOk = hasAuth(primary.provider);

  let preferred: string | null = null;
  // MiniMax users: always try the canonical vision model first when auth exists.
  if (primary.provider === "minimax" && providerOk) {
    preferred = MINIMAX_VISION_REF;
  } else if (providerOk && providerVisionFromConfig) {
    preferred = providerVisionFromConfig;
  } else if (primary.provider === "openai" && openaiOk) {
    preferred = OPENAI_VISION_REF;
  } else if (primary.provider === "anthropic" && anthropicOk) {
    preferred = ANTHROPIC_VISION_REF;
  }

  // A whitespace-only ref from config is treated as "no preference".
  if (preferred?.trim()) {
    // OpenAI first, then Anthropic; never repeat the primary as a fallback.
    const fallbacks: string[] = [];
    if (openaiOk && preferred !== OPENAI_VISION_REF) fallbacks.push(OPENAI_VISION_REF);
    if (anthropicOk && preferred !== ANTHROPIC_VISION_REF) fallbacks.push(ANTHROPIC_VISION_REF);
    return {
      primary: preferred,
      ...(fallbacks.length > 0 ? { fallbacks } : {}),
    };
  }

  // Cross-provider fallback when we can't pair with the primary provider.
  if (openaiOk) {
    return {
      primary: OPENAI_VISION_REF,
      ...(anthropicOk ? { fallbacks: [ANTHROPIC_VISION_REF] } : {}),
    };
  }
  if (anthropicOk) {
    return { primary: ANTHROPIC_VISION_REF };
  }
  return null;
}
2026-01-30 03:15:10 +01:00
/**
 * Pick the byte limit to apply when loading an image.
 *
 * The per-call `maxBytesMb` wins when it is a finite, positive number;
 * otherwise `cfg.agents.defaults.mediaMaxMb` is used under the same rule.
 * Megabytes are converted as `mb * 1024 * 1024` and floored, so a very small
 * positive value can floor to `0`.
 *
 * @param cfg Optional application config.
 * @param maxBytesMb Optional per-call limit in megabytes.
 * @returns The limit in bytes, or `undefined` when neither source is usable.
 */
function pickMaxBytes(cfg?: OpenClawConfig, maxBytesMb?: number): number | undefined {
  const toBytes = (megabytes: unknown): number | undefined =>
    typeof megabytes === "number" && Number.isFinite(megabytes) && megabytes > 0
      ? Math.floor(megabytes * 1024 * 1024)
      : undefined;
  // `??` (not `||`) so a floored 0 from the per-call value is still returned.
  return toBytes(maxBytesMb) ?? toBytes(cfg?.agents?.defaults?.mediaMaxMb);
}
2026-01-14 14:31:43 +00:00
/**
 * Build a single-message context asking a model about one image.
 *
 * The message has role `"user"`, a text part carrying `prompt` followed by an
 * image part carrying the base64 payload and its MIME type, and is stamped
 * with the current time.
 */
function buildImageContext(prompt: string, base64: string, mimeType: string): Context {
  const textPart = { type: "text", text: prompt } as const;
  const imagePart = { type: "image", data: base64, mimeType } as const;
  return {
    messages: [
      {
        role: "user",
        content: [textPart, imagePart],
        timestamp: Date.now(),
      },
    ],
  };
}
2026-01-12 20:38:11 +00:00
/**
 * Resolve an image path so that it stays inside the sandbox root.
 *
 * A leading `file://` (matched case-insensitively, since URI schemes are
 * case-insensitive) is stripped, then the path is validated with
 * `assertSandboxPath` using the sandbox root as both `cwd` and `root`.
 *
 * If that validation throws, a second attempt is made with
 * `media/inbound/<basename>` under the sandbox root, but only when that file
 * exists; otherwise the original validation error is rethrown.
 *
 * @returns `resolved` is the validated path. `rewrittenFrom` holds the
 *   original (scheme-stripped) path when the `media/inbound` candidate was
 *   used instead.
 * @throws The error from the first `assertSandboxPath` call when no
 *   `media/inbound` candidate exists.
 */
async function resolveSandboxedImagePath(params: {
  sandboxRoot: string;
  imagePath: string;
}): Promise<{ resolved: string; rewrittenFrom?: string }> {
  const { sandboxRoot, imagePath } = params;
  const filePath = imagePath.replace(/^file:\/\//i, "");
  try {
    const direct = await assertSandboxPath({
      filePath,
      cwd: sandboxRoot,
      root: sandboxRoot,
    });
    return { resolved: direct.resolved };
  } catch (err) {
    const candidateRel = path.join("media", "inbound", path.basename(filePath));
    const candidateAbs = path.join(sandboxRoot, candidateRel);
    try {
      await fs.stat(candidateAbs);
    } catch {
      // No candidate on disk: surface the original sandbox violation.
      throw err;
    }
    const rewritten = await assertSandboxPath({
      filePath: candidateRel,
      cwd: sandboxRoot,
      root: sandboxRoot,
    });
    return { resolved: rewritten.resolved, rewrittenFrom: filePath };
  }
}
2026-01-04 19:35:00 +01:00
/**
 * Ask an image-capable model about one image, with model fallback.
 *
 * The supplied `imageModelConfig` is overlaid onto `cfg.agents.defaults.imageModel`
 * and the work is driven by `runWithImageModelFallback`. For each candidate the
 * callback looks the model up in the discovered registry, rejects it when it is
 * unknown or does not list `"image"` as an input, resolves an API key, and then
 * either calls the MiniMax VLM helper (provider `"minimax"`) or `complete`
 * with `maxTokens: 512`.
 *
 * @returns The answer text, the provider/model that produced it, and the list
 *   of attempts reported by the fallback runner.
 */
async function runImagePrompt(params: {
  cfg?: OpenClawConfig;
  agentDir: string;
  imageModelConfig: ImageModelConfig;
  modelOverride?: string;
  prompt: string;
  base64: string;
  mimeType: string;
}): Promise<{
  text: string;
  provider: string;
  model: string;
  attempts: Array<{ provider: string; model: string; error: string }>;
}> {
  const { cfg, agentDir } = params;

  // NOTE(review): when `cfg` is undefined the resolved `imageModelConfig` is not
  // forwarded at all — confirm the fallback runner copes with an undefined config.
  const effectiveCfg: OpenClawConfig | undefined = cfg
    ? {
        ...cfg,
        agents: {
          ...cfg.agents,
          defaults: {
            ...cfg.agents?.defaults,
            imageModel: params.imageModelConfig,
          },
        },
      }
    : undefined;

  await ensureOpenClawModelsJson(effectiveCfg, agentDir);
  const authStorage = discoverAuthStorage(agentDir);
  const modelRegistry = discoverModels(authStorage, agentDir);

  const outcome = await runWithImageModelFallback({
    cfg: effectiveCfg,
    modelOverride: params.modelOverride,
    run: async (provider, modelId) => {
      const model = modelRegistry.find(provider, modelId) as Model<Api> | null;
      if (!model) {
        throw new Error(`Unknown model: ${provider}/${modelId}`);
      }
      if (!model.input?.includes("image")) {
        throw new Error(`Model does not support images: ${provider}/${modelId}`);
      }

      const apiKeyInfo = await getApiKeyForModel({
        model,
        cfg: effectiveCfg,
        agentDir,
      });
      const apiKey = requireApiKey(apiKeyInfo, model.provider);
      authStorage.setRuntimeApiKey(model.provider, apiKey);

      if (model.provider === "minimax") {
        // MiniMax goes through its own helper, which takes a data URL.
        const imageDataUrl = `data:${params.mimeType};base64,${params.base64}`;
        const text = await minimaxUnderstandImage({
          apiKey,
          prompt: params.prompt,
          imageDataUrl,
          modelBaseUrl: model.baseUrl,
        });
        return { text, provider: model.provider, model: model.id };
      }

      const context = buildImageContext(params.prompt, params.base64, params.mimeType);
      const message = (await complete(model, context, {
        apiKey,
        maxTokens: 512,
      })) as AssistantMessage;
      const text = coerceImageAssistantText({
        message,
        provider: model.provider,
        model: model.id,
      });
      return { text, provider: model.provider, model: model.id };
    },
  });

  return {
    text: outcome.result.text,
    provider: outcome.result.provider,
    model: outcome.result.model,
    // Copy only the documented fields of each attempt.
    attempts: outcome.attempts.map((attempt) => ({
      provider: attempt.provider,
      model: attempt.model,
      error: attempt.error,
    })),
  };
}
/**
 * Create the `image` agent tool, or return `null` when it should be disabled.
 *
 * The tool is disabled when no `agentDir` is given (unless an image model is
 * explicitly configured, which is treated as a programming error) or when no
 * image model config can be resolved.
 *
 * The tool's `execute` accepts a file path, a `file://` URL, a `data:` URL, or
 * an http(s) URL (remote URLs are rejected when `sandboxRoot` is set), loads
 * the image, and runs the prompt through `runImagePrompt`.
 *
 * @throws Error when an image model is explicitly configured but `agentDir`
 *   is missing.
 */
export function createImageTool(options?: {
  config?: OpenClawConfig;
  agentDir?: string;
  sandboxRoot?: string;
  /** If true, the model has native vision capability and images in the prompt are auto-injected */
  modelHasVision?: boolean;
}): AnyAgentTool | null {
  const agentDir = options?.agentDir?.trim();
  if (!agentDir) {
    const explicit = coerceImageModelConfig(options?.config);
    if (explicit.primary?.trim() || (explicit.fallbacks?.length ?? 0) > 0) {
      throw new Error("createImageTool requires agentDir when enabled");
    }
    return null;
  }

  const imageModelConfig = resolveImageModelConfigForTool({
    cfg: options?.config,
    agentDir,
  });
  if (!imageModelConfig) return null;

  // If model has native vision, images in the prompt are auto-injected
  // so this tool is only needed when image wasn't provided in the prompt
  const description = options?.modelHasVision
    ? "Analyze an image with a vision model. Only use this tool when the image was NOT already provided in the user's message. Images mentioned in the prompt are automatically visible to you."
    : "Analyze an image with the configured image model (agents.defaults.imageModel). Provide a prompt and image path or URL.";

  return {
    label: "Image",
    name: "image",
    description,
    parameters: Type.Object({
      prompt: Type.Optional(Type.String()),
      image: Type.String(),
      model: Type.Optional(Type.String()),
      maxBytesMb: Type.Optional(Type.Number()),
    }),
    execute: async (_toolCallId, args) => {
      const record = args && typeof args === "object" ? (args as Record<string, unknown>) : {};
      const imageRawInput = typeof record.image === "string" ? record.image.trim() : "";
      // A leading "@" is dropped before the reference is interpreted.
      const imageRaw = imageRawInput.startsWith("@")
        ? imageRawInput.slice(1).trim()
        : imageRawInput;
      if (!imageRaw) throw new Error("image required");

      // The tool accepts file paths, file/data URLs, or http(s) URLs. In some
      // agent/model contexts, images can be referenced as pseudo-URIs like
      // `image:0` (e.g. "first image in the prompt"). We don't have access to a
      // shared image registry here, so fail gracefully instead of attempting to
      // `fs.readFile("image:0")` and producing a noisy ENOENT.
      const looksLikeWindowsDrivePath = /^[a-zA-Z]:[\\/]/.test(imageRaw);
      const hasScheme = /^[a-z][a-z0-9+.-]*:/i.test(imageRaw);
      const isFileUrl = /^file:/i.test(imageRaw);
      const isHttpUrl = /^https?:\/\//i.test(imageRaw);
      const isDataUrl = /^data:/i.test(imageRaw);
      if (hasScheme && !looksLikeWindowsDrivePath && !isFileUrl && !isHttpUrl && !isDataUrl) {
        return {
          content: [
            {
              type: "text",
              text: `Unsupported image reference: ${imageRawInput}. Use a file path, a file:// URL, a data: URL, or an http(s) URL.`,
            },
          ],
          details: {
            error: "unsupported_image_reference",
            image: imageRawInput,
          },
        };
      }

      const promptRaw =
        typeof record.prompt === "string" && record.prompt.trim()
          ? record.prompt.trim()
          : DEFAULT_PROMPT;
      const modelOverride =
        typeof record.model === "string" && record.model.trim() ? record.model.trim() : undefined;
      const maxBytesMb = typeof record.maxBytesMb === "number" ? record.maxBytesMb : undefined;
      const maxBytes = pickMaxBytes(options?.config, maxBytesMb);

      const sandboxRoot = options?.sandboxRoot?.trim();
      if (sandboxRoot && isHttpUrl) {
        throw new Error("Sandboxed image tool does not allow remote URLs.");
      }

      // "~" is only expanded outside the sandbox.
      const resolvedImage =
        !sandboxRoot && imageRaw.startsWith("~") ? resolveUserPath(imageRaw) : imageRaw;

      let resolvedPathInfo: { resolved: string; rewrittenFrom?: string };
      if (isDataUrl) {
        // Data URLs carry their own payload; there is no path to resolve.
        resolvedPathInfo = { resolved: "" };
      } else if (sandboxRoot) {
        resolvedPathInfo = await resolveSandboxedImagePath({
          sandboxRoot,
          imagePath: resolvedImage,
        });
      } else {
        // Strip the scheme case-insensitively, matching the `isFileUrl` check above.
        resolvedPathInfo = { resolved: resolvedImage.replace(/^file:\/\//i, "") };
      }

      const media = isDataUrl
        ? decodeDataUrl(resolvedImage)
        : await loadWebMedia(resolvedPathInfo.resolved, maxBytes);
      if (media.kind !== "image") {
        throw new Error(`Unsupported media type: ${media.kind}`);
      }

      // The two loaders name the MIME field differently; default to PNG.
      const mimeType =
        ("contentType" in media && media.contentType) ||
        ("mimeType" in media && media.mimeType) ||
        "image/png";
      const base64 = media.buffer.toString("base64");

      const result = await runImagePrompt({
        cfg: options?.config,
        agentDir,
        imageModelConfig,
        modelOverride,
        prompt: promptRaw,
        base64,
        mimeType,
      });

      return {
        content: [{ type: "text", text: result.text }],
        details: {
          model: `${result.provider}/${result.model}`,
          image: resolvedImage,
          ...(resolvedPathInfo.rewrittenFrom
            ? { rewrittenFrom: resolvedPathInfo.rewrittenFrom }
            : {}),
          attempts: result.attempts,
        },
      };
    },
  };
}