diff --git a/extensions/anthropic/index.ts b/extensions/anthropic/index.ts index 25cb604dbcb..4cad353908b 100644 --- a/extensions/anthropic/index.ts +++ b/extensions/anthropic/index.ts @@ -28,6 +28,7 @@ import { } from "openclaw/plugin-sdk/provider-auth"; import { normalizeModelCompat } from "openclaw/plugin-sdk/provider-models"; import { fetchClaudeUsage } from "openclaw/plugin-sdk/provider-usage"; +import { anthropicMediaUnderstandingProvider } from "./media-understanding-provider.js"; const PROVIDER_ID = "anthropic"; const DEFAULT_ANTHROPIC_MODEL = "anthropic/claude-sonnet-4-6"; @@ -396,6 +397,7 @@ const anthropicPlugin = { profileId: ctx.profileId, }), }); + api.registerMediaUnderstandingProvider(anthropicMediaUnderstandingProvider); }, }; diff --git a/extensions/anthropic/media-understanding-provider.ts b/extensions/anthropic/media-understanding-provider.ts index 5b1f0711705..68a95c93546 100644 --- a/extensions/anthropic/media-understanding-provider.ts +++ b/extensions/anthropic/media-understanding-provider.ts @@ -1,5 +1,6 @@ import { describeImageWithModel, + describeImagesWithModel, type MediaUnderstandingProvider, } from "openclaw/plugin-sdk/media-understanding"; @@ -7,4 +8,5 @@ export const anthropicMediaUnderstandingProvider: MediaUnderstandingProvider = { id: "anthropic", capabilities: ["image"], describeImage: describeImageWithModel, + describeImages: describeImagesWithModel, }; diff --git a/extensions/google/media-understanding-provider.ts b/extensions/google/media-understanding-provider.ts index a64f26ca6c8..97b008ee578 100644 --- a/extensions/google/media-understanding-provider.ts +++ b/extensions/google/media-understanding-provider.ts @@ -2,6 +2,7 @@ import { normalizeGoogleModelId, parseGeminiAuth } from "openclaw/plugin-sdk/goo import { assertOkOrThrowHttpError, describeImageWithModel, + describeImagesWithModel, normalizeBaseUrl, postJsonRequest, type AudioTranscriptionRequest, @@ -142,6 +143,7 @@ export const googleMediaUnderstandingProvider: MediaUnderstandingProvider = { id: "google", capabilities: ["image", "audio", "video"], describeImage: describeImageWithModel, + describeImages: describeImagesWithModel, transcribeAudio: transcribeGeminiAudio, describeVideo: describeGeminiVideo, }; diff --git a/extensions/minimax/index.ts b/extensions/minimax/index.ts index 30894be556d..1ebf7382d52 100644 --- a/extensions/minimax/index.ts +++ b/extensions/minimax/index.ts @@ -13,6 +13,10 @@ import { listProfilesForProvider, } from "openclaw/plugin-sdk/provider-auth"; import { fetchMinimaxUsage } from "openclaw/plugin-sdk/provider-usage"; +import { + minimaxMediaUnderstandingProvider, + minimaxPortalMediaUnderstandingProvider, +} from "./media-understanding-provider.js"; import { loginMiniMaxPortalOAuth, type MiniMaxRegion } from "./oauth.js"; import { applyMinimaxApiConfig, applyMinimaxApiConfigCn } from "./onboard.js"; import { buildMinimaxPortalProvider, buildMinimaxProvider } from "./provider-catalog.js"; @@ -273,6 +277,8 @@ const minimaxPlugin = { ], isModernModelRef: ({ modelId }) => isModernMiniMaxModel(modelId), }); + api.registerMediaUnderstandingProvider(minimaxMediaUnderstandingProvider); + api.registerMediaUnderstandingProvider(minimaxPortalMediaUnderstandingProvider); }, }; diff --git a/extensions/minimax/media-understanding-provider.ts b/extensions/minimax/media-understanding-provider.ts index 2bda4f4d193..4501a96dee9 100644 --- a/extensions/minimax/media-understanding-provider.ts +++ b/extensions/minimax/media-understanding-provider.ts @@ -1,5 +1,6 @@ import { describeImageWithModel, + describeImagesWithModel, type MediaUnderstandingProvider, } from "openclaw/plugin-sdk/media-understanding"; @@ -7,10 +8,12 @@ export const minimaxMediaUnderstandingProvider: MediaUnderstandingProvider = { id: "minimax", capabilities: ["image"], describeImage: describeImageWithModel, + describeImages: describeImagesWithModel, }; export const minimaxPortalMediaUnderstandingProvider: MediaUnderstandingProvider = { id: "minimax-portal", capabilities: ["image"], describeImage: describeImageWithModel, + describeImages: describeImagesWithModel, }; diff --git a/extensions/mistral/index.ts b/extensions/mistral/index.ts index 72b3b6a60ac..5a15c50a857 100644 --- a/extensions/mistral/index.ts +++ b/extensions/mistral/index.ts @@ -1,5 +1,6 @@ import { emptyPluginConfigSchema, type OpenClawPluginApi } from "openclaw/plugin-sdk/core"; import { createProviderApiKeyAuthMethod } from "openclaw/plugin-sdk/provider-auth"; +import { mistralMediaUnderstandingProvider } from "./media-understanding-provider.js"; import { applyMistralConfig, MISTRAL_DEFAULT_MODEL_REF } from "./onboard.js"; const PROVIDER_ID = "mistral"; @@ -50,6 +51,7 @@ const mistralPlugin = { ], }, }); + api.registerMediaUnderstandingProvider(mistralMediaUnderstandingProvider); }, }; diff --git a/extensions/moonshot/index.ts b/extensions/moonshot/index.ts index e8d7ecedb0c..80bd7af6763 100644 --- a/extensions/moonshot/index.ts +++ b/extensions/moonshot/index.ts @@ -9,6 +9,7 @@ import { getScopedCredentialValue, setScopedCredentialValue, } from "openclaw/plugin-sdk/provider-web-search"; +import { moonshotMediaUnderstandingProvider } from "./media-understanding-provider.js"; import { applyMoonshotConfig, applyMoonshotConfigCn, @@ -98,6 +99,7 @@ const moonshotPlugin = { return createMoonshotThinkingWrapper(ctx.streamFn, thinkingType); }, }); + api.registerMediaUnderstandingProvider(moonshotMediaUnderstandingProvider); api.registerWebSearchProvider( createPluginBackedWebSearchProvider({ id: "kimi", diff --git a/extensions/moonshot/media-understanding-provider.ts b/extensions/moonshot/media-understanding-provider.ts index 5814ee96e22..6c652ae58d3 100644 --- a/extensions/moonshot/media-understanding-provider.ts +++ b/extensions/moonshot/media-understanding-provider.ts @@ -1,11 +1,12 @@ import { - assertOkOrThrowHttpError, describeImageWithModel, - normalizeBaseUrl, - postJsonRequest, + describeImagesWithModel, type MediaUnderstandingProvider, type VideoDescriptionRequest, type VideoDescriptionResult, + assertOkOrThrowHttpError, + normalizeBaseUrl, + postJsonRequest, } from "openclaw/plugin-sdk/media-understanding"; export const DEFAULT_MOONSHOT_VIDEO_BASE_URL = "https://api.moonshot.ai/v1"; @@ -116,5 +117,6 @@ export const moonshotMediaUnderstandingProvider: MediaUnderstandingProvider = { id: "moonshot", capabilities: ["image", "video"], describeImage: describeImageWithModel, + describeImages: describeImagesWithModel, describeVideo: describeMoonshotVideo, }; diff --git a/extensions/openai/index.ts b/extensions/openai/index.ts index 831e49acdd8..d22b7275691 100644 --- a/extensions/openai/index.ts +++ b/extensions/openai/index.ts @@ -1,5 +1,6 @@ import { emptyPluginConfigSchema, type OpenClawPluginApi } from "openclaw/plugin-sdk/core"; import { buildOpenAISpeechProvider } from "openclaw/plugin-sdk/speech"; +import { openaiMediaUnderstandingProvider } from "./media-understanding-provider.js"; import { buildOpenAICodexProviderPlugin } from "./openai-codex-provider.js"; import { buildOpenAIProvider } from "./openai-provider.js"; @@ -12,6 +13,7 @@ const openAIPlugin = { api.registerProvider(buildOpenAIProvider()); api.registerProvider(buildOpenAICodexProviderPlugin()); api.registerSpeechProvider(buildOpenAISpeechProvider()); + api.registerMediaUnderstandingProvider(openaiMediaUnderstandingProvider); }, }; diff --git a/extensions/openai/media-understanding-provider.ts b/extensions/openai/media-understanding-provider.ts index dcb0a731a91..9fb66df20dc 100644 --- a/extensions/openai/media-understanding-provider.ts +++ b/extensions/openai/media-understanding-provider.ts @@ -1,5 +1,6 @@ import { describeImageWithModel, + describeImagesWithModel, transcribeOpenAiCompatibleAudio, type AudioTranscriptionRequest, type MediaUnderstandingProvider, @@ -20,5 +21,6 @@ export const openaiMediaUnderstandingProvider: MediaUnderstandingProvider = { id: "openai", capabilities: ["image", "audio"], describeImage: describeImageWithModel, + describeImages: describeImagesWithModel, transcribeAudio: transcribeOpenAiAudio, }; diff --git a/extensions/zai/index.ts b/extensions/zai/index.ts index 0faef49c4fb..109bf5144a1 100644 --- a/extensions/zai/index.ts +++ b/extensions/zai/index.ts @@ -25,6 +25,7 @@ import { DEFAULT_CONTEXT_TOKENS, normalizeModelCompat } from "openclaw/plugin-sd import { createZaiToolStreamWrapper } from "openclaw/plugin-sdk/provider-stream"; import { fetchZaiUsage } from "openclaw/plugin-sdk/provider-usage"; import { detectZaiEndpoint, type ZaiEndpointId } from "./detect.js"; +import { zaiMediaUnderstandingProvider } from "./media-understanding-provider.js"; import { applyZaiConfig, applyZaiProviderConfig, ZAI_DEFAULT_MODEL_REF } from "./onboard.js"; const PROVIDER_ID = "zai"; @@ -333,6 +334,7 @@ const zaiPlugin = { fetchUsageSnapshot: async (ctx) => await fetchZaiUsage(ctx.token, ctx.timeoutMs, ctx.fetchFn), isCacheTtlEligible: () => true, }); + api.registerMediaUnderstandingProvider(zaiMediaUnderstandingProvider); }, }; diff --git a/extensions/zai/media-understanding-provider.ts b/extensions/zai/media-understanding-provider.ts index 08f8c186d4d..bd571230b2d 100644 --- a/extensions/zai/media-understanding-provider.ts +++ b/extensions/zai/media-understanding-provider.ts @@ -1,5 +1,6 @@ import { describeImageWithModel, + describeImagesWithModel, type MediaUnderstandingProvider, } from "openclaw/plugin-sdk/media-understanding"; @@ -7,4 +8,5 @@ export const zaiMediaUnderstandingProvider: MediaUnderstandingProvider = { id: "zai", capabilities: ["image"], describeImage: describeImageWithModel, + describeImages: describeImagesWithModel, }; diff --git a/src/agents/tools/image-tool.test.ts b/src/agents/tools/image-tool.test.ts index bcec7f32de7..c58a7f9aa1a 100644 --- a/src/agents/tools/image-tool.test.ts +++ b/src/agents/tools/image-tool.test.ts @@ -32,6 +32,7 @@ async function withTempAgentDir(run: (agentDir: string) => Promise): Promi const ONE_PIXEL_PNG_B64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/woAAn8B9FD5fHAAAAAASUVORK5CYII="; const ONE_PIXEL_GIF_B64 = "R0lGODlhAQABAIABAP///wAAACwAAAAAAQABAAACAkQBADs="; +const ONE_PIXEL_JPEG_B64 = "QUJDRA=="; async function withTempWorkspacePng( cb: (args: { workspaceDir: string; imagePath: string }) => Promise, @@ -736,10 +737,10 @@ describe("image tool MiniMax VLM routing", () => { const res = await tool.execute("t1", { prompt: "Compare these images.", - images: [`data:image/png;base64,${pngB64}`, `data:image/gif;base64,${ONE_PIXEL_GIF_B64}`], + images: [`data:image/png;base64,${pngB64}`, `data:image/jpeg;base64,${ONE_PIXEL_JPEG_B64}`], }); - expect(fetch).toHaveBeenCalledTimes(1); + expect(fetch).toHaveBeenCalledTimes(2); const details = res.details as | { images?: Array<{ image: string }>; @@ -756,12 +757,12 @@ describe("image tool MiniMax VLM routing", () => { image: `data:image/png;base64,${pngB64}`, images: [ `data:image/png;base64,${pngB64}`, - `data:image/gif;base64,${ONE_PIXEL_GIF_B64}`, - `data:image/gif;base64,${ONE_PIXEL_GIF_B64}`, + `data:image/jpeg;base64,${ONE_PIXEL_JPEG_B64}`, + `data:image/jpeg;base64,${ONE_PIXEL_JPEG_B64}`, ], }); - expect(fetch).toHaveBeenCalledTimes(1); + expect(fetch).toHaveBeenCalledTimes(2); const dedupedDetails = deduped.details as | { images?: Array<{ image: string }>; @@ -776,7 +777,7 @@ describe("image tool MiniMax VLM routing", () => { maxImages: 1, }); - expect(fetch).toHaveBeenCalledTimes(1); + expect(fetch).toHaveBeenCalledTimes(2); expect(tooMany.details).toMatchObject({ error: "too_many_images", count: 2, diff --git a/src/agents/tools/image-tool.ts b/src/agents/tools/image-tool.ts index 402ee0b3eda..8dd471b8a7d 100644 --- a/src/agents/tools/image-tool.ts +++ b/src/agents/tools/image-tool.ts @@ -1,9 +1,10 @@ -import { type Context, complete } from "@mariozechner/pi-ai"; import { Type } from "@sinclair/typebox"; import type { OpenClawConfig } from "../../config/config.js"; +import { getMediaUnderstandingProvider } from "../../media-understanding/providers/index.js"; +import { buildProviderRegistry } from "../../media-understanding/runner.js"; import { loadWebMedia } from "../../plugin-sdk/web-media.js"; import { resolveUserPath } from "../../utils.js"; -import { isMinimaxVlmModel, isMinimaxVlmProvider, minimaxUnderstandImage } from "../minimax-vlm.js"; +import { isMinimaxVlmProvider } from "../minimax-vlm.js"; import { coerceImageAssistantText, coerceImageModelConfig, @@ -14,17 +15,12 @@ import { import { applyImageModelConfigDefaults, buildTextToolResult, - resolveModelFromRegistry, resolveMediaToolLocalRoots, - resolveModelRuntimeApiKey, resolvePromptAndModelOverride, } from "./media-tool-shared.js"; import { hasAuthForProvider, resolveDefaultModelRef } from "./model-config.helpers.js"; import { createSandboxBridgeReadFile, - discoverAuthStorage, - discoverModels, - ensureOpenClawModelsJson, resolveSandboxedBridgeMediaPath, runWithImageModelFallback, type AnyAgentTool, @@ -168,27 +164,6 @@ function pickMaxBytes(cfg?: OpenClawConfig, maxBytesMb?: number): number | undef return undefined; } -function buildImageContext( - prompt: string, - images: Array<{ base64: string; mimeType: string }>, -): Context { - const content: Array< - { type: "text"; text: string } | { type: "image"; data: string; mimeType: string } - > = [{ type: "text", text: prompt }]; - for (const img of images) { - content.push({ type: "image", data: img.base64, mimeType: img.mimeType }); - } - return { - messages: [ - { - role: "user", - content, - timestamp: Date.now(), - }, - ], - }; -} - type ImageSandboxConfig = { root: string; bridge: SandboxFsBridge; @@ -200,7 +175,7 @@ async function runImagePrompt(params: { imageModelConfig: ImageModelConfig; modelOverride?: string; prompt: string; - images: Array<{ base64: string; mimeType: string }>; + images: Array<{ buffer: Buffer; mimeType: string }>; }): Promise<{ text: string; provider: string; @@ -208,50 +183,75 @@ async function runImagePrompt(params: { attempts: Array<{ provider: string; model: string; error: string }>; }> { const effectiveCfg = applyImageModelConfigDefaults(params.cfg, params.imageModelConfig); - - await ensureOpenClawModelsJson(effectiveCfg, params.agentDir); - const authStorage = discoverAuthStorage(params.agentDir); - const modelRegistry = discoverModels(authStorage, params.agentDir); + const providerCfg: OpenClawConfig = effectiveCfg ?? {}; + const providerRegistry = buildProviderRegistry(undefined, providerCfg); const result = await runWithImageModelFallback({ cfg: effectiveCfg, modelOverride: params.modelOverride, run: async (provider, modelId) => { - const model = resolveModelFromRegistry({ modelRegistry, provider, modelId }); - if (!model.input?.includes("image")) { - throw new Error(`Model does not support images: ${provider}/${modelId}`); + const imageProvider = getMediaUnderstandingProvider(provider, providerRegistry); + if (!imageProvider) { + throw new Error(`No media-understanding provider registered for ${provider}`); } - const apiKey = await resolveModelRuntimeApiKey({ - model, - cfg: effectiveCfg, - agentDir: params.agentDir, - authStorage, - }); - - // MiniMax VLM only supports a single image; use the first one. - if (isMinimaxVlmModel(model.provider, model.id)) { - const first = params.images[0]; - const imageDataUrl = `data:${first.mimeType};base64,${first.base64}`; - const text = await minimaxUnderstandImage({ - apiKey, + if (params.images.length > 1 && imageProvider.describeImages) { + const described = await imageProvider.describeImages({ + images: params.images.map((image, index) => ({ + buffer: image.buffer, + fileName: `image-${index + 1}`, + mime: image.mimeType, + })), + provider, + model: modelId, prompt: params.prompt, - imageDataUrl, - modelBaseUrl: model.baseUrl, + maxTokens: resolveImageToolMaxTokens(undefined), + timeoutMs: 30_000, + cfg: providerCfg, + agentDir: params.agentDir, }); - return { text, provider: model.provider, model: model.id }; + return { text: described.text, provider, model: described.model ?? modelId }; + } + if (!imageProvider.describeImage) { + throw new Error(`Provider does not support image analysis: ${provider}`); + } + if (params.images.length === 1) { + const image = params.images[0]; + const described = await imageProvider.describeImage({ + buffer: image.buffer, + fileName: "image-1", + mime: image.mimeType, + provider, + model: modelId, + prompt: params.prompt, + maxTokens: resolveImageToolMaxTokens(undefined), + timeoutMs: 30_000, + cfg: providerCfg, + agentDir: params.agentDir, + }); + return { text: described.text, provider, model: described.model ?? modelId }; } - const context = buildImageContext(params.prompt, params.images); - const message = await complete(model, context, { - apiKey, - maxTokens: resolveImageToolMaxTokens(model.maxTokens), - }); - const text = coerceImageAssistantText({ - message, - provider: model.provider, - model: model.id, - }); - return { text, provider: model.provider, model: model.id }; + const parts: string[] = []; + for (const [index, image] of params.images.entries()) { + const described = await imageProvider.describeImage({ + buffer: image.buffer, + fileName: `image-${index + 1}`, + mime: image.mimeType, + provider, + model: modelId, + prompt: `${params.prompt}\n\nDescribe image ${index + 1} of ${params.images.length}.`, + maxTokens: resolveImageToolMaxTokens(undefined), + timeoutMs: 30_000, + cfg: providerCfg, + agentDir: params.agentDir, + }); + parts.push(`Image ${index + 1}:\n${described.text.trim()}`); + } + return { + text: parts.join("\n\n").trim(), + provider, + model: modelId, + }; }, }); @@ -383,7 +383,7 @@ export function createImageTool(options?: { // MARK: - Load and resolve each image const loadedImages: Array<{ - base64: string; + buffer: Buffer; mimeType: string; resolvedImage: string; rewrittenFrom?: string; @@ -469,9 +469,8 @@ export function createImageTool(options?: { ("contentType" in media && media.contentType) || ("mimeType" in media && media.mimeType) || "image/png"; - const base64 = media.buffer.toString("base64"); loadedImages.push({ - base64, + buffer: media.buffer, mimeType, resolvedImage, ...(resolvedPathInfo.rewrittenFrom @@ -487,7 +486,7 @@ export function createImageTool(options?: { imageModelConfig, modelOverride, prompt: promptRaw, - images: loadedImages.map((img) => ({ base64: img.base64, mimeType: img.mimeType })), + images: loadedImages.map((img) => ({ buffer: img.buffer, mimeType: img.mimeType })), }); const imageDetails = diff --git a/src/media-understanding/providers/image.test.ts b/src/media-understanding/providers/image.test.ts index 51c8739f43a..d52c6590eef 100644 --- a/src/media-understanding/providers/image.test.ts +++ b/src/media-understanding/providers/image.test.ts @@ -8,9 +8,15 @@ const getApiKeyForModelMock = vi.fn(async () => ({ source: "test", mode: "oauth", })); +const resolveApiKeyForProviderMock = vi.fn(async () => ({ + apiKey: "oauth-test", // pragma: allowlist secret + source: "test", + mode: "oauth", +})); const requireApiKeyMock = vi.fn((auth: { apiKey?: string }) => auth.apiKey ?? ""); const setRuntimeApiKeyMock = vi.fn(); const discoverModelsMock = vi.fn(); +let imageImportSeq = 0; vi.mock("@mariozechner/pi-ai", async (importOriginal) => { const actual = await importOriginal(); @@ -34,6 +40,7 @@ vi.mock("../../agents/models-config.js", () => ({ vi.mock("../../agents/model-auth.js", () => ({ getApiKeyForModel: getApiKeyForModelMock, + resolveApiKeyForProvider: resolveApiKeyForProviderMock, requireApiKey: requireApiKeyMock, })); @@ -44,6 +51,11 @@ vi.mock("../../agents/pi-model-discovery-runtime.js", () => ({ discoverModels: discoverModelsMock, })); +async function importImageModule() { + imageImportSeq += 1; + return await import(/* @vite-ignore */ `./image.js?case=${imageImportSeq}`); +} + describe("describeImageWithModel", () => { beforeEach(() => { vi.clearAllMocks(); @@ -59,7 +71,7 @@ describe("describeImageWithModel", () => { }); it("routes minimax-portal image models through the MiniMax VLM endpoint", async () => { - const { describeImageWithModel } = await import("./image.js"); + const { describeImageWithModel } = await importImageModule(); const result = await describeImageWithModel({ cfg: {}, @@ -109,7 +121,7 @@ describe("describeImageWithModel", () => { content: [{ type: "text", text: "generic ok" }], }); - const { describeImageWithModel } = await import("./image.js"); + const { describeImageWithModel } = await importImageModule(); const result = await describeImageWithModel({ cfg: {}, @@ -153,7 +165,7 @@ describe("describeImageWithModel", () => { content: [{ type: "text", text: "flash ok" }], }); - const { describeImageWithModel } = await import("./image.js"); + const { describeImageWithModel } = await importImageModule(); const result = await describeImageWithModel({ cfg: {}, @@ -203,7 +215,7 @@ describe("describeImageWithModel", () => { content: [{ type: "text", text: "flash lite ok" }], }); - const { describeImageWithModel } = await import("./image.js"); + const { describeImageWithModel } = await importImageModule(); const result = await describeImageWithModel({ cfg: {}, diff --git a/src/media-understanding/providers/image.ts b/src/media-understanding/providers/image.ts index 1511a7c9bb9..9d7dc67949b 100644 --- a/src/media-understanding/providers/image.ts +++ b/src/media-understanding/providers/image.ts @@ -1,11 +1,20 @@ import type { Api, Context, Model } from "@mariozechner/pi-ai"; import { complete } from "@mariozechner/pi-ai"; import { isMinimaxVlmModel, minimaxUnderstandImage } from "../../agents/minimax-vlm.js"; -import { getApiKeyForModel, requireApiKey } from "../../agents/model-auth.js"; +import { + getApiKeyForModel, + requireApiKey, + resolveApiKeyForProvider, +} from "../../agents/model-auth.js"; import { normalizeModelRef } from "../../agents/model-selection.js"; import { ensureOpenClawModelsJson } from "../../agents/models-config.js"; import { coerceImageAssistantText } from "../../agents/tools/image-tool.helpers.js"; -import type { ImageDescriptionRequest, ImageDescriptionResult } from "../types.js"; +import type { + ImageDescriptionRequest, + ImageDescriptionResult, + ImagesDescriptionRequest, + ImagesDescriptionResult, +} from "../types.js"; let piModelDiscoveryRuntimePromise: Promise< typeof import("../../agents/pi-model-discovery-runtime.js") @@ -16,14 +25,29 @@ function loadPiModelDiscoveryRuntime() { return piModelDiscoveryRuntimePromise; } -export async function describeImageWithModel( - params: ImageDescriptionRequest, -): Promise { +function resolveImageToolMaxTokens(modelMaxTokens: number | undefined, requestedMaxTokens = 4096) { + if ( + typeof modelMaxTokens !== "number" || + !Number.isFinite(modelMaxTokens) || + modelMaxTokens <= 0 + ) { + return requestedMaxTokens; + } + return Math.min(requestedMaxTokens, modelMaxTokens); +} + +async function resolveImageRuntime(params: { + cfg: ImageDescriptionRequest["cfg"]; + agentDir: string; + provider: string; + model: string; + profile?: string; + preferredProfile?: string; +}): Promise<{ apiKey: string; model: Model }> { await ensureOpenClawModelsJson(params.cfg, params.agentDir); const { discoverAuthStorage, discoverModels } = await loadPiModelDiscoveryRuntime(); const authStorage = discoverAuthStorage(params.agentDir); const modelRegistry = discoverModels(authStorage, params.agentDir); - // Keep direct media config entries compatible with deprecated provider model aliases. const resolvedRef = normalizeModelRef(params.provider, params.model); const model = modelRegistry.find(resolvedRef.provider, resolvedRef.model) as Model | null; if (!model) { @@ -41,33 +65,132 @@ export async function describeImageWithModel( }); const apiKey = requireApiKey(apiKeyInfo, model.provider); authStorage.setRuntimeApiKey(model.provider, apiKey); + return { apiKey, model }; +} - const base64 = params.buffer.toString("base64"); - if (isMinimaxVlmModel(model.provider, model.id)) { - const text = await minimaxUnderstandImage({ - apiKey, - prompt: params.prompt ?? "Describe the image.", - imageDataUrl: `data:${params.mime ?? "image/jpeg"};base64,${base64}`, - modelBaseUrl: model.baseUrl, - }); - return { text, model: model.id }; - } - - const context: Context = { +function buildImageContext( + prompt: string, + images: Array<{ buffer: Buffer; mime?: string }>, +): Context { + return { messages: [ { role: "user", content: [ - { type: "text", text: params.prompt ?? "Describe the image." }, - { type: "image", data: base64, mimeType: params.mime ?? "image/jpeg" }, + { type: "text", text: prompt }, + ...images.map((image) => ({ + type: "image" as const, + data: image.buffer.toString("base64"), + mimeType: image.mime ?? "image/jpeg", + })), ], timestamp: Date.now(), }, ], }; +} + +async function describeImagesWithMinimax(params: { + apiKey: string; + modelId: string; + modelBaseUrl?: string; + prompt: string; + images: Array<{ buffer: Buffer; mime?: string }>; +}): Promise { + const responses: string[] = []; + for (const [index, image] of params.images.entries()) { + const prompt = + params.images.length > 1 + ? `${params.prompt}\n\nDescribe image ${index + 1} of ${params.images.length} independently.` + : params.prompt; + const text = await minimaxUnderstandImage({ + apiKey: params.apiKey, + prompt, + imageDataUrl: `data:${image.mime ?? "image/jpeg"};base64,${image.buffer.toString("base64")}`, + modelBaseUrl: params.modelBaseUrl, + }); + responses.push(params.images.length > 1 ? `Image ${index + 1}:\n${text.trim()}` : text.trim()); + } + return { + text: responses.join("\n\n").trim(), + model: params.modelId, + }; +} + +function isUnknownModelError(err: unknown): boolean { + return err instanceof Error && /^Unknown model:/i.test(err.message); +} + +function resolveConfiguredProviderBaseUrl( + cfg: ImageDescriptionRequest["cfg"], + provider: string, +): string | undefined { + const direct = cfg.models?.providers?.[provider]; + if (typeof direct?.baseUrl === "string" && direct.baseUrl.trim()) { + return direct.baseUrl.trim(); + } + return undefined; +} + +async function resolveMinimaxVlmFallbackRuntime(params: { + cfg: ImageDescriptionRequest["cfg"]; + agentDir: string; + provider: string; + profile?: string; + preferredProfile?: string; +}): Promise<{ apiKey: string; modelBaseUrl?: string }> { + const auth = await resolveApiKeyForProvider({ + provider: params.provider, + cfg: params.cfg, + profileId: params.profile, + preferredProfile: params.preferredProfile, + agentDir: params.agentDir, + }); + return { + apiKey: requireApiKey(auth, params.provider), + modelBaseUrl: resolveConfiguredProviderBaseUrl(params.cfg, params.provider), + }; +} + +export async function describeImagesWithModel( + params: ImagesDescriptionRequest, +): Promise { + const prompt = params.prompt ?? "Describe the image."; + let apiKey: string; + let model: Model | undefined; + + try { + const resolved = await resolveImageRuntime(params); + apiKey = resolved.apiKey; + model = resolved.model; + } catch (err) { + if (!isMinimaxVlmModel(params.provider, params.model) || !isUnknownModelError(err)) { + throw err; + } + const fallback = await resolveMinimaxVlmFallbackRuntime(params); + return await describeImagesWithMinimax({ + apiKey: fallback.apiKey, + modelId: params.model, + modelBaseUrl: fallback.modelBaseUrl, + prompt, + images: params.images, + }); + } + + if (isMinimaxVlmModel(model.provider, model.id)) { + return await describeImagesWithMinimax({ + apiKey, + modelId: model.id, + modelBaseUrl: model.baseUrl, + prompt, + images: params.images, + }); + } + + const context = buildImageContext(prompt, params.images); const message = await complete(model, context, { apiKey, - maxTokens: params.maxTokens ?? 512, + maxTokens: resolveImageToolMaxTokens(model.maxTokens, params.maxTokens ?? 512), }); const text = coerceImageAssistantText({ message, @@ -76,3 +199,26 @@ export async function describeImageWithModel( }); return { text, model: model.id }; } + +export async function describeImageWithModel( + params: ImageDescriptionRequest, +): Promise { + return await describeImagesWithModel({ + images: [ + { + buffer: params.buffer, + fileName: params.fileName, + mime: params.mime, + }, + ], + model: params.model, + provider: params.provider, + prompt: params.prompt, + maxTokens: params.maxTokens, + timeoutMs: params.timeoutMs, + profile: params.profile, + preferredProfile: params.preferredProfile, + agentDir: params.agentDir, + cfg: params.cfg, + }); +} diff --git a/src/media-understanding/providers/index.ts b/src/media-understanding/providers/index.ts index 67a45fc2019..32d1d6bcf9a 100644 --- a/src/media-understanding/providers/index.ts +++ b/src/media-understanding/providers/index.ts @@ -1,10 +1,33 @@ +import { anthropicMediaUnderstandingProvider } from "../../../extensions/anthropic/media-understanding-provider.js"; +import { googleMediaUnderstandingProvider } from "../../../extensions/google/media-understanding-provider.js"; +import { + minimaxMediaUnderstandingProvider, + minimaxPortalMediaUnderstandingProvider, +} from "../../../extensions/minimax/media-understanding-provider.js"; +import { mistralMediaUnderstandingProvider } from "../../../extensions/mistral/media-understanding-provider.js"; +import { moonshotMediaUnderstandingProvider } from "../../../extensions/moonshot/media-understanding-provider.js"; +import { openaiMediaUnderstandingProvider } from "../../../extensions/openai/media-understanding-provider.js"; +import { zaiMediaUnderstandingProvider } from "../../../extensions/zai/media-understanding-provider.js"; import { normalizeProviderId } from "../../agents/model-selection.js"; +import type { OpenClawConfig } from "../../config/config.js"; +import { loadOpenClawPlugins } from "../../plugins/loader.js"; import { getActivePluginRegistry } from "../../plugins/runtime.js"; import type { MediaUnderstandingProvider } from "../types.js"; import { deepgramProvider } from "./deepgram/index.js"; import { groqProvider } from "./groq/index.js"; -const PROVIDERS: MediaUnderstandingProvider[] = [groqProvider, deepgramProvider]; +const PROVIDERS: MediaUnderstandingProvider[] = [ + groqProvider, + deepgramProvider, + anthropicMediaUnderstandingProvider, + googleMediaUnderstandingProvider, + minimaxMediaUnderstandingProvider, + minimaxPortalMediaUnderstandingProvider, + mistralMediaUnderstandingProvider, + moonshotMediaUnderstandingProvider, + openaiMediaUnderstandingProvider, + zaiMediaUnderstandingProvider, +]; function mergeProviderIntoRegistry( registry: Map, @@ -32,12 +55,18 @@ export function normalizeMediaProviderId(id: string): string { export function buildMediaUnderstandingRegistry( overrides?: Record, + cfg?: OpenClawConfig, ): Map { const registry = new Map(); for (const provider of PROVIDERS) { mergeProviderIntoRegistry(registry, provider); } - for (const entry of getActivePluginRegistry()?.mediaUnderstandingProviders ?? []) { + const active = getActivePluginRegistry(); + const pluginRegistry = + (active?.mediaUnderstandingProviders?.length ?? 0) > 0 || !cfg + ? active + : loadOpenClawPlugins({ config: cfg }); + for (const entry of pluginRegistry?.mediaUnderstandingProviders ?? []) { mergeProviderIntoRegistry(registry, entry.provider); } if (overrides) { diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts index a04cc6420fa..807edb45c22 100644 --- a/src/media-understanding/runner.ts +++ b/src/media-understanding/runner.ts @@ -75,8 +75,9 @@ export type RunCapabilityResult = { export function buildProviderRegistry( overrides?: Record, + cfg?: OpenClawConfig, ): ProviderRegistry { - return buildMediaUnderstandingRegistry(overrides); + return buildMediaUnderstandingRegistry(overrides, cfg); } export function normalizeMediaAttachments(ctx: MsgContext): MediaAttachment[] { diff --git a/src/media-understanding/runtime.ts b/src/media-understanding/runtime.ts index e9351921dac..043baf81f91 100644 --- a/src/media-understanding/runtime.ts +++ b/src/media-understanding/runtime.ts @@ -48,7 +48,7 @@ export async function runMediaUnderstandingFile( return { text: undefined }; } - const providerRegistry = buildProviderRegistry(); + const providerRegistry = buildProviderRegistry(undefined, params.cfg); const cache = createMediaAttachmentCache(attachments, { localPathRoots: [path.dirname(params.filePath)], }); diff --git a/src/media-understanding/types.ts b/src/media-understanding/types.ts index 60c425626de..36c467e105f 100644 --- a/src/media-understanding/types.ts +++ b/src/media-understanding/types.ts @@ -90,6 +90,25 @@ export type ImageDescriptionRequest = { buffer: Buffer; fileName: string; mime?: string; + prompt?: string; + maxTokens?: number; + timeoutMs: number; + profile?: string; + preferredProfile?: string; + agentDir: string; + cfg: import("../config/config.js").OpenClawConfig; + model: string; + provider: string; +}; + +export type ImagesDescriptionInput = { + buffer: Buffer; + fileName: string; + mime?: string; +}; + +export type ImagesDescriptionRequest = { + images: ImagesDescriptionInput[]; model: string; provider: string; prompt?: string; @@ -106,10 +125,16 @@ export type ImageDescriptionResult = { model?: string; }; +export type ImagesDescriptionResult = { + text: string; + model?: string; +}; + export type MediaUnderstandingProvider = { id: string; capabilities?: MediaUnderstandingCapability[]; transcribeAudio?: (req: AudioTranscriptionRequest) => Promise; describeVideo?: (req: VideoDescriptionRequest) => Promise; describeImage?: (req: ImageDescriptionRequest) => Promise; + describeImages?: (req: ImagesDescriptionRequest) => Promise; }; diff --git a/src/plugin-sdk/media-understanding.ts b/src/plugin-sdk/media-understanding.ts index 052736afc3d..0d14685dbdf 100644 --- a/src/plugin-sdk/media-understanding.ts +++ b/src/plugin-sdk/media-understanding.ts @@ -5,12 +5,15 @@ export type { AudioTranscriptionResult, ImageDescriptionRequest, ImageDescriptionResult, + ImagesDescriptionInput, + ImagesDescriptionRequest, + ImagesDescriptionResult, MediaUnderstandingProvider, VideoDescriptionRequest, VideoDescriptionResult, } from "../media-understanding/types.js"; -export { describeImageWithModel } from "../media-understanding/providers/image.js"; +export { describeImageWithModel, describeImagesWithModel } from "../media-understanding/providers/image.js"; export { transcribeOpenAiCompatibleAudio } from "../media-understanding/providers/openai-compatible-audio.js"; export { assertOkOrThrowHttpError, diff --git a/src/plugins/contracts/registry.contract.test.ts b/src/plugins/contracts/registry.contract.test.ts index 06430449808..0f6d588ea1a 100644 --- a/src/plugins/contracts/registry.contract.test.ts +++ b/src/plugins/contracts/registry.contract.test.ts @@ -43,6 +43,16 @@ function findMediaUnderstandingProviderIdsForPlugin(pluginId: string) { .toSorted((left, right) => left.localeCompare(right)); } +function findMediaUnderstandingProviderForPlugin(pluginId: string) { + const entry = mediaUnderstandingProviderContractRegistry.find( + (candidate) => candidate.pluginId === pluginId, + ); + if (!entry) { + throw new Error(`media-understanding provider contract missing for ${pluginId}`); + } + return entry.provider; +} + function findRegistrationForPlugin(pluginId: string) { const entry = pluginRegistrationContractRegistry.find( (candidate) => candidate.pluginId === pluginId, @@ -141,4 +151,25 @@ describe("plugin contract registry", () => { expect(findSpeechProviderForPlugin("elevenlabs").listVoices).toEqual(expect.any(Function)); expect(findSpeechProviderForPlugin("microsoft").listVoices).toEqual(expect.any(Function)); }); + + it("keeps bundled multi-image support explicit", () => { + expect(findMediaUnderstandingProviderForPlugin("anthropic").describeImages).toEqual( + expect.any(Function), + ); + expect(findMediaUnderstandingProviderForPlugin("google").describeImages).toEqual( + expect.any(Function), + ); + expect(findMediaUnderstandingProviderForPlugin("minimax").describeImages).toEqual( + expect.any(Function), + ); + expect(findMediaUnderstandingProviderForPlugin("moonshot").describeImages).toEqual( + expect.any(Function), + ); + expect(findMediaUnderstandingProviderForPlugin("openai").describeImages).toEqual( + expect.any(Function), + ); + expect(findMediaUnderstandingProviderForPlugin("zai").describeImages).toEqual( + expect.any(Function), + ); + }); });