feat(media): route image tool through media providers

This commit is contained in:
Peter Steinberger 2026-03-16 21:59:39 -07:00
parent 7fa3825e80
commit 50c3321d2e
No known key found for this signature in database
22 changed files with 382 additions and 106 deletions

View File

@ -28,6 +28,7 @@ import {
} from "openclaw/plugin-sdk/provider-auth";
import { normalizeModelCompat } from "openclaw/plugin-sdk/provider-models";
import { fetchClaudeUsage } from "openclaw/plugin-sdk/provider-usage";
import { anthropicMediaUnderstandingProvider } from "./media-understanding-provider.js";
const PROVIDER_ID = "anthropic";
const DEFAULT_ANTHROPIC_MODEL = "anthropic/claude-sonnet-4-6";
@ -396,6 +397,7 @@ const anthropicPlugin = {
profileId: ctx.profileId,
}),
});
api.registerMediaUnderstandingProvider(anthropicMediaUnderstandingProvider);
},
};

View File

@ -1,5 +1,6 @@
import {
describeImageWithModel,
describeImagesWithModel,
type MediaUnderstandingProvider,
} from "openclaw/plugin-sdk/media-understanding";
@ -7,4 +8,5 @@ export const anthropicMediaUnderstandingProvider: MediaUnderstandingProvider = {
id: "anthropic",
capabilities: ["image"],
describeImage: describeImageWithModel,
describeImages: describeImagesWithModel,
};

View File

@ -2,6 +2,7 @@ import { normalizeGoogleModelId, parseGeminiAuth } from "openclaw/plugin-sdk/goo
import {
assertOkOrThrowHttpError,
describeImageWithModel,
describeImagesWithModel,
normalizeBaseUrl,
postJsonRequest,
type AudioTranscriptionRequest,
@ -142,6 +143,7 @@ export const googleMediaUnderstandingProvider: MediaUnderstandingProvider = {
id: "google",
capabilities: ["image", "audio", "video"],
describeImage: describeImageWithModel,
describeImages: describeImagesWithModel,
transcribeAudio: transcribeGeminiAudio,
describeVideo: describeGeminiVideo,
};

View File

@ -13,6 +13,10 @@ import {
listProfilesForProvider,
} from "openclaw/plugin-sdk/provider-auth";
import { fetchMinimaxUsage } from "openclaw/plugin-sdk/provider-usage";
import {
minimaxMediaUnderstandingProvider,
minimaxPortalMediaUnderstandingProvider,
} from "./media-understanding-provider.js";
import { loginMiniMaxPortalOAuth, type MiniMaxRegion } from "./oauth.js";
import { applyMinimaxApiConfig, applyMinimaxApiConfigCn } from "./onboard.js";
import { buildMinimaxPortalProvider, buildMinimaxProvider } from "./provider-catalog.js";
@ -273,6 +277,8 @@ const minimaxPlugin = {
],
isModernModelRef: ({ modelId }) => isModernMiniMaxModel(modelId),
});
api.registerMediaUnderstandingProvider(minimaxMediaUnderstandingProvider);
api.registerMediaUnderstandingProvider(minimaxPortalMediaUnderstandingProvider);
},
};

View File

@ -1,5 +1,6 @@
import {
describeImageWithModel,
describeImagesWithModel,
type MediaUnderstandingProvider,
} from "openclaw/plugin-sdk/media-understanding";
@ -7,10 +8,12 @@ export const minimaxMediaUnderstandingProvider: MediaUnderstandingProvider = {
id: "minimax",
capabilities: ["image"],
describeImage: describeImageWithModel,
describeImages: describeImagesWithModel,
};
// Portal variant of the MiniMax media-understanding provider. Registered under
// the "minimax-portal" id so portal-authenticated profiles resolve to the same
// shared model-backed image helpers as the direct "minimax" provider above.
export const minimaxPortalMediaUnderstandingProvider: MediaUnderstandingProvider = {
id: "minimax-portal",
// Only image understanding is advertised; no audio/video handlers are wired up.
capabilities: ["image"],
describeImage: describeImageWithModel,
describeImages: describeImagesWithModel,
};

View File

@ -1,5 +1,6 @@
import { emptyPluginConfigSchema, type OpenClawPluginApi } from "openclaw/plugin-sdk/core";
import { createProviderApiKeyAuthMethod } from "openclaw/plugin-sdk/provider-auth";
import { mistralMediaUnderstandingProvider } from "./media-understanding-provider.js";
import { applyMistralConfig, MISTRAL_DEFAULT_MODEL_REF } from "./onboard.js";
const PROVIDER_ID = "mistral";
@ -50,6 +51,7 @@ const mistralPlugin = {
],
},
});
api.registerMediaUnderstandingProvider(mistralMediaUnderstandingProvider);
},
};

View File

@ -9,6 +9,7 @@ import {
getScopedCredentialValue,
setScopedCredentialValue,
} from "openclaw/plugin-sdk/provider-web-search";
import { moonshotMediaUnderstandingProvider } from "./media-understanding-provider.js";
import {
applyMoonshotConfig,
applyMoonshotConfigCn,
@ -98,6 +99,7 @@ const moonshotPlugin = {
return createMoonshotThinkingWrapper(ctx.streamFn, thinkingType);
},
});
api.registerMediaUnderstandingProvider(moonshotMediaUnderstandingProvider);
api.registerWebSearchProvider(
createPluginBackedWebSearchProvider({
id: "kimi",

View File

@ -1,11 +1,12 @@
import {
assertOkOrThrowHttpError,
describeImageWithModel,
normalizeBaseUrl,
postJsonRequest,
describeImagesWithModel,
type MediaUnderstandingProvider,
type VideoDescriptionRequest,
type VideoDescriptionResult,
assertOkOrThrowHttpError,
normalizeBaseUrl,
postJsonRequest,
} from "openclaw/plugin-sdk/media-understanding";
export const DEFAULT_MOONSHOT_VIDEO_BASE_URL = "https://api.moonshot.ai/v1";
@ -116,5 +117,6 @@ export const moonshotMediaUnderstandingProvider: MediaUnderstandingProvider = {
id: "moonshot",
capabilities: ["image", "video"],
describeImage: describeImageWithModel,
describeImages: describeImagesWithModel,
describeVideo: describeMoonshotVideo,
};

View File

@ -1,5 +1,6 @@
import { emptyPluginConfigSchema, type OpenClawPluginApi } from "openclaw/plugin-sdk/core";
import { buildOpenAISpeechProvider } from "openclaw/plugin-sdk/speech";
import { openaiMediaUnderstandingProvider } from "./media-understanding-provider.js";
import { buildOpenAICodexProviderPlugin } from "./openai-codex-provider.js";
import { buildOpenAIProvider } from "./openai-provider.js";
@ -12,6 +13,7 @@ const openAIPlugin = {
api.registerProvider(buildOpenAIProvider());
api.registerProvider(buildOpenAICodexProviderPlugin());
api.registerSpeechProvider(buildOpenAISpeechProvider());
api.registerMediaUnderstandingProvider(openaiMediaUnderstandingProvider);
},
};

View File

@ -1,5 +1,6 @@
import {
describeImageWithModel,
describeImagesWithModel,
transcribeOpenAiCompatibleAudio,
type AudioTranscriptionRequest,
type MediaUnderstandingProvider,
@ -20,5 +21,6 @@ export const openaiMediaUnderstandingProvider: MediaUnderstandingProvider = {
id: "openai",
capabilities: ["image", "audio"],
describeImage: describeImageWithModel,
describeImages: describeImagesWithModel,
transcribeAudio: transcribeOpenAiAudio,
};

View File

@ -25,6 +25,7 @@ import { DEFAULT_CONTEXT_TOKENS, normalizeModelCompat } from "openclaw/plugin-sd
import { createZaiToolStreamWrapper } from "openclaw/plugin-sdk/provider-stream";
import { fetchZaiUsage } from "openclaw/plugin-sdk/provider-usage";
import { detectZaiEndpoint, type ZaiEndpointId } from "./detect.js";
import { zaiMediaUnderstandingProvider } from "./media-understanding-provider.js";
import { applyZaiConfig, applyZaiProviderConfig, ZAI_DEFAULT_MODEL_REF } from "./onboard.js";
const PROVIDER_ID = "zai";
@ -333,6 +334,7 @@ const zaiPlugin = {
fetchUsageSnapshot: async (ctx) => await fetchZaiUsage(ctx.token, ctx.timeoutMs, ctx.fetchFn),
isCacheTtlEligible: () => true,
});
api.registerMediaUnderstandingProvider(zaiMediaUnderstandingProvider);
},
};

View File

@ -1,5 +1,6 @@
import {
describeImageWithModel,
describeImagesWithModel,
type MediaUnderstandingProvider,
} from "openclaw/plugin-sdk/media-understanding";
@ -7,4 +8,5 @@ export const zaiMediaUnderstandingProvider: MediaUnderstandingProvider = {
id: "zai",
capabilities: ["image"],
describeImage: describeImageWithModel,
describeImages: describeImagesWithModel,
};

View File

@ -32,6 +32,7 @@ async function withTempAgentDir<T>(run: (agentDir: string) => Promise<T>): Promi
const ONE_PIXEL_PNG_B64 =
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/woAAn8B9FD5fHAAAAAASUVORK5CYII=";
const ONE_PIXEL_GIF_B64 = "R0lGODlhAQABAIABAP///wAAACwAAAAAAQABAAACAkQBADs=";
const ONE_PIXEL_JPEG_B64 = "QUJDRA==";
async function withTempWorkspacePng(
cb: (args: { workspaceDir: string; imagePath: string }) => Promise<void>,
@ -736,10 +737,10 @@ describe("image tool MiniMax VLM routing", () => {
const res = await tool.execute("t1", {
prompt: "Compare these images.",
images: [`data:image/png;base64,${pngB64}`, `data:image/gif;base64,${ONE_PIXEL_GIF_B64}`],
images: [`data:image/png;base64,${pngB64}`, `data:image/jpeg;base64,${ONE_PIXEL_JPEG_B64}`],
});
expect(fetch).toHaveBeenCalledTimes(1);
expect(fetch).toHaveBeenCalledTimes(2);
const details = res.details as
| {
images?: Array<{ image: string }>;
@ -756,12 +757,12 @@ describe("image tool MiniMax VLM routing", () => {
image: `data:image/png;base64,${pngB64}`,
images: [
`data:image/png;base64,${pngB64}`,
`data:image/gif;base64,${ONE_PIXEL_GIF_B64}`,
`data:image/gif;base64,${ONE_PIXEL_GIF_B64}`,
`data:image/jpeg;base64,${ONE_PIXEL_JPEG_B64}`,
`data:image/jpeg;base64,${ONE_PIXEL_JPEG_B64}`,
],
});
expect(fetch).toHaveBeenCalledTimes(1);
expect(fetch).toHaveBeenCalledTimes(2);
const dedupedDetails = deduped.details as
| {
images?: Array<{ image: string }>;
@ -776,7 +777,7 @@ describe("image tool MiniMax VLM routing", () => {
maxImages: 1,
});
expect(fetch).toHaveBeenCalledTimes(1);
expect(fetch).toHaveBeenCalledTimes(2);
expect(tooMany.details).toMatchObject({
error: "too_many_images",
count: 2,

View File

@ -1,9 +1,10 @@
import { type Context, complete } from "@mariozechner/pi-ai";
import { Type } from "@sinclair/typebox";
import type { OpenClawConfig } from "../../config/config.js";
import { getMediaUnderstandingProvider } from "../../media-understanding/providers/index.js";
import { buildProviderRegistry } from "../../media-understanding/runner.js";
import { loadWebMedia } from "../../plugin-sdk/web-media.js";
import { resolveUserPath } from "../../utils.js";
import { isMinimaxVlmModel, isMinimaxVlmProvider, minimaxUnderstandImage } from "../minimax-vlm.js";
import { isMinimaxVlmProvider } from "../minimax-vlm.js";
import {
coerceImageAssistantText,
coerceImageModelConfig,
@ -14,17 +15,12 @@ import {
import {
applyImageModelConfigDefaults,
buildTextToolResult,
resolveModelFromRegistry,
resolveMediaToolLocalRoots,
resolveModelRuntimeApiKey,
resolvePromptAndModelOverride,
} from "./media-tool-shared.js";
import { hasAuthForProvider, resolveDefaultModelRef } from "./model-config.helpers.js";
import {
createSandboxBridgeReadFile,
discoverAuthStorage,
discoverModels,
ensureOpenClawModelsJson,
resolveSandboxedBridgeMediaPath,
runWithImageModelFallback,
type AnyAgentTool,
@ -168,27 +164,6 @@ function pickMaxBytes(cfg?: OpenClawConfig, maxBytesMb?: number): number | undef
return undefined;
}
function buildImageContext(
prompt: string,
images: Array<{ base64: string; mimeType: string }>,
): Context {
const content: Array<
{ type: "text"; text: string } | { type: "image"; data: string; mimeType: string }
> = [{ type: "text", text: prompt }];
for (const img of images) {
content.push({ type: "image", data: img.base64, mimeType: img.mimeType });
}
return {
messages: [
{
role: "user",
content,
timestamp: Date.now(),
},
],
};
}
type ImageSandboxConfig = {
root: string;
bridge: SandboxFsBridge;
@ -200,7 +175,7 @@ async function runImagePrompt(params: {
imageModelConfig: ImageModelConfig;
modelOverride?: string;
prompt: string;
images: Array<{ base64: string; mimeType: string }>;
images: Array<{ buffer: Buffer; mimeType: string }>;
}): Promise<{
text: string;
provider: string;
@ -208,50 +183,75 @@ async function runImagePrompt(params: {
attempts: Array<{ provider: string; model: string; error: string }>;
}> {
const effectiveCfg = applyImageModelConfigDefaults(params.cfg, params.imageModelConfig);
await ensureOpenClawModelsJson(effectiveCfg, params.agentDir);
const authStorage = discoverAuthStorage(params.agentDir);
const modelRegistry = discoverModels(authStorage, params.agentDir);
const providerCfg: OpenClawConfig = effectiveCfg ?? {};
const providerRegistry = buildProviderRegistry(undefined, providerCfg);
const result = await runWithImageModelFallback({
cfg: effectiveCfg,
modelOverride: params.modelOverride,
run: async (provider, modelId) => {
const model = resolveModelFromRegistry({ modelRegistry, provider, modelId });
if (!model.input?.includes("image")) {
throw new Error(`Model does not support images: ${provider}/${modelId}`);
const imageProvider = getMediaUnderstandingProvider(provider, providerRegistry);
if (!imageProvider) {
throw new Error(`No media-understanding provider registered for ${provider}`);
}
const apiKey = await resolveModelRuntimeApiKey({
model,
cfg: effectiveCfg,
agentDir: params.agentDir,
authStorage,
});
// MiniMax VLM only supports a single image; use the first one.
if (isMinimaxVlmModel(model.provider, model.id)) {
const first = params.images[0];
const imageDataUrl = `data:${first.mimeType};base64,${first.base64}`;
const text = await minimaxUnderstandImage({
apiKey,
if (params.images.length > 1 && imageProvider.describeImages) {
const described = await imageProvider.describeImages({
images: params.images.map((image, index) => ({
buffer: image.buffer,
fileName: `image-${index + 1}`,
mime: image.mimeType,
})),
provider,
model: modelId,
prompt: params.prompt,
imageDataUrl,
modelBaseUrl: model.baseUrl,
maxTokens: resolveImageToolMaxTokens(undefined),
timeoutMs: 30_000,
cfg: providerCfg,
agentDir: params.agentDir,
});
return { text, provider: model.provider, model: model.id };
return { text: described.text, provider, model: described.model ?? modelId };
}
if (!imageProvider.describeImage) {
throw new Error(`Provider does not support image analysis: ${provider}`);
}
if (params.images.length === 1) {
const image = params.images[0];
const described = await imageProvider.describeImage({
buffer: image.buffer,
fileName: "image-1",
mime: image.mimeType,
provider,
model: modelId,
prompt: params.prompt,
maxTokens: resolveImageToolMaxTokens(undefined),
timeoutMs: 30_000,
cfg: providerCfg,
agentDir: params.agentDir,
});
return { text: described.text, provider, model: described.model ?? modelId };
}
const context = buildImageContext(params.prompt, params.images);
const message = await complete(model, context, {
apiKey,
maxTokens: resolveImageToolMaxTokens(model.maxTokens),
});
const text = coerceImageAssistantText({
message,
provider: model.provider,
model: model.id,
});
return { text, provider: model.provider, model: model.id };
const parts: string[] = [];
for (const [index, image] of params.images.entries()) {
const described = await imageProvider.describeImage({
buffer: image.buffer,
fileName: `image-${index + 1}`,
mime: image.mimeType,
provider,
model: modelId,
prompt: `${params.prompt}\n\nDescribe image ${index + 1} of ${params.images.length}.`,
maxTokens: resolveImageToolMaxTokens(undefined),
timeoutMs: 30_000,
cfg: providerCfg,
agentDir: params.agentDir,
});
parts.push(`Image ${index + 1}:\n${described.text.trim()}`);
}
return {
text: parts.join("\n\n").trim(),
provider,
model: modelId,
};
},
});
@ -383,7 +383,7 @@ export function createImageTool(options?: {
// MARK: - Load and resolve each image
const loadedImages: Array<{
base64: string;
buffer: Buffer;
mimeType: string;
resolvedImage: string;
rewrittenFrom?: string;
@ -469,9 +469,8 @@ export function createImageTool(options?: {
("contentType" in media && media.contentType) ||
("mimeType" in media && media.mimeType) ||
"image/png";
const base64 = media.buffer.toString("base64");
loadedImages.push({
base64,
buffer: media.buffer,
mimeType,
resolvedImage,
...(resolvedPathInfo.rewrittenFrom
@ -487,7 +486,7 @@ export function createImageTool(options?: {
imageModelConfig,
modelOverride,
prompt: promptRaw,
images: loadedImages.map((img) => ({ base64: img.base64, mimeType: img.mimeType })),
images: loadedImages.map((img) => ({ buffer: img.buffer, mimeType: img.mimeType })),
});
const imageDetails =

View File

@ -8,9 +8,15 @@ const getApiKeyForModelMock = vi.fn(async () => ({
source: "test",
mode: "oauth",
}));
const resolveApiKeyForProviderMock = vi.fn(async () => ({
apiKey: "oauth-test", // pragma: allowlist secret
source: "test",
mode: "oauth",
}));
const requireApiKeyMock = vi.fn((auth: { apiKey?: string }) => auth.apiKey ?? "");
const setRuntimeApiKeyMock = vi.fn();
const discoverModelsMock = vi.fn();
let imageImportSeq = 0;
vi.mock("@mariozechner/pi-ai", async (importOriginal) => {
const actual = await importOriginal<typeof import("@mariozechner/pi-ai")>();
@ -34,6 +40,7 @@ vi.mock("../../agents/models-config.js", () => ({
vi.mock("../../agents/model-auth.js", () => ({
getApiKeyForModel: getApiKeyForModelMock,
resolveApiKeyForProvider: resolveApiKeyForProviderMock,
requireApiKey: requireApiKeyMock,
}));
@ -44,6 +51,11 @@ vi.mock("../../agents/pi-model-discovery-runtime.js", () => ({
discoverModels: discoverModelsMock,
}));
// Imports the image module with a unique query-string suffix so every call
// yields a freshly evaluated module instance (defeats the ESM module cache
// between test cases).
async function importImageModule() {
  const cacheBuster = ++imageImportSeq;
  return await import(/* @vite-ignore */ `./image.js?case=${cacheBuster}`);
}
describe("describeImageWithModel", () => {
beforeEach(() => {
vi.clearAllMocks();
@ -59,7 +71,7 @@ describe("describeImageWithModel", () => {
});
it("routes minimax-portal image models through the MiniMax VLM endpoint", async () => {
const { describeImageWithModel } = await import("./image.js");
const { describeImageWithModel } = await importImageModule();
const result = await describeImageWithModel({
cfg: {},
@ -109,7 +121,7 @@ describe("describeImageWithModel", () => {
content: [{ type: "text", text: "generic ok" }],
});
const { describeImageWithModel } = await import("./image.js");
const { describeImageWithModel } = await importImageModule();
const result = await describeImageWithModel({
cfg: {},
@ -153,7 +165,7 @@ describe("describeImageWithModel", () => {
content: [{ type: "text", text: "flash ok" }],
});
const { describeImageWithModel } = await import("./image.js");
const { describeImageWithModel } = await importImageModule();
const result = await describeImageWithModel({
cfg: {},
@ -203,7 +215,7 @@ describe("describeImageWithModel", () => {
content: [{ type: "text", text: "flash lite ok" }],
});
const { describeImageWithModel } = await import("./image.js");
const { describeImageWithModel } = await importImageModule();
const result = await describeImageWithModel({
cfg: {},

View File

@ -1,11 +1,20 @@
import type { Api, Context, Model } from "@mariozechner/pi-ai";
import { complete } from "@mariozechner/pi-ai";
import { isMinimaxVlmModel, minimaxUnderstandImage } from "../../agents/minimax-vlm.js";
import { getApiKeyForModel, requireApiKey } from "../../agents/model-auth.js";
import {
getApiKeyForModel,
requireApiKey,
resolveApiKeyForProvider,
} from "../../agents/model-auth.js";
import { normalizeModelRef } from "../../agents/model-selection.js";
import { ensureOpenClawModelsJson } from "../../agents/models-config.js";
import { coerceImageAssistantText } from "../../agents/tools/image-tool.helpers.js";
import type { ImageDescriptionRequest, ImageDescriptionResult } from "../types.js";
import type {
ImageDescriptionRequest,
ImageDescriptionResult,
ImagesDescriptionRequest,
ImagesDescriptionResult,
} from "../types.js";
let piModelDiscoveryRuntimePromise: Promise<
typeof import("../../agents/pi-model-discovery-runtime.js")
@ -16,14 +25,29 @@ function loadPiModelDiscoveryRuntime() {
return piModelDiscoveryRuntimePromise;
}
export async function describeImageWithModel(
params: ImageDescriptionRequest,
): Promise<ImageDescriptionResult> {
// Clamps the requested token budget (default 4096) to the model's own max
// output tokens. A missing, non-finite, or non-positive model limit is treated
// as "no limit known" and the requested budget is used unchanged.
function resolveImageToolMaxTokens(modelMaxTokens: number | undefined, requestedMaxTokens = 4096) {
  return typeof modelMaxTokens === "number" && Number.isFinite(modelMaxTokens) && modelMaxTokens > 0
    ? Math.min(requestedMaxTokens, modelMaxTokens)
    : requestedMaxTokens;
}
async function resolveImageRuntime(params: {
cfg: ImageDescriptionRequest["cfg"];
agentDir: string;
provider: string;
model: string;
profile?: string;
preferredProfile?: string;
}): Promise<{ apiKey: string; model: Model<Api> }> {
await ensureOpenClawModelsJson(params.cfg, params.agentDir);
const { discoverAuthStorage, discoverModels } = await loadPiModelDiscoveryRuntime();
const authStorage = discoverAuthStorage(params.agentDir);
const modelRegistry = discoverModels(authStorage, params.agentDir);
// Keep direct media config entries compatible with deprecated provider model aliases.
const resolvedRef = normalizeModelRef(params.provider, params.model);
const model = modelRegistry.find(resolvedRef.provider, resolvedRef.model) as Model<Api> | null;
if (!model) {
@ -41,33 +65,132 @@ export async function describeImageWithModel(
});
const apiKey = requireApiKey(apiKeyInfo, model.provider);
authStorage.setRuntimeApiKey(model.provider, apiKey);
return { apiKey, model };
}
const base64 = params.buffer.toString("base64");
if (isMinimaxVlmModel(model.provider, model.id)) {
const text = await minimaxUnderstandImage({
apiKey,
prompt: params.prompt ?? "Describe the image.",
imageDataUrl: `data:${params.mime ?? "image/jpeg"};base64,${base64}`,
modelBaseUrl: model.baseUrl,
});
return { text, model: model.id };
}
const context: Context = {
/**
 * Builds a single-turn user Context: the prompt text first, followed by every
 * image as an inline base64 attachment.
 *
 * Fix: the span contained stale pre-change lines referencing `params` and
 * `base64` (diff residue), which are not in scope in this signature; the body
 * below is the coherent post-change implementation.
 *
 * @param prompt - Text instruction placed before the images.
 * @param images - Raw image buffers; `mime` defaults to "image/jpeg" when omitted.
 */
function buildImageContext(
  prompt: string,
  images: Array<{ buffer: Buffer; mime?: string }>,
): Context {
  return {
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: prompt },
          ...images.map((image) => ({
            type: "image" as const,
            data: image.buffer.toString("base64"),
            mimeType: image.mime ?? "image/jpeg",
          })),
        ],
        timestamp: Date.now(),
      },
    ],
  };
}
// Describes images through the MiniMax VLM endpoint, which takes one image per
// call. Images are processed sequentially (deliberately not parallelized); in
// the multi-image case each prompt is suffixed with the image's position and
// each response is labeled, then all sections are joined with blank lines.
async function describeImagesWithMinimax(params: {
  apiKey: string;
  modelId: string;
  modelBaseUrl?: string;
  prompt: string;
  images: Array<{ buffer: Buffer; mime?: string }>;
}): Promise<ImagesDescriptionResult> {
  const total = params.images.length;
  const multi = total > 1;
  const sections: string[] = [];
  let position = 0;
  for (const image of params.images) {
    position += 1;
    const perImagePrompt = multi
      ? `${params.prompt}\n\nDescribe image ${position} of ${total} independently.`
      : params.prompt;
    const dataUrl = `data:${image.mime ?? "image/jpeg"};base64,${image.buffer.toString("base64")}`;
    const text = await minimaxUnderstandImage({
      apiKey: params.apiKey,
      prompt: perImagePrompt,
      imageDataUrl: dataUrl,
      modelBaseUrl: params.modelBaseUrl,
    });
    sections.push(multi ? `Image ${position}:\n${text.trim()}` : text.trim());
  }
  return {
    text: sections.join("\n\n").trim(),
    model: params.modelId,
  };
}
// True when the thrown value is an Error whose message starts with
// "Unknown model:" (case-insensitive) — the signal that model discovery did
// not find the requested model.
function isUnknownModelError(err: unknown): boolean {
  if (!(err instanceof Error)) {
    return false;
  }
  return err.message.toLowerCase().startsWith("unknown model:");
}
// Returns the trimmed baseUrl explicitly configured for a provider under
// cfg.models.providers, or undefined when no non-blank value is configured.
function resolveConfiguredProviderBaseUrl(
  cfg: ImageDescriptionRequest["cfg"],
  provider: string,
): string | undefined {
  const entry = cfg.models?.providers?.[provider];
  const baseUrl = typeof entry?.baseUrl === "string" ? entry.baseUrl.trim() : "";
  return baseUrl ? baseUrl : undefined;
}
// Resolves credentials for the MiniMax VLM fallback path, used when the
// requested model is absent from the discovered model registry. Fetches an API
// key for the provider directly (profile-aware) and reuses any explicitly
// configured provider base URL from cfg.
// NOTE(review): requireApiKey presumably throws when no usable key was
// resolved — confirm against model-auth.js.
async function resolveMinimaxVlmFallbackRuntime(params: {
cfg: ImageDescriptionRequest["cfg"];
agentDir: string;
provider: string;
profile?: string;
preferredProfile?: string;
}): Promise<{ apiKey: string; modelBaseUrl?: string }> {
const auth = await resolveApiKeyForProvider({
provider: params.provider,
cfg: params.cfg,
profileId: params.profile,
preferredProfile: params.preferredProfile,
agentDir: params.agentDir,
});
return {
apiKey: requireApiKey(auth, params.provider),
// undefined when no non-blank baseUrl is configured for the provider.
modelBaseUrl: resolveConfiguredProviderBaseUrl(params.cfg, params.provider),
};
}
export async function describeImagesWithModel(
params: ImagesDescriptionRequest,
): Promise<ImagesDescriptionResult> {
const prompt = params.prompt ?? "Describe the image.";
let apiKey: string;
let model: Model<Api> | undefined;
try {
const resolved = await resolveImageRuntime(params);
apiKey = resolved.apiKey;
model = resolved.model;
} catch (err) {
if (!isMinimaxVlmModel(params.provider, params.model) || !isUnknownModelError(err)) {
throw err;
}
const fallback = await resolveMinimaxVlmFallbackRuntime(params);
return await describeImagesWithMinimax({
apiKey: fallback.apiKey,
modelId: params.model,
modelBaseUrl: fallback.modelBaseUrl,
prompt,
images: params.images,
});
}
if (isMinimaxVlmModel(model.provider, model.id)) {
return await describeImagesWithMinimax({
apiKey,
modelId: model.id,
modelBaseUrl: model.baseUrl,
prompt,
images: params.images,
});
}
const context = buildImageContext(prompt, params.images);
const message = await complete(model, context, {
apiKey,
maxTokens: params.maxTokens ?? 512,
maxTokens: resolveImageToolMaxTokens(model.maxTokens, params.maxTokens ?? 512),
});
const text = coerceImageAssistantText({
message,
@ -76,3 +199,26 @@ export async function describeImageWithModel(
});
return { text, model: model.id };
}
/**
 * Single-image convenience wrapper: adapts the one-image request shape onto
 * describeImagesWithModel by wrapping the payload in a one-element image list
 * and forwarding every other request field unchanged.
 */
export async function describeImageWithModel(
  params: ImageDescriptionRequest,
): Promise<ImageDescriptionResult> {
  const { buffer, fileName, mime, ...shared } = params;
  return await describeImagesWithModel({
    ...shared,
    images: [{ buffer, fileName, mime }],
  });
}

View File

@ -1,10 +1,33 @@
import { anthropicMediaUnderstandingProvider } from "../../../extensions/anthropic/media-understanding-provider.js";
import { googleMediaUnderstandingProvider } from "../../../extensions/google/media-understanding-provider.js";
import {
minimaxMediaUnderstandingProvider,
minimaxPortalMediaUnderstandingProvider,
} from "../../../extensions/minimax/media-understanding-provider.js";
import { mistralMediaUnderstandingProvider } from "../../../extensions/mistral/media-understanding-provider.js";
import { moonshotMediaUnderstandingProvider } from "../../../extensions/moonshot/media-understanding-provider.js";
import { openaiMediaUnderstandingProvider } from "../../../extensions/openai/media-understanding-provider.js";
import { zaiMediaUnderstandingProvider } from "../../../extensions/zai/media-understanding-provider.js";
import { normalizeProviderId } from "../../agents/model-selection.js";
import type { OpenClawConfig } from "../../config/config.js";
import { loadOpenClawPlugins } from "../../plugins/loader.js";
import { getActivePluginRegistry } from "../../plugins/runtime.js";
import type { MediaUnderstandingProvider } from "../types.js";
import { deepgramProvider } from "./deepgram/index.js";
import { groqProvider } from "./groq/index.js";
const PROVIDERS: MediaUnderstandingProvider[] = [groqProvider, deepgramProvider];
const PROVIDERS: MediaUnderstandingProvider[] = [
groqProvider,
deepgramProvider,
anthropicMediaUnderstandingProvider,
googleMediaUnderstandingProvider,
minimaxMediaUnderstandingProvider,
minimaxPortalMediaUnderstandingProvider,
mistralMediaUnderstandingProvider,
moonshotMediaUnderstandingProvider,
openaiMediaUnderstandingProvider,
zaiMediaUnderstandingProvider,
];
function mergeProviderIntoRegistry(
registry: Map<string, MediaUnderstandingProvider>,
@ -32,12 +55,18 @@ export function normalizeMediaProviderId(id: string): string {
export function buildMediaUnderstandingRegistry(
overrides?: Record<string, MediaUnderstandingProvider>,
cfg?: OpenClawConfig,
): Map<string, MediaUnderstandingProvider> {
const registry = new Map<string, MediaUnderstandingProvider>();
for (const provider of PROVIDERS) {
mergeProviderIntoRegistry(registry, provider);
}
for (const entry of getActivePluginRegistry()?.mediaUnderstandingProviders ?? []) {
const active = getActivePluginRegistry();
const pluginRegistry =
(active?.mediaUnderstandingProviders?.length ?? 0) > 0 || !cfg
? active
: loadOpenClawPlugins({ config: cfg });
for (const entry of pluginRegistry?.mediaUnderstandingProviders ?? []) {
mergeProviderIntoRegistry(registry, entry.provider);
}
if (overrides) {

View File

@ -75,8 +75,9 @@ export type RunCapabilityResult = {
export function buildProviderRegistry(
overrides?: Record<string, MediaUnderstandingProvider>,
cfg?: OpenClawConfig,
): ProviderRegistry {
return buildMediaUnderstandingRegistry(overrides);
return buildMediaUnderstandingRegistry(overrides, cfg);
}
export function normalizeMediaAttachments(ctx: MsgContext): MediaAttachment[] {

View File

@ -48,7 +48,7 @@ export async function runMediaUnderstandingFile(
return { text: undefined };
}
const providerRegistry = buildProviderRegistry();
const providerRegistry = buildProviderRegistry(undefined, params.cfg);
const cache = createMediaAttachmentCache(attachments, {
localPathRoots: [path.dirname(params.filePath)],
});

View File

@ -90,6 +90,25 @@ export type ImageDescriptionRequest = {
buffer: Buffer;
fileName: string;
mime?: string;
prompt?: string;
maxTokens?: number;
timeoutMs: number;
profile?: string;
preferredProfile?: string;
agentDir: string;
cfg: import("../config/config.js").OpenClawConfig;
model: string;
provider: string;
};
/** A single image payload within a multi-image description request. */
export type ImagesDescriptionInput = {
buffer: Buffer;
// Logical label for the image (callers pass names like "image-1").
fileName: string;
// MIME type such as "image/png"; providers may substitute a default when omitted.
mime?: string;
};
export type ImagesDescriptionRequest = {
images: ImagesDescriptionInput[];
model: string;
provider: string;
prompt?: string;
@ -106,10 +125,16 @@ export type ImageDescriptionResult = {
model?: string;
};
/** Result of a multi-image description call. */
export type ImagesDescriptionResult = {
// Combined description text; providers may join per-image sections.
text: string;
// Identifier of the model that produced the description, when known.
model?: string;
};
/**
 * A pluggable media-understanding backend. Each provider implements only the
 * handlers it supports; callers must feature-check the optional methods before
 * invoking them (e.g. fall back to per-image describeImage calls when
 * describeImages is absent).
 */
export type MediaUnderstandingProvider = {
// Stable provider id (e.g. "openai", "minimax-portal") used for registry lookups.
id: string;
capabilities?: MediaUnderstandingCapability[];
transcribeAudio?: (req: AudioTranscriptionRequest) => Promise<AudioTranscriptionResult>;
describeVideo?: (req: VideoDescriptionRequest) => Promise<VideoDescriptionResult>;
describeImage?: (req: ImageDescriptionRequest) => Promise<ImageDescriptionResult>;
// Multi-image variant that handles all images in a single request.
describeImages?: (req: ImagesDescriptionRequest) => Promise<ImagesDescriptionResult>;
};

View File

@ -5,12 +5,15 @@ export type {
AudioTranscriptionResult,
ImageDescriptionRequest,
ImageDescriptionResult,
ImagesDescriptionInput,
ImagesDescriptionRequest,
ImagesDescriptionResult,
MediaUnderstandingProvider,
VideoDescriptionRequest,
VideoDescriptionResult,
} from "../media-understanding/types.js";
export { describeImageWithModel } from "../media-understanding/providers/image.js";
export { describeImageWithModel, describeImagesWithModel } from "../media-understanding/providers/image.js";
export { transcribeOpenAiCompatibleAudio } from "../media-understanding/providers/openai-compatible-audio.js";
export {
assertOkOrThrowHttpError,

View File

@ -43,6 +43,16 @@ function findMediaUnderstandingProviderIdsForPlugin(pluginId: string) {
.toSorted((left, right) => left.localeCompare(right));
}
// Looks up the media-understanding provider contract registered for a plugin
// id; throws when no contract entry exists for it.
function findMediaUnderstandingProviderForPlugin(pluginId: string) {
  for (const candidate of mediaUnderstandingProviderContractRegistry) {
    if (candidate.pluginId === pluginId) {
      return candidate.provider;
    }
  }
  throw new Error(`media-understanding provider contract missing for ${pluginId}`);
}
function findRegistrationForPlugin(pluginId: string) {
const entry = pluginRegistrationContractRegistry.find(
(candidate) => candidate.pluginId === pluginId,
@ -141,4 +151,25 @@ describe("plugin contract registry", () => {
expect(findSpeechProviderForPlugin("elevenlabs").listVoices).toEqual(expect.any(Function));
expect(findSpeechProviderForPlugin("microsoft").listVoices).toEqual(expect.any(Function));
});
it("keeps bundled multi-image support explicit", () => {
  // Every bundled provider that routes images through the shared model helper
  // must also expose the multi-image entry point.
  const multiImagePluginIds = ["anthropic", "google", "minimax", "moonshot", "openai", "zai"];
  for (const pluginId of multiImagePluginIds) {
    expect(findMediaUnderstandingProviderForPlugin(pluginId).describeImages).toEqual(
      expect.any(Function),
    );
  }
});
});