feat(media): route image tool through media providers
This commit is contained in:
parent
7fa3825e80
commit
50c3321d2e
@ -28,6 +28,7 @@ import {
|
||||
} from "openclaw/plugin-sdk/provider-auth";
|
||||
import { normalizeModelCompat } from "openclaw/plugin-sdk/provider-models";
|
||||
import { fetchClaudeUsage } from "openclaw/plugin-sdk/provider-usage";
|
||||
import { anthropicMediaUnderstandingProvider } from "./media-understanding-provider.js";
|
||||
|
||||
const PROVIDER_ID = "anthropic";
|
||||
const DEFAULT_ANTHROPIC_MODEL = "anthropic/claude-sonnet-4-6";
|
||||
@ -396,6 +397,7 @@ const anthropicPlugin = {
|
||||
profileId: ctx.profileId,
|
||||
}),
|
||||
});
|
||||
api.registerMediaUnderstandingProvider(anthropicMediaUnderstandingProvider);
|
||||
},
|
||||
};
|
||||
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
import {
|
||||
describeImageWithModel,
|
||||
describeImagesWithModel,
|
||||
type MediaUnderstandingProvider,
|
||||
} from "openclaw/plugin-sdk/media-understanding";
|
||||
|
||||
@ -7,4 +8,5 @@ export const anthropicMediaUnderstandingProvider: MediaUnderstandingProvider = {
|
||||
id: "anthropic",
|
||||
capabilities: ["image"],
|
||||
describeImage: describeImageWithModel,
|
||||
describeImages: describeImagesWithModel,
|
||||
};
|
||||
|
||||
@ -2,6 +2,7 @@ import { normalizeGoogleModelId, parseGeminiAuth } from "openclaw/plugin-sdk/goo
|
||||
import {
|
||||
assertOkOrThrowHttpError,
|
||||
describeImageWithModel,
|
||||
describeImagesWithModel,
|
||||
normalizeBaseUrl,
|
||||
postJsonRequest,
|
||||
type AudioTranscriptionRequest,
|
||||
@ -142,6 +143,7 @@ export const googleMediaUnderstandingProvider: MediaUnderstandingProvider = {
|
||||
id: "google",
|
||||
capabilities: ["image", "audio", "video"],
|
||||
describeImage: describeImageWithModel,
|
||||
describeImages: describeImagesWithModel,
|
||||
transcribeAudio: transcribeGeminiAudio,
|
||||
describeVideo: describeGeminiVideo,
|
||||
};
|
||||
|
||||
@ -13,6 +13,10 @@ import {
|
||||
listProfilesForProvider,
|
||||
} from "openclaw/plugin-sdk/provider-auth";
|
||||
import { fetchMinimaxUsage } from "openclaw/plugin-sdk/provider-usage";
|
||||
import {
|
||||
minimaxMediaUnderstandingProvider,
|
||||
minimaxPortalMediaUnderstandingProvider,
|
||||
} from "./media-understanding-provider.js";
|
||||
import { loginMiniMaxPortalOAuth, type MiniMaxRegion } from "./oauth.js";
|
||||
import { applyMinimaxApiConfig, applyMinimaxApiConfigCn } from "./onboard.js";
|
||||
import { buildMinimaxPortalProvider, buildMinimaxProvider } from "./provider-catalog.js";
|
||||
@ -273,6 +277,8 @@ const minimaxPlugin = {
|
||||
],
|
||||
isModernModelRef: ({ modelId }) => isModernMiniMaxModel(modelId),
|
||||
});
|
||||
api.registerMediaUnderstandingProvider(minimaxMediaUnderstandingProvider);
|
||||
api.registerMediaUnderstandingProvider(minimaxPortalMediaUnderstandingProvider);
|
||||
},
|
||||
};
|
||||
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
import {
|
||||
describeImageWithModel,
|
||||
describeImagesWithModel,
|
||||
type MediaUnderstandingProvider,
|
||||
} from "openclaw/plugin-sdk/media-understanding";
|
||||
|
||||
@ -7,10 +8,12 @@ export const minimaxMediaUnderstandingProvider: MediaUnderstandingProvider = {
|
||||
id: "minimax",
|
||||
capabilities: ["image"],
|
||||
describeImage: describeImageWithModel,
|
||||
describeImages: describeImagesWithModel,
|
||||
};
|
||||
|
||||
/**
 * Media-understanding provider registered under the id "minimax-portal".
 * It advertises only the "image" capability and delegates single-image and
 * multi-image description to the shared `describeImageWithModel` /
 * `describeImagesWithModel` helpers.
 */
export const minimaxPortalMediaUnderstandingProvider: MediaUnderstandingProvider = {
|
||||
id: "minimax-portal",
|
||||
capabilities: ["image"],
|
||||
describeImage: describeImageWithModel,
|
||||
describeImages: describeImagesWithModel,
|
||||
};
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
import { emptyPluginConfigSchema, type OpenClawPluginApi } from "openclaw/plugin-sdk/core";
|
||||
import { createProviderApiKeyAuthMethod } from "openclaw/plugin-sdk/provider-auth";
|
||||
import { mistralMediaUnderstandingProvider } from "./media-understanding-provider.js";
|
||||
import { applyMistralConfig, MISTRAL_DEFAULT_MODEL_REF } from "./onboard.js";
|
||||
|
||||
const PROVIDER_ID = "mistral";
|
||||
@ -50,6 +51,7 @@ const mistralPlugin = {
|
||||
],
|
||||
},
|
||||
});
|
||||
api.registerMediaUnderstandingProvider(mistralMediaUnderstandingProvider);
|
||||
},
|
||||
};
|
||||
|
||||
|
||||
@ -9,6 +9,7 @@ import {
|
||||
getScopedCredentialValue,
|
||||
setScopedCredentialValue,
|
||||
} from "openclaw/plugin-sdk/provider-web-search";
|
||||
import { moonshotMediaUnderstandingProvider } from "./media-understanding-provider.js";
|
||||
import {
|
||||
applyMoonshotConfig,
|
||||
applyMoonshotConfigCn,
|
||||
@ -98,6 +99,7 @@ const moonshotPlugin = {
|
||||
return createMoonshotThinkingWrapper(ctx.streamFn, thinkingType);
|
||||
},
|
||||
});
|
||||
api.registerMediaUnderstandingProvider(moonshotMediaUnderstandingProvider);
|
||||
api.registerWebSearchProvider(
|
||||
createPluginBackedWebSearchProvider({
|
||||
id: "kimi",
|
||||
|
||||
@ -1,11 +1,12 @@
|
||||
import {
|
||||
assertOkOrThrowHttpError,
|
||||
describeImageWithModel,
|
||||
normalizeBaseUrl,
|
||||
postJsonRequest,
|
||||
describeImagesWithModel,
|
||||
type MediaUnderstandingProvider,
|
||||
type VideoDescriptionRequest,
|
||||
type VideoDescriptionResult,
|
||||
assertOkOrThrowHttpError,
|
||||
normalizeBaseUrl,
|
||||
postJsonRequest,
|
||||
} from "openclaw/plugin-sdk/media-understanding";
|
||||
|
||||
export const DEFAULT_MOONSHOT_VIDEO_BASE_URL = "https://api.moonshot.ai/v1";
|
||||
@ -116,5 +117,6 @@ export const moonshotMediaUnderstandingProvider: MediaUnderstandingProvider = {
|
||||
id: "moonshot",
|
||||
capabilities: ["image", "video"],
|
||||
describeImage: describeImageWithModel,
|
||||
describeImages: describeImagesWithModel,
|
||||
describeVideo: describeMoonshotVideo,
|
||||
};
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
import { emptyPluginConfigSchema, type OpenClawPluginApi } from "openclaw/plugin-sdk/core";
|
||||
import { buildOpenAISpeechProvider } from "openclaw/plugin-sdk/speech";
|
||||
import { openaiMediaUnderstandingProvider } from "./media-understanding-provider.js";
|
||||
import { buildOpenAICodexProviderPlugin } from "./openai-codex-provider.js";
|
||||
import { buildOpenAIProvider } from "./openai-provider.js";
|
||||
|
||||
@ -12,6 +13,7 @@ const openAIPlugin = {
|
||||
api.registerProvider(buildOpenAIProvider());
|
||||
api.registerProvider(buildOpenAICodexProviderPlugin());
|
||||
api.registerSpeechProvider(buildOpenAISpeechProvider());
|
||||
api.registerMediaUnderstandingProvider(openaiMediaUnderstandingProvider);
|
||||
},
|
||||
};
|
||||
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
import {
|
||||
describeImageWithModel,
|
||||
describeImagesWithModel,
|
||||
transcribeOpenAiCompatibleAudio,
|
||||
type AudioTranscriptionRequest,
|
||||
type MediaUnderstandingProvider,
|
||||
@ -20,5 +21,6 @@ export const openaiMediaUnderstandingProvider: MediaUnderstandingProvider = {
|
||||
id: "openai",
|
||||
capabilities: ["image", "audio"],
|
||||
describeImage: describeImageWithModel,
|
||||
describeImages: describeImagesWithModel,
|
||||
transcribeAudio: transcribeOpenAiAudio,
|
||||
};
|
||||
|
||||
@ -25,6 +25,7 @@ import { DEFAULT_CONTEXT_TOKENS, normalizeModelCompat } from "openclaw/plugin-sd
|
||||
import { createZaiToolStreamWrapper } from "openclaw/plugin-sdk/provider-stream";
|
||||
import { fetchZaiUsage } from "openclaw/plugin-sdk/provider-usage";
|
||||
import { detectZaiEndpoint, type ZaiEndpointId } from "./detect.js";
|
||||
import { zaiMediaUnderstandingProvider } from "./media-understanding-provider.js";
|
||||
import { applyZaiConfig, applyZaiProviderConfig, ZAI_DEFAULT_MODEL_REF } from "./onboard.js";
|
||||
|
||||
const PROVIDER_ID = "zai";
|
||||
@ -333,6 +334,7 @@ const zaiPlugin = {
|
||||
fetchUsageSnapshot: async (ctx) => await fetchZaiUsage(ctx.token, ctx.timeoutMs, ctx.fetchFn),
|
||||
isCacheTtlEligible: () => true,
|
||||
});
|
||||
api.registerMediaUnderstandingProvider(zaiMediaUnderstandingProvider);
|
||||
},
|
||||
};
|
||||
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
import {
|
||||
describeImageWithModel,
|
||||
describeImagesWithModel,
|
||||
type MediaUnderstandingProvider,
|
||||
} from "openclaw/plugin-sdk/media-understanding";
|
||||
|
||||
@ -7,4 +8,5 @@ export const zaiMediaUnderstandingProvider: MediaUnderstandingProvider = {
|
||||
id: "zai",
|
||||
capabilities: ["image"],
|
||||
describeImage: describeImageWithModel,
|
||||
describeImages: describeImagesWithModel,
|
||||
};
|
||||
|
||||
@ -32,6 +32,7 @@ async function withTempAgentDir<T>(run: (agentDir: string) => Promise<T>): Promi
|
||||
const ONE_PIXEL_PNG_B64 =
|
||||
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/woAAn8B9FD5fHAAAAAASUVORK5CYII=";
|
||||
const ONE_PIXEL_GIF_B64 = "R0lGODlhAQABAIABAP///wAAACwAAAAAAQABAAACAkQBADs=";
|
||||
const ONE_PIXEL_JPEG_B64 = "QUJDRA==";
|
||||
|
||||
async function withTempWorkspacePng(
|
||||
cb: (args: { workspaceDir: string; imagePath: string }) => Promise<void>,
|
||||
@ -736,10 +737,10 @@ describe("image tool MiniMax VLM routing", () => {
|
||||
|
||||
const res = await tool.execute("t1", {
|
||||
prompt: "Compare these images.",
|
||||
images: [`data:image/png;base64,${pngB64}`, `data:image/gif;base64,${ONE_PIXEL_GIF_B64}`],
|
||||
images: [`data:image/png;base64,${pngB64}`, `data:image/jpeg;base64,${ONE_PIXEL_JPEG_B64}`],
|
||||
});
|
||||
|
||||
expect(fetch).toHaveBeenCalledTimes(1);
|
||||
expect(fetch).toHaveBeenCalledTimes(2);
|
||||
const details = res.details as
|
||||
| {
|
||||
images?: Array<{ image: string }>;
|
||||
@ -756,12 +757,12 @@ describe("image tool MiniMax VLM routing", () => {
|
||||
image: `data:image/png;base64,${pngB64}`,
|
||||
images: [
|
||||
`data:image/png;base64,${pngB64}`,
|
||||
`data:image/gif;base64,${ONE_PIXEL_GIF_B64}`,
|
||||
`data:image/gif;base64,${ONE_PIXEL_GIF_B64}`,
|
||||
`data:image/jpeg;base64,${ONE_PIXEL_JPEG_B64}`,
|
||||
`data:image/jpeg;base64,${ONE_PIXEL_JPEG_B64}`,
|
||||
],
|
||||
});
|
||||
|
||||
expect(fetch).toHaveBeenCalledTimes(1);
|
||||
expect(fetch).toHaveBeenCalledTimes(2);
|
||||
const dedupedDetails = deduped.details as
|
||||
| {
|
||||
images?: Array<{ image: string }>;
|
||||
@ -776,7 +777,7 @@ describe("image tool MiniMax VLM routing", () => {
|
||||
maxImages: 1,
|
||||
});
|
||||
|
||||
expect(fetch).toHaveBeenCalledTimes(1);
|
||||
expect(fetch).toHaveBeenCalledTimes(2);
|
||||
expect(tooMany.details).toMatchObject({
|
||||
error: "too_many_images",
|
||||
count: 2,
|
||||
|
||||
@ -1,9 +1,10 @@
|
||||
import { type Context, complete } from "@mariozechner/pi-ai";
|
||||
import { Type } from "@sinclair/typebox";
|
||||
import type { OpenClawConfig } from "../../config/config.js";
|
||||
import { getMediaUnderstandingProvider } from "../../media-understanding/providers/index.js";
|
||||
import { buildProviderRegistry } from "../../media-understanding/runner.js";
|
||||
import { loadWebMedia } from "../../plugin-sdk/web-media.js";
|
||||
import { resolveUserPath } from "../../utils.js";
|
||||
import { isMinimaxVlmModel, isMinimaxVlmProvider, minimaxUnderstandImage } from "../minimax-vlm.js";
|
||||
import { isMinimaxVlmProvider } from "../minimax-vlm.js";
|
||||
import {
|
||||
coerceImageAssistantText,
|
||||
coerceImageModelConfig,
|
||||
@ -14,17 +15,12 @@ import {
|
||||
import {
|
||||
applyImageModelConfigDefaults,
|
||||
buildTextToolResult,
|
||||
resolveModelFromRegistry,
|
||||
resolveMediaToolLocalRoots,
|
||||
resolveModelRuntimeApiKey,
|
||||
resolvePromptAndModelOverride,
|
||||
} from "./media-tool-shared.js";
|
||||
import { hasAuthForProvider, resolveDefaultModelRef } from "./model-config.helpers.js";
|
||||
import {
|
||||
createSandboxBridgeReadFile,
|
||||
discoverAuthStorage,
|
||||
discoverModels,
|
||||
ensureOpenClawModelsJson,
|
||||
resolveSandboxedBridgeMediaPath,
|
||||
runWithImageModelFallback,
|
||||
type AnyAgentTool,
|
||||
@ -168,27 +164,6 @@ function pickMaxBytes(cfg?: OpenClawConfig, maxBytesMb?: number): number | undef
|
||||
return undefined;
|
||||
}
|
||||
|
||||
/**
 * Builds a single-message completion context for an image prompt.
 *
 * The user message carries the prompt as the first (text) part, followed by
 * one image part per entry of `images`, in the order given.
 *
 * @param prompt Text instruction placed ahead of the images.
 * @param images Already base64-encoded image payloads with their MIME types.
 * @returns A context holding exactly one user message stamped with the current time.
 */
function buildImageContext(
  prompt: string,
  images: Array<{ base64: string; mimeType: string }>,
): Context {
  const imageParts = images.map(({ base64, mimeType }) => ({
    type: "image" as const,
    data: base64,
    mimeType,
  }));
  const userMessage = {
    role: "user" as const,
    content: [{ type: "text" as const, text: prompt }, ...imageParts],
    timestamp: Date.now(),
  };
  return { messages: [userMessage] };
}
|
||||
|
||||
type ImageSandboxConfig = {
|
||||
root: string;
|
||||
bridge: SandboxFsBridge;
|
||||
@ -200,7 +175,7 @@ async function runImagePrompt(params: {
|
||||
imageModelConfig: ImageModelConfig;
|
||||
modelOverride?: string;
|
||||
prompt: string;
|
||||
images: Array<{ base64: string; mimeType: string }>;
|
||||
images: Array<{ buffer: Buffer; mimeType: string }>;
|
||||
}): Promise<{
|
||||
text: string;
|
||||
provider: string;
|
||||
@ -208,50 +183,75 @@ async function runImagePrompt(params: {
|
||||
attempts: Array<{ provider: string; model: string; error: string }>;
|
||||
}> {
|
||||
const effectiveCfg = applyImageModelConfigDefaults(params.cfg, params.imageModelConfig);
|
||||
|
||||
await ensureOpenClawModelsJson(effectiveCfg, params.agentDir);
|
||||
const authStorage = discoverAuthStorage(params.agentDir);
|
||||
const modelRegistry = discoverModels(authStorage, params.agentDir);
|
||||
const providerCfg: OpenClawConfig = effectiveCfg ?? {};
|
||||
const providerRegistry = buildProviderRegistry(undefined, providerCfg);
|
||||
|
||||
const result = await runWithImageModelFallback({
|
||||
cfg: effectiveCfg,
|
||||
modelOverride: params.modelOverride,
|
||||
run: async (provider, modelId) => {
|
||||
const model = resolveModelFromRegistry({ modelRegistry, provider, modelId });
|
||||
if (!model.input?.includes("image")) {
|
||||
throw new Error(`Model does not support images: ${provider}/${modelId}`);
|
||||
const imageProvider = getMediaUnderstandingProvider(provider, providerRegistry);
|
||||
if (!imageProvider) {
|
||||
throw new Error(`No media-understanding provider registered for ${provider}`);
|
||||
}
|
||||
const apiKey = await resolveModelRuntimeApiKey({
|
||||
model,
|
||||
cfg: effectiveCfg,
|
||||
agentDir: params.agentDir,
|
||||
authStorage,
|
||||
});
|
||||
|
||||
// MiniMax VLM only supports a single image; use the first one.
|
||||
if (isMinimaxVlmModel(model.provider, model.id)) {
|
||||
const first = params.images[0];
|
||||
const imageDataUrl = `data:${first.mimeType};base64,${first.base64}`;
|
||||
const text = await minimaxUnderstandImage({
|
||||
apiKey,
|
||||
if (params.images.length > 1 && imageProvider.describeImages) {
|
||||
const described = await imageProvider.describeImages({
|
||||
images: params.images.map((image, index) => ({
|
||||
buffer: image.buffer,
|
||||
fileName: `image-${index + 1}`,
|
||||
mime: image.mimeType,
|
||||
})),
|
||||
provider,
|
||||
model: modelId,
|
||||
prompt: params.prompt,
|
||||
imageDataUrl,
|
||||
modelBaseUrl: model.baseUrl,
|
||||
maxTokens: resolveImageToolMaxTokens(undefined),
|
||||
timeoutMs: 30_000,
|
||||
cfg: providerCfg,
|
||||
agentDir: params.agentDir,
|
||||
});
|
||||
return { text, provider: model.provider, model: model.id };
|
||||
return { text: described.text, provider, model: described.model ?? modelId };
|
||||
}
|
||||
if (!imageProvider.describeImage) {
|
||||
throw new Error(`Provider does not support image analysis: ${provider}`);
|
||||
}
|
||||
if (params.images.length === 1) {
|
||||
const image = params.images[0];
|
||||
const described = await imageProvider.describeImage({
|
||||
buffer: image.buffer,
|
||||
fileName: "image-1",
|
||||
mime: image.mimeType,
|
||||
provider,
|
||||
model: modelId,
|
||||
prompt: params.prompt,
|
||||
maxTokens: resolveImageToolMaxTokens(undefined),
|
||||
timeoutMs: 30_000,
|
||||
cfg: providerCfg,
|
||||
agentDir: params.agentDir,
|
||||
});
|
||||
return { text: described.text, provider, model: described.model ?? modelId };
|
||||
}
|
||||
|
||||
const context = buildImageContext(params.prompt, params.images);
|
||||
const message = await complete(model, context, {
|
||||
apiKey,
|
||||
maxTokens: resolveImageToolMaxTokens(model.maxTokens),
|
||||
});
|
||||
const text = coerceImageAssistantText({
|
||||
message,
|
||||
provider: model.provider,
|
||||
model: model.id,
|
||||
});
|
||||
return { text, provider: model.provider, model: model.id };
|
||||
const parts: string[] = [];
|
||||
for (const [index, image] of params.images.entries()) {
|
||||
const described = await imageProvider.describeImage({
|
||||
buffer: image.buffer,
|
||||
fileName: `image-${index + 1}`,
|
||||
mime: image.mimeType,
|
||||
provider,
|
||||
model: modelId,
|
||||
prompt: `${params.prompt}\n\nDescribe image ${index + 1} of ${params.images.length}.`,
|
||||
maxTokens: resolveImageToolMaxTokens(undefined),
|
||||
timeoutMs: 30_000,
|
||||
cfg: providerCfg,
|
||||
agentDir: params.agentDir,
|
||||
});
|
||||
parts.push(`Image ${index + 1}:\n${described.text.trim()}`);
|
||||
}
|
||||
return {
|
||||
text: parts.join("\n\n").trim(),
|
||||
provider,
|
||||
model: modelId,
|
||||
};
|
||||
},
|
||||
});
|
||||
|
||||
@ -383,7 +383,7 @@ export function createImageTool(options?: {
|
||||
|
||||
// MARK: - Load and resolve each image
|
||||
const loadedImages: Array<{
|
||||
base64: string;
|
||||
buffer: Buffer;
|
||||
mimeType: string;
|
||||
resolvedImage: string;
|
||||
rewrittenFrom?: string;
|
||||
@ -469,9 +469,8 @@ export function createImageTool(options?: {
|
||||
("contentType" in media && media.contentType) ||
|
||||
("mimeType" in media && media.mimeType) ||
|
||||
"image/png";
|
||||
const base64 = media.buffer.toString("base64");
|
||||
loadedImages.push({
|
||||
base64,
|
||||
buffer: media.buffer,
|
||||
mimeType,
|
||||
resolvedImage,
|
||||
...(resolvedPathInfo.rewrittenFrom
|
||||
@ -487,7 +486,7 @@ export function createImageTool(options?: {
|
||||
imageModelConfig,
|
||||
modelOverride,
|
||||
prompt: promptRaw,
|
||||
images: loadedImages.map((img) => ({ base64: img.base64, mimeType: img.mimeType })),
|
||||
images: loadedImages.map((img) => ({ buffer: img.buffer, mimeType: img.mimeType })),
|
||||
});
|
||||
|
||||
const imageDetails =
|
||||
|
||||
@ -8,9 +8,15 @@ const getApiKeyForModelMock = vi.fn(async () => ({
|
||||
source: "test",
|
||||
mode: "oauth",
|
||||
}));
|
||||
// Stub wired in as `resolveApiKeyForProvider` by the model-auth module mock;
// every call resolves to the same fixed OAuth-mode test credential.
const resolveApiKeyForProviderMock = vi.fn(async () => ({
|
||||
apiKey: "oauth-test", // pragma: allowlist secret
|
||||
source: "test",
|
||||
mode: "oauth",
|
||||
}));
|
||||
const requireApiKeyMock = vi.fn((auth: { apiKey?: string }) => auth.apiKey ?? "");
|
||||
const setRuntimeApiKeyMock = vi.fn();
|
||||
const discoverModelsMock = vi.fn();
|
||||
let imageImportSeq = 0;
|
||||
|
||||
vi.mock("@mariozechner/pi-ai", async (importOriginal) => {
|
||||
const actual = await importOriginal<typeof import("@mariozechner/pi-ai")>();
|
||||
@ -34,6 +40,7 @@ vi.mock("../../agents/models-config.js", () => ({
|
||||
|
||||
vi.mock("../../agents/model-auth.js", () => ({
|
||||
getApiKeyForModel: getApiKeyForModelMock,
|
||||
resolveApiKeyForProvider: resolveApiKeyForProviderMock,
|
||||
requireApiKey: requireApiKeyMock,
|
||||
}));
|
||||
|
||||
@ -44,6 +51,11 @@ vi.mock("../../agents/pi-model-discovery-runtime.js", () => ({
|
||||
discoverModels: discoverModelsMock,
|
||||
}));
|
||||
|
||||
/**
 * Dynamically imports `./image.js` under a specifier that is unique per call.
 *
 * Each call bumps the module-level `imageImportSeq` counter and appends it as a
 * `?case=` query, presumably so every test evaluates a fresh module instance
 * instead of a cached one (confirm against the test runner's module cache).
 */
async function importImageModule() {
  const caseId = ++imageImportSeq;
  return await import(/* @vite-ignore */ `./image.js?case=${caseId}`);
}
|
||||
|
||||
describe("describeImageWithModel", () => {
|
||||
beforeEach(() => {
|
||||
vi.clearAllMocks();
|
||||
@ -59,7 +71,7 @@ describe("describeImageWithModel", () => {
|
||||
});
|
||||
|
||||
it("routes minimax-portal image models through the MiniMax VLM endpoint", async () => {
|
||||
const { describeImageWithModel } = await import("./image.js");
|
||||
const { describeImageWithModel } = await importImageModule();
|
||||
|
||||
const result = await describeImageWithModel({
|
||||
cfg: {},
|
||||
@ -109,7 +121,7 @@ describe("describeImageWithModel", () => {
|
||||
content: [{ type: "text", text: "generic ok" }],
|
||||
});
|
||||
|
||||
const { describeImageWithModel } = await import("./image.js");
|
||||
const { describeImageWithModel } = await importImageModule();
|
||||
|
||||
const result = await describeImageWithModel({
|
||||
cfg: {},
|
||||
@ -153,7 +165,7 @@ describe("describeImageWithModel", () => {
|
||||
content: [{ type: "text", text: "flash ok" }],
|
||||
});
|
||||
|
||||
const { describeImageWithModel } = await import("./image.js");
|
||||
const { describeImageWithModel } = await importImageModule();
|
||||
|
||||
const result = await describeImageWithModel({
|
||||
cfg: {},
|
||||
@ -203,7 +215,7 @@ describe("describeImageWithModel", () => {
|
||||
content: [{ type: "text", text: "flash lite ok" }],
|
||||
});
|
||||
|
||||
const { describeImageWithModel } = await import("./image.js");
|
||||
const { describeImageWithModel } = await importImageModule();
|
||||
|
||||
const result = await describeImageWithModel({
|
||||
cfg: {},
|
||||
|
||||
@ -1,11 +1,20 @@
|
||||
import type { Api, Context, Model } from "@mariozechner/pi-ai";
|
||||
import { complete } from "@mariozechner/pi-ai";
|
||||
import { isMinimaxVlmModel, minimaxUnderstandImage } from "../../agents/minimax-vlm.js";
|
||||
import { getApiKeyForModel, requireApiKey } from "../../agents/model-auth.js";
|
||||
import {
|
||||
getApiKeyForModel,
|
||||
requireApiKey,
|
||||
resolveApiKeyForProvider,
|
||||
} from "../../agents/model-auth.js";
|
||||
import { normalizeModelRef } from "../../agents/model-selection.js";
|
||||
import { ensureOpenClawModelsJson } from "../../agents/models-config.js";
|
||||
import { coerceImageAssistantText } from "../../agents/tools/image-tool.helpers.js";
|
||||
import type { ImageDescriptionRequest, ImageDescriptionResult } from "../types.js";
|
||||
import type {
|
||||
ImageDescriptionRequest,
|
||||
ImageDescriptionResult,
|
||||
ImagesDescriptionRequest,
|
||||
ImagesDescriptionResult,
|
||||
} from "../types.js";
|
||||
|
||||
let piModelDiscoveryRuntimePromise: Promise<
|
||||
typeof import("../../agents/pi-model-discovery-runtime.js")
|
||||
@ -16,14 +25,29 @@ function loadPiModelDiscoveryRuntime() {
|
||||
return piModelDiscoveryRuntimePromise;
|
||||
}
|
||||
|
||||
export async function describeImageWithModel(
|
||||
params: ImageDescriptionRequest,
|
||||
): Promise<ImageDescriptionResult> {
|
||||
/**
 * Picks the `maxTokens` value to send with an image request.
 *
 * @param modelMaxTokens Output-token limit declared by the model, if any.
 * @param requestedMaxTokens Desired budget; defaults to 4096.
 * @returns The requested budget, capped at the model limit when that limit is
 *          a finite positive number; otherwise the requested budget unchanged.
 */
function resolveImageToolMaxTokens(modelMaxTokens: number | undefined, requestedMaxTokens = 4096) {
  // Only a finite, positive model limit is allowed to cap the request.
  if (
    typeof modelMaxTokens === "number" &&
    Number.isFinite(modelMaxTokens) &&
    modelMaxTokens > 0
  ) {
    return Math.min(requestedMaxTokens, modelMaxTokens);
  }
  return requestedMaxTokens;
}
|
||||
|
||||
async function resolveImageRuntime(params: {
|
||||
cfg: ImageDescriptionRequest["cfg"];
|
||||
agentDir: string;
|
||||
provider: string;
|
||||
model: string;
|
||||
profile?: string;
|
||||
preferredProfile?: string;
|
||||
}): Promise<{ apiKey: string; model: Model<Api> }> {
|
||||
await ensureOpenClawModelsJson(params.cfg, params.agentDir);
|
||||
const { discoverAuthStorage, discoverModels } = await loadPiModelDiscoveryRuntime();
|
||||
const authStorage = discoverAuthStorage(params.agentDir);
|
||||
const modelRegistry = discoverModels(authStorage, params.agentDir);
|
||||
// Keep direct media config entries compatible with deprecated provider model aliases.
|
||||
const resolvedRef = normalizeModelRef(params.provider, params.model);
|
||||
const model = modelRegistry.find(resolvedRef.provider, resolvedRef.model) as Model<Api> | null;
|
||||
if (!model) {
|
||||
@ -41,33 +65,132 @@ export async function describeImageWithModel(
|
||||
});
|
||||
const apiKey = requireApiKey(apiKeyInfo, model.provider);
|
||||
authStorage.setRuntimeApiKey(model.provider, apiKey);
|
||||
return { apiKey, model };
|
||||
}
|
||||
|
||||
const base64 = params.buffer.toString("base64");
|
||||
if (isMinimaxVlmModel(model.provider, model.id)) {
|
||||
const text = await minimaxUnderstandImage({
|
||||
apiKey,
|
||||
prompt: params.prompt ?? "Describe the image.",
|
||||
imageDataUrl: `data:${params.mime ?? "image/jpeg"};base64,${base64}`,
|
||||
modelBaseUrl: model.baseUrl,
|
||||
});
|
||||
return { text, model: model.id };
|
||||
}
|
||||
|
||||
const context: Context = {
|
||||
/**
 * Builds a single-message completion context for one or more images.
 *
 * The user message carries the prompt as its first (text) part, followed by one
 * image part per entry of `images`, in order. Each buffer is base64-encoded
 * here; an image without a MIME type is labelled "image/jpeg".
 *
 * @param prompt Text instruction placed ahead of the images.
 * @param images Raw image buffers with optional MIME types.
 * @returns A context holding exactly one user message stamped with the current time.
 */
function buildImageContext(
  prompt: string,
  images: Array<{ buffer: Buffer; mime?: string }>,
): Context {
  return {
    messages: [
      {
        role: "user",
        content: [
          // Exactly one text part: the caller-resolved prompt. This helper has
          // no `params`/`base64` in scope, so nothing else may be referenced here.
          { type: "text", text: prompt },
          ...images.map((image) => ({
            type: "image" as const,
            data: image.buffer.toString("base64"),
            mimeType: image.mime ?? "image/jpeg",
          })),
        ],
        timestamp: Date.now(),
      },
    ],
  };
}
|
||||
|
||||
/**
 * Describes a set of images through the MiniMax VLM helper, issuing one
 * request per image and awaiting them sequentially.
 *
 * With more than one image, each request's prompt is suffixed with an
 * "image i of N" instruction and each answer is prefixed with an `Image i:`
 * heading; with a single image the prompt and answer are passed through as-is
 * (the answer trimmed).
 *
 * @returns The answers joined by blank lines, plus the model id that was passed in.
 */
async function describeImagesWithMinimax(params: {
  apiKey: string;
  modelId: string;
  modelBaseUrl?: string;
  prompt: string;
  images: Array<{ buffer: Buffer; mime?: string }>;
}): Promise<ImagesDescriptionResult> {
  const { apiKey, modelBaseUrl, images } = params;
  const sections: string[] = [];
  let position = 0;
  for (const image of images) {
    position += 1;
    const total = params.images.length;
    const multiple = total > 1;
    const mime = image.mime ?? "image/jpeg";
    const answer = await minimaxUnderstandImage({
      apiKey,
      prompt: multiple
        ? `${params.prompt}\n\nDescribe image ${position} of ${total} independently.`
        : params.prompt,
      imageDataUrl: `data:${mime};base64,${image.buffer.toString("base64")}`,
      modelBaseUrl,
    });
    const trimmed = answer.trim();
    sections.push(params.images.length > 1 ? `Image ${position}:\n${trimmed}` : trimmed);
  }
  return { text: sections.join("\n\n").trim(), model: params.modelId };
}
|
||||
|
||||
/**
 * Reports whether `err` is an `Error` whose message begins with
 * "Unknown model:" (matched case-insensitively).
 */
function isUnknownModelError(err: unknown): boolean {
  if (!(err instanceof Error)) {
    return false;
  }
  return /^Unknown model:/i.test(err.message);
}
|
||||
|
||||
/**
 * Looks up the base URL configured for `provider` under `cfg.models.providers`.
 *
 * @returns The trimmed base URL, or `undefined` when the entry is missing, is
 *          not a string, or is blank after trimming.
 */
function resolveConfiguredProviderBaseUrl(
  cfg: ImageDescriptionRequest["cfg"],
  provider: string,
): string | undefined {
  const configured = cfg.models?.providers?.[provider]?.baseUrl;
  if (typeof configured !== "string") {
    return undefined;
  }
  const trimmed = configured.trim();
  return trimmed ? trimmed : undefined;
}
|
||||
|
||||
/**
 * Resolves what a MiniMax VLM call needs without going through the model
 * registry: the provider's API key (resolved via `resolveApiKeyForProvider`
 * and unwrapped by `requireApiKey`) and the base URL configured for that
 * provider, if any.
 */
async function resolveMinimaxVlmFallbackRuntime(params: {
  cfg: ImageDescriptionRequest["cfg"];
  agentDir: string;
  provider: string;
  profile?: string;
  preferredProfile?: string;
}): Promise<{ apiKey: string; modelBaseUrl?: string }> {
  const { cfg, agentDir, provider, profile, preferredProfile } = params;
  const auth = await resolveApiKeyForProvider({
    provider,
    cfg,
    profileId: profile,
    preferredProfile,
    agentDir,
  });
  const apiKey = requireApiKey(auth, provider);
  const modelBaseUrl = resolveConfiguredProviderBaseUrl(cfg, provider);
  return { apiKey, modelBaseUrl };
}
|
||||
|
||||
export async function describeImagesWithModel(
|
||||
params: ImagesDescriptionRequest,
|
||||
): Promise<ImagesDescriptionResult> {
|
||||
const prompt = params.prompt ?? "Describe the image.";
|
||||
let apiKey: string;
|
||||
let model: Model<Api> | undefined;
|
||||
|
||||
try {
|
||||
const resolved = await resolveImageRuntime(params);
|
||||
apiKey = resolved.apiKey;
|
||||
model = resolved.model;
|
||||
} catch (err) {
|
||||
if (!isMinimaxVlmModel(params.provider, params.model) || !isUnknownModelError(err)) {
|
||||
throw err;
|
||||
}
|
||||
const fallback = await resolveMinimaxVlmFallbackRuntime(params);
|
||||
return await describeImagesWithMinimax({
|
||||
apiKey: fallback.apiKey,
|
||||
modelId: params.model,
|
||||
modelBaseUrl: fallback.modelBaseUrl,
|
||||
prompt,
|
||||
images: params.images,
|
||||
});
|
||||
}
|
||||
|
||||
if (isMinimaxVlmModel(model.provider, model.id)) {
|
||||
return await describeImagesWithMinimax({
|
||||
apiKey,
|
||||
modelId: model.id,
|
||||
modelBaseUrl: model.baseUrl,
|
||||
prompt,
|
||||
images: params.images,
|
||||
});
|
||||
}
|
||||
|
||||
const context = buildImageContext(prompt, params.images);
|
||||
const message = await complete(model, context, {
|
||||
apiKey,
|
||||
maxTokens: params.maxTokens ?? 512,
|
||||
maxTokens: resolveImageToolMaxTokens(model.maxTokens, params.maxTokens ?? 512),
|
||||
});
|
||||
const text = coerceImageAssistantText({
|
||||
message,
|
||||
@ -76,3 +199,26 @@ export async function describeImageWithModel(
|
||||
});
|
||||
return { text, model: model.id };
|
||||
}
|
||||
|
||||
/**
 * Describes a single image by delegating to `describeImagesWithModel` with a
 * one-element `images` list.
 *
 * The image's buffer, file name and MIME type are wrapped into that list; the
 * model, provider, prompt, token/time limits, profile selection, agent
 * directory and config are forwarded unchanged.
 */
export async function describeImageWithModel(
  params: ImageDescriptionRequest,
): Promise<ImageDescriptionResult> {
  const { buffer, fileName, mime } = params;
  const request: ImagesDescriptionRequest = {
    images: [{ buffer, fileName, mime }],
    model: params.model,
    provider: params.provider,
    prompt: params.prompt,
    maxTokens: params.maxTokens,
    timeoutMs: params.timeoutMs,
    profile: params.profile,
    preferredProfile: params.preferredProfile,
    agentDir: params.agentDir,
    cfg: params.cfg,
  };
  return await describeImagesWithModel(request);
}
|
||||
|
||||
@ -1,10 +1,33 @@
|
||||
import { anthropicMediaUnderstandingProvider } from "../../../extensions/anthropic/media-understanding-provider.js";
|
||||
import { googleMediaUnderstandingProvider } from "../../../extensions/google/media-understanding-provider.js";
|
||||
import {
|
||||
minimaxMediaUnderstandingProvider,
|
||||
minimaxPortalMediaUnderstandingProvider,
|
||||
} from "../../../extensions/minimax/media-understanding-provider.js";
|
||||
import { mistralMediaUnderstandingProvider } from "../../../extensions/mistral/media-understanding-provider.js";
|
||||
import { moonshotMediaUnderstandingProvider } from "../../../extensions/moonshot/media-understanding-provider.js";
|
||||
import { openaiMediaUnderstandingProvider } from "../../../extensions/openai/media-understanding-provider.js";
|
||||
import { zaiMediaUnderstandingProvider } from "../../../extensions/zai/media-understanding-provider.js";
|
||||
import { normalizeProviderId } from "../../agents/model-selection.js";
|
||||
import type { OpenClawConfig } from "../../config/config.js";
|
||||
import { loadOpenClawPlugins } from "../../plugins/loader.js";
|
||||
import { getActivePluginRegistry } from "../../plugins/runtime.js";
|
||||
import type { MediaUnderstandingProvider } from "../types.js";
|
||||
import { deepgramProvider } from "./deepgram/index.js";
|
||||
import { groqProvider } from "./groq/index.js";
|
||||
|
||||
/**
 * Providers merged into every media-understanding registry before plugin
 * entries and overrides are applied. Declared exactly once: a second
 * `const PROVIDERS` in the same module would be a redeclaration error.
 */
const PROVIDERS: MediaUnderstandingProvider[] = [
  groqProvider,
  deepgramProvider,
  anthropicMediaUnderstandingProvider,
  googleMediaUnderstandingProvider,
  minimaxMediaUnderstandingProvider,
  minimaxPortalMediaUnderstandingProvider,
  mistralMediaUnderstandingProvider,
  moonshotMediaUnderstandingProvider,
  openaiMediaUnderstandingProvider,
  zaiMediaUnderstandingProvider,
];
|
||||
|
||||
function mergeProviderIntoRegistry(
|
||||
registry: Map<string, MediaUnderstandingProvider>,
|
||||
@ -32,12 +55,18 @@ export function normalizeMediaProviderId(id: string): string {
|
||||
|
||||
export function buildMediaUnderstandingRegistry(
|
||||
overrides?: Record<string, MediaUnderstandingProvider>,
|
||||
cfg?: OpenClawConfig,
|
||||
): Map<string, MediaUnderstandingProvider> {
|
||||
const registry = new Map<string, MediaUnderstandingProvider>();
|
||||
for (const provider of PROVIDERS) {
|
||||
mergeProviderIntoRegistry(registry, provider);
|
||||
}
|
||||
for (const entry of getActivePluginRegistry()?.mediaUnderstandingProviders ?? []) {
|
||||
const active = getActivePluginRegistry();
|
||||
const pluginRegistry =
|
||||
(active?.mediaUnderstandingProviders?.length ?? 0) > 0 || !cfg
|
||||
? active
|
||||
: loadOpenClawPlugins({ config: cfg });
|
||||
for (const entry of pluginRegistry?.mediaUnderstandingProviders ?? []) {
|
||||
mergeProviderIntoRegistry(registry, entry.provider);
|
||||
}
|
||||
if (overrides) {
|
||||
|
||||
@ -75,8 +75,9 @@ export type RunCapabilityResult = {
|
||||
|
||||
export function buildProviderRegistry(
|
||||
overrides?: Record<string, MediaUnderstandingProvider>,
|
||||
cfg?: OpenClawConfig,
|
||||
): ProviderRegistry {
|
||||
return buildMediaUnderstandingRegistry(overrides);
|
||||
return buildMediaUnderstandingRegistry(overrides, cfg);
|
||||
}
|
||||
|
||||
export function normalizeMediaAttachments(ctx: MsgContext): MediaAttachment[] {
|
||||
|
||||
@ -48,7 +48,7 @@ export async function runMediaUnderstandingFile(
|
||||
return { text: undefined };
|
||||
}
|
||||
|
||||
const providerRegistry = buildProviderRegistry();
|
||||
const providerRegistry = buildProviderRegistry(undefined, params.cfg);
|
||||
const cache = createMediaAttachmentCache(attachments, {
|
||||
localPathRoots: [path.dirname(params.filePath)],
|
||||
});
|
||||
|
||||
@ -90,6 +90,25 @@ export type ImageDescriptionRequest = {
|
||||
buffer: Buffer;
|
||||
fileName: string;
|
||||
mime?: string;
|
||||
prompt?: string;
|
||||
maxTokens?: number;
|
||||
timeoutMs: number;
|
||||
profile?: string;
|
||||
preferredProfile?: string;
|
||||
agentDir: string;
|
||||
cfg: import("../config/config.js").OpenClawConfig;
|
||||
model: string;
|
||||
provider: string;
|
||||
};
|
||||
|
||||
export type ImagesDescriptionInput = {
|
||||
buffer: Buffer;
|
||||
fileName: string;
|
||||
mime?: string;
|
||||
};
|
||||
|
||||
export type ImagesDescriptionRequest = {
|
||||
images: ImagesDescriptionInput[];
|
||||
model: string;
|
||||
provider: string;
|
||||
prompt?: string;
|
||||
@ -106,10 +125,16 @@ export type ImageDescriptionResult = {
|
||||
model?: string;
|
||||
};
|
||||
|
||||
export type ImagesDescriptionResult = {
|
||||
text: string;
|
||||
model?: string;
|
||||
};
|
||||
|
||||
export type MediaUnderstandingProvider = {
|
||||
id: string;
|
||||
capabilities?: MediaUnderstandingCapability[];
|
||||
transcribeAudio?: (req: AudioTranscriptionRequest) => Promise<AudioTranscriptionResult>;
|
||||
describeVideo?: (req: VideoDescriptionRequest) => Promise<VideoDescriptionResult>;
|
||||
describeImage?: (req: ImageDescriptionRequest) => Promise<ImageDescriptionResult>;
|
||||
describeImages?: (req: ImagesDescriptionRequest) => Promise<ImagesDescriptionResult>;
|
||||
};
|
||||
|
||||
@ -5,12 +5,15 @@ export type {
|
||||
AudioTranscriptionResult,
|
||||
ImageDescriptionRequest,
|
||||
ImageDescriptionResult,
|
||||
ImagesDescriptionInput,
|
||||
ImagesDescriptionRequest,
|
||||
ImagesDescriptionResult,
|
||||
MediaUnderstandingProvider,
|
||||
VideoDescriptionRequest,
|
||||
VideoDescriptionResult,
|
||||
} from "../media-understanding/types.js";
|
||||
|
||||
export { describeImageWithModel } from "../media-understanding/providers/image.js";
|
||||
export { describeImageWithModel, describeImagesWithModel } from "../media-understanding/providers/image.js";
|
||||
export { transcribeOpenAiCompatibleAudio } from "../media-understanding/providers/openai-compatible-audio.js";
|
||||
export {
|
||||
assertOkOrThrowHttpError,
|
||||
|
||||
@ -43,6 +43,16 @@ function findMediaUnderstandingProviderIdsForPlugin(pluginId: string) {
|
||||
.toSorted((left, right) => left.localeCompare(right));
|
||||
}
|
||||
|
||||
function findMediaUnderstandingProviderForPlugin(pluginId: string) {
|
||||
const entry = mediaUnderstandingProviderContractRegistry.find(
|
||||
(candidate) => candidate.pluginId === pluginId,
|
||||
);
|
||||
if (!entry) {
|
||||
throw new Error(`media-understanding provider contract missing for ${pluginId}`);
|
||||
}
|
||||
return entry.provider;
|
||||
}
|
||||
|
||||
function findRegistrationForPlugin(pluginId: string) {
|
||||
const entry = pluginRegistrationContractRegistry.find(
|
||||
(candidate) => candidate.pluginId === pluginId,
|
||||
@ -141,4 +151,25 @@ describe("plugin contract registry", () => {
|
||||
expect(findSpeechProviderForPlugin("elevenlabs").listVoices).toEqual(expect.any(Function));
|
||||
expect(findSpeechProviderForPlugin("microsoft").listVoices).toEqual(expect.any(Function));
|
||||
});
|
||||
|
||||
it("keeps bundled multi-image support explicit", () => {
|
||||
expect(findMediaUnderstandingProviderForPlugin("anthropic").describeImages).toEqual(
|
||||
expect.any(Function),
|
||||
);
|
||||
expect(findMediaUnderstandingProviderForPlugin("google").describeImages).toEqual(
|
||||
expect.any(Function),
|
||||
);
|
||||
expect(findMediaUnderstandingProviderForPlugin("minimax").describeImages).toEqual(
|
||||
expect.any(Function),
|
||||
);
|
||||
expect(findMediaUnderstandingProviderForPlugin("moonshot").describeImages).toEqual(
|
||||
expect.any(Function),
|
||||
);
|
||||
expect(findMediaUnderstandingProviderForPlugin("openai").describeImages).toEqual(
|
||||
expect.any(Function),
|
||||
);
|
||||
expect(findMediaUnderstandingProviderForPlugin("zai").describeImages).toEqual(
|
||||
expect.any(Function),
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user