fix(agents): resolve imageModel "Unknown model" for custom providers (#33185)

The image tool failed with "Unknown model" for any user-configured custom
provider (vllm, nvidia-api, iflow) because resolveImageRuntime used a bare
modelRegistry.find() call, which only checks built-in models.json entries.

- Replace modelRegistry.find() with resolveModelWithRegistry() to use the
  full 4-layer resolution stack (registry → inline config → plugin → ad-hoc)
- Add a config-aware input-field fallback for provider-prefixed model IDs
  (e.g. "vllm/Qwen3.5" in config vs "Qwen3.5" after ref parsing; see the
  config sketch below)
- Fall back to describeImageWithModel in image-tool.ts when no
  media-understanding provider is registered, which is the case for custom
  providers
- Normalize nvidia-api provider alias to nvidia in provider-id.ts
- Add regression tests for custom provider image model resolution
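
For reference, the provider-prefixed ID mismatch looks like this in a user
config (the shape mirrors the regression test fixture added below; host and
model values are illustrative):

    const cfg = {
      models: {
        providers: {
          vllm: {
            baseUrl: "http://127.0.0.1:1234/v1",
            api: "openai-completions",
            models: [
              {
                id: "vllm/Qwen3.5",       // provider-prefixed in config...
                name: "Qwen3.5",
                input: ["image", "text"], // ...image support declared here
                contextWindow: 128000,
                maxTokens: 8192,
              },
            ],
          },
        },
      },
    };

After normalizeModelRef parses the ref, lookups use the bare id "Qwen3.5",
which an exact-match search misses against "vllm/Qwen3.5"; the fallback
therefore also checks the prefixed form.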

Closes #33185

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Author: JaiminBhojani
Date:   2026-03-20 22:31:54 +05:30
Parent: bfe979dd5b
Commit: f4efe1514e
5 changed files with 198 additions and 39 deletions


@@ -114,6 +114,7 @@ describe("model-selection", () => {
     expect(normalizeProviderId("qwen")).toBe("qwen-portal");
     expect(normalizeProviderId("kimi-code")).toBe("kimi");
     expect(normalizeProviderId("kimi-coding")).toBe("kimi");
+    expect(normalizeProviderId("nvidia-api")).toBe("nvidia");
     expect(normalizeProviderId("bedrock")).toBe("amazon-bedrock");
     expect(normalizeProviderId("aws-bedrock")).toBe("amazon-bedrock");
     expect(normalizeProviderId("amazon-bedrock")).toBe("amazon-bedrock");
@@ -242,6 +243,12 @@ describe("model-selection", () => {
       defaultProvider: "anthropic",
       expected: { provider: "openai", model: "gpt-5.3-codex-codex" },
     },
+    {
+      name: "normalizes nvidia-api provider alias to nvidia and preserves nested model id",
+      variants: ["nvidia-api/meta/llama-3.2-90b-vision-instruct"],
+      defaultProvider: "openai",
+      expected: { provider: "nvidia", model: "meta/llama-3.2-90b-vision-instruct" },
+    },
     {
       name: "normalizes gemini 3.1 flash-lite ids for google-vertex",
       variants: ["google-vertex/gemini-3.1-flash-lite", "gemini-3.1-flash-lite"],


@@ -15,6 +15,9 @@ export function normalizeProviderId(provider: string): string {
   if (normalized === "kimi" || normalized === "kimi-code" || normalized === "kimi-coding") {
     return "kimi";
   }
+  if (normalized === "nvidia-api") {
+    return "nvidia";
+  }
   if (normalized === "bedrock" || normalized === "aws-bedrock") {
     return "amazon-bedrock";
   }
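
A quick illustration of the alias in context (behavior as exercised by the
model-selection tests above; the provider/model split itself happens in
normalizeModelRef):

    normalizeProviderId("nvidia-api"); // → "nvidia"
    // Only the first path segment of a model ref is treated as the provider
    // alias, so "nvidia-api/meta/llama-3.2-90b-vision-instruct" resolves to
    // provider "nvidia" with model "meta/llama-3.2-90b-vision-instruct" intact.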


@@ -1,5 +1,9 @@
 import { Type } from "@sinclair/typebox";
 import type { OpenClawConfig } from "../../config/config.js";
+import {
+  describeImageWithModel,
+  describeImagesWithModel,
+} from "../../media-understanding/providers/image.js";
 import { getMediaUnderstandingProvider } from "../../media-understanding/providers/index.js";
 import { buildProviderRegistry } from "../../media-understanding/runner.js";
 import { loadWebMedia } from "../../media/web-media.js";
@@ -146,9 +150,44 @@ async function runImagePrompt(params: {
     modelOverride: params.modelOverride,
     run: async (provider, modelId) => {
       const imageProvider = getMediaUnderstandingProvider(provider, providerRegistry);
+
+      // When no media-understanding provider is registered (custom/self-hosted
+      // providers like vllm, nvidia-api, iflow), fall back to the generic
+      // model-based image description — same pattern as runner.entries.ts.
       if (!imageProvider) {
-        throw new Error(`No media-understanding provider registered for ${provider}`);
+        if (params.images.length > 1) {
+          const described = await describeImagesWithModel({
+            images: params.images.map((image, index) => ({
+              buffer: image.buffer,
+              fileName: `image-${index + 1}`,
+              mime: image.mimeType,
+            })),
+            provider,
+            model: modelId,
+            prompt: params.prompt,
+            maxTokens: resolveImageToolMaxTokens(undefined),
+            timeoutMs: 30_000,
+            cfg: providerCfg,
+            agentDir: params.agentDir,
+          });
+          return { text: described.text, provider, model: described.model ?? modelId };
+        }
+
+        const image = params.images[0];
+        const described = await describeImageWithModel({
+          buffer: image.buffer,
+          fileName: "image-1",
+          mime: image.mimeType,
+          provider,
+          model: modelId,
+          prompt: params.prompt,
+          maxTokens: resolveImageToolMaxTokens(undefined),
+          timeoutMs: 30_000,
+          cfg: providerCfg,
+          agentDir: params.agentDir,
+        });
+        return { text: described.text, provider, model: described.model ?? modelId };
       }
       if (params.images.length > 1 && imageProvider.describeImages) {
         const described = await imageProvider.describeImages({
           images: params.images.map((image, index) => ({
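
Condensed, the dispatch order after this hunk (a sketch, not the literal
code; full argument lists are in the diff above):

    const imageProvider = getMediaUnderstandingProvider(provider, providerRegistry);
    if (!imageProvider) {
      // custom/self-hosted provider: generic model-driven description
      return params.images.length > 1
        ? describeImagesWithModel({ /* images, provider, model, prompt, ... */ })
        : describeImageWithModel({ /* single image, same options */ });
    }
    // registered provider: prefer its native describeImages/describeImage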


@@ -16,6 +16,7 @@ const resolveApiKeyForProviderMock = vi.fn(async () => ({
 const requireApiKeyMock = vi.fn((auth: { apiKey?: string }) => auth.apiKey ?? "");
 const setRuntimeApiKeyMock = vi.fn();
 const discoverModelsMock = vi.fn();
+const resolveModelWithRegistryMock = vi.fn();

 type ImageModule = typeof import("./image.js");
 let describeImageWithModel: ImageModule["describeImageWithModel"];
@@ -52,15 +53,17 @@ describe("describeImageWithModel", () => {
       }),
       discoverModels: discoverModelsMock,
     }));
+    vi.doMock("../../agents/pi-embedded-runner/model.js", () => ({
+      resolveModelWithRegistry: resolveModelWithRegistryMock,
+    }));

     ({ describeImageWithModel } = await import("./image.js"));
     minimaxUnderstandImageMock.mockResolvedValue("portal ok");
-    discoverModelsMock.mockReturnValue({
-      find: vi.fn(() => ({
-        provider: "minimax-portal",
-        id: "MiniMax-VL-01",
-        input: ["text", "image"],
-        baseUrl: "https://api.minimax.io/anthropic",
-      })),
+    discoverModelsMock.mockReturnValue({ find: vi.fn(() => null) });
+    resolveModelWithRegistryMock.mockReturnValue({
+      provider: "minimax-portal",
+      id: "MiniMax-VL-01",
+      input: ["text", "image"],
+      baseUrl: "https://api.minimax.io/anthropic",
     });
   });
@@ -95,13 +98,11 @@
   });

   it("uses generic completion for non-canonical minimax-portal image models", async () => {
-    discoverModelsMock.mockReturnValue({
-      find: vi.fn(() => ({
-        provider: "minimax-portal",
-        id: "custom-vision",
-        input: ["text", "image"],
-        baseUrl: "https://api.minimax.io/anthropic",
-      })),
+    resolveModelWithRegistryMock.mockReturnValue({
+      provider: "minimax-portal",
+      id: "custom-vision",
+      input: ["text", "image"],
+      baseUrl: "https://api.minimax.io/anthropic",
     });
     completeMock.mockResolvedValue({
       role: "assistant",
@@ -134,17 +135,12 @@
   });

   it("normalizes deprecated google flash ids before lookup and keeps profile auth selection", async () => {
-    const findMock = vi.fn((provider: string, modelId: string) => {
-      expect(provider).toBe("google");
-      expect(modelId).toBe("gemini-3-flash-preview");
-      return {
-        provider: "google",
-        id: "gemini-3-flash-preview",
-        input: ["text", "image"],
-        baseUrl: "https://generativelanguage.googleapis.com/v1beta",
-      };
+    resolveModelWithRegistryMock.mockReturnValue({
+      provider: "google",
+      id: "gemini-3-flash-preview",
+      input: ["text", "image"],
+      baseUrl: "https://generativelanguage.googleapis.com/v1beta",
     });
-    discoverModelsMock.mockReturnValue({ find: findMock });
     completeMock.mockResolvedValue({
       role: "assistant",
       api: "google-generative-ai",
@@ -172,7 +168,7 @@
       text: "flash ok",
       model: "gemini-3-flash-preview",
     });
-    expect(findMock).toHaveBeenCalledOnce();
+    expect(resolveModelWithRegistryMock).toHaveBeenCalledOnce();
     expect(getApiKeyForModelMock).toHaveBeenCalledWith(
       expect.objectContaining({
         profileId: "google:default",
@@ -182,17 +178,12 @@
   });

   it("normalizes gemini 3.1 flash-lite ids before lookup and keeps profile auth selection", async () => {
-    const findMock = vi.fn((provider: string, modelId: string) => {
-      expect(provider).toBe("google");
-      expect(modelId).toBe("gemini-3.1-flash-lite-preview");
-      return {
-        provider: "google",
-        id: "gemini-3.1-flash-lite-preview",
-        input: ["text", "image"],
-        baseUrl: "https://generativelanguage.googleapis.com/v1beta",
-      };
+    resolveModelWithRegistryMock.mockReturnValue({
+      provider: "google",
+      id: "gemini-3.1-flash-lite-preview",
+      input: ["text", "image"],
+      baseUrl: "https://generativelanguage.googleapis.com/v1beta",
     });
-    discoverModelsMock.mockReturnValue({ find: findMock });
     completeMock.mockResolvedValue({
       role: "assistant",
       api: "google-generative-ai",
@@ -220,7 +211,7 @@
       text: "flash lite ok",
       model: "gemini-3.1-flash-lite-preview",
     });
-    expect(findMock).toHaveBeenCalledOnce();
+    expect(resolveModelWithRegistryMock).toHaveBeenCalledOnce();
     expect(getApiKeyForModelMock).toHaveBeenCalledWith(
       expect.objectContaining({
         profileId: "google:default",
@@ -228,4 +219,90 @@
     );
     expect(setRuntimeApiKeyMock).toHaveBeenCalledWith("google", "oauth-test");
   });
+
+  it("resolves custom provider image models via config fallback when not in registry (#33185)", async () => {
+    // Simulate resolveModelWithRegistry returning an ad-hoc model with input: ["text"]
+    // (the default when model ID matching fails due to provider-prefixed IDs).
+    resolveModelWithRegistryMock.mockReturnValue({
+      provider: "vllm",
+      id: "Qwen3.5",
+      api: "openai-completions",
+      baseUrl: "http://127.0.0.1:1234/v1",
+      input: ["text"],
+      contextWindow: 128000,
+      maxTokens: 8192,
+    });
+    completeMock.mockResolvedValue({
+      role: "assistant",
+      api: "openai-completions",
+      provider: "vllm",
+      model: "Qwen3.5",
+      stopReason: "stop",
+      timestamp: Date.now(),
+      content: [{ type: "text", text: "custom vision ok" }],
+    });
+
+    const cfg = {
+      models: {
+        providers: {
+          vllm: {
+            baseUrl: "http://127.0.0.1:1234/v1",
+            apiKey: "vllm-local", // pragma: allowlist secret
+            api: "openai-completions" as const,
+            models: [
+              {
+                id: "vllm/Qwen3.5",
+                name: "Qwen3.5",
+                input: ["image", "text"] as string[],
+                contextWindow: 128000,
+                maxTokens: 8192,
+              },
+            ],
+          },
+        },
+      },
+    };
+
+    const result = await describeImageWithModel({
+      cfg,
+      agentDir: "/tmp/openclaw-agent",
+      provider: "vllm",
+      model: "Qwen3.5",
+      buffer: Buffer.from("png-bytes"),
+      fileName: "image.png",
+      mime: "image/png",
+      prompt: "Describe the image.",
+      timeoutMs: 1000,
+    });
+
+    expect(result).toEqual({
+      text: "custom vision ok",
+      model: "Qwen3.5",
+    });
+    expect(resolveModelWithRegistryMock).toHaveBeenCalledWith(
+      expect.objectContaining({
+        provider: "vllm",
+        modelId: "Qwen3.5",
+      }),
+    );
+    expect(completeMock).toHaveBeenCalledOnce();
+  });
+
+  it("throws Unknown model when custom provider model is not resolvable at all (#33185)", async () => {
+    resolveModelWithRegistryMock.mockReturnValue(undefined);
+
+    await expect(
+      describeImageWithModel({
+        cfg: {},
+        agentDir: "/tmp/openclaw-agent",
+        provider: "nonexistent",
+        model: "fake-model",
+        buffer: Buffer.from("png-bytes"),
+        fileName: "image.png",
+        mime: "image/png",
+        prompt: "Describe the image.",
+        timeoutMs: 1000,
+      }),
+    ).rejects.toThrow("Unknown model: nonexistent/fake-model");
+  });
 });


@@ -6,8 +6,9 @@ import {
   requireApiKey,
   resolveApiKeyForProvider,
 } from "../../agents/model-auth.js";
-import { normalizeModelRef } from "../../agents/model-selection.js";
+import { findNormalizedProviderValue, normalizeModelRef } from "../../agents/model-selection.js";
 import { ensureOpenClawModelsJson } from "../../agents/models-config.js";
+import { resolveModelWithRegistry } from "../../agents/pi-embedded-runner/model.js";
 import { coerceImageAssistantText } from "../../agents/tools/image-tool.helpers.js";
 import type {
   ImageDescriptionRequest,
@@ -49,10 +50,42 @@ async function resolveImageRuntime(params: {
   const authStorage = discoverAuthStorage(params.agentDir);
   const modelRegistry = discoverModels(authStorage, params.agentDir);
   const resolvedRef = normalizeModelRef(params.provider, params.model);
-  const model = modelRegistry.find(resolvedRef.provider, resolvedRef.model) as Model<Api> | null;
+
+  // Use the full model resolution stack (registry → inline config → plugin →
+  // ad-hoc provider config) instead of bare modelRegistry.find(), which misses
+  // user-configured custom provider models (e.g. vllm, nvidia-api, iflow).
+  let model = (resolveModelWithRegistry({
+    provider: resolvedRef.provider,
+    modelId: resolvedRef.model,
+    modelRegistry,
+    cfg: params.cfg,
+    agentDir: params.agentDir,
+  }) ?? null) as Model<Api> | null;
+
   if (!model) {
     throw new Error(`Unknown model: ${resolvedRef.provider}/${resolvedRef.model}`);
   }
+
+  // When the model was resolved via the ad-hoc provider config fallback, the
+  // input field defaults to ["text"] because the config model lookup uses exact
+  // ID matching which can miss provider-prefixed IDs (e.g. "vllm/Qwen3.5" in
+  // config vs "Qwen3.5" after model ref parsing). Check the user's configured
+  // model definition for explicit image support so the tool works correctly.
+  if (!model.input?.includes("image")) {
+    const providerConfig = findNormalizedProviderValue(
+      params.cfg?.models?.providers,
+      resolvedRef.provider,
+    );
+    const configuredModel = providerConfig?.models?.find(
+      (m) =>
+        m.id === resolvedRef.model ||
+        m.id === `${resolvedRef.provider}/${resolvedRef.model}`,
+    );
+    if (configuredModel?.input?.includes("image")) {
+      model = { ...model, input: configuredModel.input } as Model<Api>;
+    }
+  }
+
   if (!model.input?.includes("image")) {
     throw new Error(`Model does not support images: ${params.provider}/${params.model}`);
   }