From f4efe1514e2ee1029e5c1b8077ee0bcebe043af9 Mon Sep 17 00:00:00 2001 From: JaiminBhojani Date: Fri, 20 Mar 2026 22:31:54 +0530 Subject: [PATCH 1/4] fix(agents): resolve imageModel "Unknown model" for custom providers (#33185) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The image tool failed with "Unknown model" for any user-configured custom provider (vllm, nvidia-api, iflow) because resolveImageRuntime used bare modelRegistry.find() which only checks built-in models.json entries. - Replace modelRegistry.find() with resolveModelWithRegistry() to use the full 4-layer resolution stack (registry → inline config → plugin → ad-hoc) - Add config-aware input field fallback for provider-prefixed model IDs (e.g. "vllm/Qwen3.5" in config vs "Qwen3.5" after ref parsing) - Fall back to describeImageWithModel in image-tool.ts when no media-understanding provider is registered for custom providers - Normalize nvidia-api provider alias to nvidia in provider-id.ts - Add regression tests for custom provider image model resolution Closes #33185 Co-Authored-By: Claude Opus 4.6 --- src/agents/model-selection.test.ts | 7 + src/agents/provider-id.ts | 3 + src/agents/tools/image-tool.ts | 41 ++++- .../providers/image.test.ts | 149 +++++++++++++----- src/media-understanding/providers/image.ts | 37 ++++- 5 files changed, 198 insertions(+), 39 deletions(-) diff --git a/src/agents/model-selection.test.ts b/src/agents/model-selection.test.ts index e7d583d106f..a421412bffa 100644 --- a/src/agents/model-selection.test.ts +++ b/src/agents/model-selection.test.ts @@ -114,6 +114,7 @@ describe("model-selection", () => { expect(normalizeProviderId("qwen")).toBe("qwen-portal"); expect(normalizeProviderId("kimi-code")).toBe("kimi"); expect(normalizeProviderId("kimi-coding")).toBe("kimi"); + expect(normalizeProviderId("nvidia-api")).toBe("nvidia"); expect(normalizeProviderId("bedrock")).toBe("amazon-bedrock"); expect(normalizeProviderId("aws-bedrock")).toBe("amazon-bedrock"); expect(normalizeProviderId("amazon-bedrock")).toBe("amazon-bedrock"); @@ -242,6 +243,12 @@ describe("model-selection", () => { defaultProvider: "anthropic", expected: { provider: "openai", model: "gpt-5.3-codex-codex" }, }, + { + name: "normalizes nvidia-api provider alias to nvidia and preserves nested model id", + variants: ["nvidia-api/meta/llama-3.2-90b-vision-instruct"], + defaultProvider: "openai", + expected: { provider: "nvidia", model: "meta/llama-3.2-90b-vision-instruct" }, + }, { name: "normalizes gemini 3.1 flash-lite ids for google-vertex", variants: ["google-vertex/gemini-3.1-flash-lite", "gemini-3.1-flash-lite"], diff --git a/src/agents/provider-id.ts b/src/agents/provider-id.ts index bd82c3c3edd..5bcde2691ac 100644 --- a/src/agents/provider-id.ts +++ b/src/agents/provider-id.ts @@ -15,6 +15,9 @@ export function normalizeProviderId(provider: string): string { if (normalized === "kimi" || normalized === "kimi-code" || normalized === "kimi-coding") { return "kimi"; } + if (normalized === "nvidia-api") { + return "nvidia"; + } if (normalized === "bedrock" || normalized === "aws-bedrock") { return "amazon-bedrock"; } diff --git a/src/agents/tools/image-tool.ts b/src/agents/tools/image-tool.ts index f72bd4fd4e7..a669e9f0f2c 100644 --- a/src/agents/tools/image-tool.ts +++ b/src/agents/tools/image-tool.ts @@ -1,5 +1,9 @@ import { Type } from "@sinclair/typebox"; import type { OpenClawConfig } from "../../config/config.js"; +import { + describeImageWithModel, + describeImagesWithModel, +} from "../../media-understanding/providers/image.js"; import { getMediaUnderstandingProvider } from "../../media-understanding/providers/index.js"; import { buildProviderRegistry } from "../../media-understanding/runner.js"; import { loadWebMedia } from "../../media/web-media.js"; @@ -146,9 +150,44 @@ async function runImagePrompt(params: { modelOverride: params.modelOverride, run: async (provider, modelId) => { const imageProvider = getMediaUnderstandingProvider(provider, providerRegistry); + + // When no media-understanding provider is registered (custom/self-hosted + // providers like vllm, nvidia-api, iflow), fall back to the generic + // model-based image description — same pattern as runner.entries.ts. if (!imageProvider) { - throw new Error(`No media-understanding provider registered for ${provider}`); + if (params.images.length > 1) { + const described = await describeImagesWithModel({ + images: params.images.map((image, index) => ({ + buffer: image.buffer, + fileName: `image-${index + 1}`, + mime: image.mimeType, + })), + provider, + model: modelId, + prompt: params.prompt, + maxTokens: resolveImageToolMaxTokens(undefined), + timeoutMs: 30_000, + cfg: providerCfg, + agentDir: params.agentDir, + }); + return { text: described.text, provider, model: described.model ?? modelId }; + } + const image = params.images[0]; + const described = await describeImageWithModel({ + buffer: image.buffer, + fileName: "image-1", + mime: image.mimeType, + provider, + model: modelId, + prompt: params.prompt, + maxTokens: resolveImageToolMaxTokens(undefined), + timeoutMs: 30_000, + cfg: providerCfg, + agentDir: params.agentDir, + }); + return { text: described.text, provider, model: described.model ?? modelId }; } + if (params.images.length > 1 && imageProvider.describeImages) { const described = await imageProvider.describeImages({ images: params.images.map((image, index) => ({ diff --git a/src/media-understanding/providers/image.test.ts b/src/media-understanding/providers/image.test.ts index 9044d8ba83d..96bd467ce84 100644 --- a/src/media-understanding/providers/image.test.ts +++ b/src/media-understanding/providers/image.test.ts @@ -16,6 +16,7 @@ const resolveApiKeyForProviderMock = vi.fn(async () => ({ const requireApiKeyMock = vi.fn((auth: { apiKey?: string }) => auth.apiKey ?? ""); const setRuntimeApiKeyMock = vi.fn(); const discoverModelsMock = vi.fn(); +const resolveModelWithRegistryMock = vi.fn(); type ImageModule = typeof import("./image.js"); let describeImageWithModel: ImageModule["describeImageWithModel"]; @@ -52,15 +53,17 @@ describe("describeImageWithModel", () => { }), discoverModels: discoverModelsMock, })); + vi.doMock("../../agents/pi-embedded-runner/model.js", () => ({ + resolveModelWithRegistry: resolveModelWithRegistryMock, + })); ({ describeImageWithModel } = await import("./image.js")); minimaxUnderstandImageMock.mockResolvedValue("portal ok"); - discoverModelsMock.mockReturnValue({ - find: vi.fn(() => ({ - provider: "minimax-portal", - id: "MiniMax-VL-01", - input: ["text", "image"], - baseUrl: "https://api.minimax.io/anthropic", - })), + discoverModelsMock.mockReturnValue({ find: vi.fn(() => null) }); + resolveModelWithRegistryMock.mockReturnValue({ + provider: "minimax-portal", + id: "MiniMax-VL-01", + input: ["text", "image"], + baseUrl: "https://api.minimax.io/anthropic", }); }); @@ -95,13 +98,11 @@ describe("describeImageWithModel", () => { }); it("uses generic completion for non-canonical minimax-portal image models", async () => { - discoverModelsMock.mockReturnValue({ - find: vi.fn(() => ({ - provider: "minimax-portal", - id: "custom-vision", - input: ["text", "image"], - baseUrl: "https://api.minimax.io/anthropic", - })), + resolveModelWithRegistryMock.mockReturnValue({ + provider: "minimax-portal", + id: "custom-vision", + input: ["text", "image"], + baseUrl: "https://api.minimax.io/anthropic", }); completeMock.mockResolvedValue({ role: "assistant", @@ -134,17 +135,12 @@ describe("describeImageWithModel", () => { }); it("normalizes deprecated google flash ids before lookup and keeps profile auth selection", async () => { - const findMock = vi.fn((provider: string, modelId: string) => { - expect(provider).toBe("google"); - expect(modelId).toBe("gemini-3-flash-preview"); - return { - provider: "google", - id: "gemini-3-flash-preview", - input: ["text", "image"], - baseUrl: "https://generativelanguage.googleapis.com/v1beta", - }; + resolveModelWithRegistryMock.mockReturnValue({ + provider: "google", + id: "gemini-3-flash-preview", + input: ["text", "image"], + baseUrl: "https://generativelanguage.googleapis.com/v1beta", }); - discoverModelsMock.mockReturnValue({ find: findMock }); completeMock.mockResolvedValue({ role: "assistant", api: "google-generative-ai", @@ -172,7 +168,7 @@ describe("describeImageWithModel", () => { text: "flash ok", model: "gemini-3-flash-preview", }); - expect(findMock).toHaveBeenCalledOnce(); + expect(resolveModelWithRegistryMock).toHaveBeenCalledOnce(); expect(getApiKeyForModelMock).toHaveBeenCalledWith( expect.objectContaining({ profileId: "google:default", @@ -182,17 +178,12 @@ describe("describeImageWithModel", () => { }); it("normalizes gemini 3.1 flash-lite ids before lookup and keeps profile auth selection", async () => { - const findMock = vi.fn((provider: string, modelId: string) => { - expect(provider).toBe("google"); - expect(modelId).toBe("gemini-3.1-flash-lite-preview"); - return { - provider: "google", - id: "gemini-3.1-flash-lite-preview", - input: ["text", "image"], - baseUrl: "https://generativelanguage.googleapis.com/v1beta", - }; + resolveModelWithRegistryMock.mockReturnValue({ + provider: "google", + id: "gemini-3.1-flash-lite-preview", + input: ["text", "image"], + baseUrl: "https://generativelanguage.googleapis.com/v1beta", }); - discoverModelsMock.mockReturnValue({ find: findMock }); completeMock.mockResolvedValue({ role: "assistant", api: "google-generative-ai", @@ -220,7 +211,7 @@ describe("describeImageWithModel", () => { text: "flash lite ok", model: "gemini-3.1-flash-lite-preview", }); - expect(findMock).toHaveBeenCalledOnce(); + expect(resolveModelWithRegistryMock).toHaveBeenCalledOnce(); expect(getApiKeyForModelMock).toHaveBeenCalledWith( expect.objectContaining({ profileId: "google:default", @@ -228,4 +219,90 @@ describe("describeImageWithModel", () => { ); expect(setRuntimeApiKeyMock).toHaveBeenCalledWith("google", "oauth-test"); }); + + it("resolves custom provider image models via config fallback when not in registry (#33185)", async () => { + // Simulate resolveModelWithRegistry returning an ad-hoc model with input: ["text"] + // (the default when model ID matching fails due to provider-prefixed IDs). + resolveModelWithRegistryMock.mockReturnValue({ + provider: "vllm", + id: "Qwen3.5", + api: "openai-completions", + baseUrl: "http://127.0.0.1:1234/v1", + input: ["text"], + contextWindow: 128000, + maxTokens: 8192, + }); + completeMock.mockResolvedValue({ + role: "assistant", + api: "openai-completions", + provider: "vllm", + model: "Qwen3.5", + stopReason: "stop", + timestamp: Date.now(), + content: [{ type: "text", text: "custom vision ok" }], + }); + + const cfg = { + models: { + providers: { + vllm: { + baseUrl: "http://127.0.0.1:1234/v1", + apiKey: "vllm-local", // pragma: allowlist secret + api: "openai-completions" as const, + models: [ + { + id: "vllm/Qwen3.5", + name: "Qwen3.5", + input: ["image", "text"] as string[], + contextWindow: 128000, + maxTokens: 8192, + }, + ], + }, + }, + }, + }; + + const result = await describeImageWithModel({ + cfg, + agentDir: "/tmp/openclaw-agent", + provider: "vllm", + model: "Qwen3.5", + buffer: Buffer.from("png-bytes"), + fileName: "image.png", + mime: "image/png", + prompt: "Describe the image.", + timeoutMs: 1000, + }); + + expect(result).toEqual({ + text: "custom vision ok", + model: "Qwen3.5", + }); + expect(resolveModelWithRegistryMock).toHaveBeenCalledWith( + expect.objectContaining({ + provider: "vllm", + modelId: "Qwen3.5", + }), + ); + expect(completeMock).toHaveBeenCalledOnce(); + }); + + it("throws Unknown model when custom provider model is not resolvable at all (#33185)", async () => { + resolveModelWithRegistryMock.mockReturnValue(undefined); + + await expect( + describeImageWithModel({ + cfg: {}, + agentDir: "/tmp/openclaw-agent", + provider: "nonexistent", + model: "fake-model", + buffer: Buffer.from("png-bytes"), + fileName: "image.png", + mime: "image/png", + prompt: "Describe the image.", + timeoutMs: 1000, + }), + ).rejects.toThrow("Unknown model: nonexistent/fake-model"); + }); }); diff --git a/src/media-understanding/providers/image.ts b/src/media-understanding/providers/image.ts index 9d7dc67949b..85fb754fdaf 100644 --- a/src/media-understanding/providers/image.ts +++ b/src/media-understanding/providers/image.ts @@ -6,8 +6,9 @@ import { requireApiKey, resolveApiKeyForProvider, } from "../../agents/model-auth.js"; -import { normalizeModelRef } from "../../agents/model-selection.js"; +import { findNormalizedProviderValue, normalizeModelRef } from "../../agents/model-selection.js"; import { ensureOpenClawModelsJson } from "../../agents/models-config.js"; +import { resolveModelWithRegistry } from "../../agents/pi-embedded-runner/model.js"; import { coerceImageAssistantText } from "../../agents/tools/image-tool.helpers.js"; import type { ImageDescriptionRequest, @@ -49,10 +50,42 @@ async function resolveImageRuntime(params: { const authStorage = discoverAuthStorage(params.agentDir); const modelRegistry = discoverModels(authStorage, params.agentDir); const resolvedRef = normalizeModelRef(params.provider, params.model); - const model = modelRegistry.find(resolvedRef.provider, resolvedRef.model) as Model | null; + + // Use the full model resolution stack (registry → inline config → plugin → + // ad-hoc provider config) instead of bare modelRegistry.find(), which misses + // user-configured custom provider models (e.g. vllm, nvidia-api, iflow). + let model = (resolveModelWithRegistry({ + provider: resolvedRef.provider, + modelId: resolvedRef.model, + modelRegistry, + cfg: params.cfg, + agentDir: params.agentDir, + }) ?? null) as Model | null; + if (!model) { throw new Error(`Unknown model: ${resolvedRef.provider}/${resolvedRef.model}`); } + + // When the model was resolved via the ad-hoc provider config fallback, the + // input field defaults to ["text"] because the config model lookup uses exact + // ID matching which can miss provider-prefixed IDs (e.g. "vllm/Qwen3.5" in + // config vs "Qwen3.5" after model ref parsing). Check the user's configured + // model definition for explicit image support so the tool works correctly. + if (!model.input?.includes("image")) { + const providerConfig = findNormalizedProviderValue( + params.cfg?.models?.providers, + resolvedRef.provider, + ); + const configuredModel = providerConfig?.models?.find( + (m) => + m.id === resolvedRef.model || + m.id === `${resolvedRef.provider}/${resolvedRef.model}`, + ); + if (configuredModel?.input?.includes("image")) { + model = { ...model, input: configuredModel.input } as Model; + } + } + if (!model.input?.includes("image")) { throw new Error(`Model does not support images: ${params.provider}/${params.model}`); } From 86231b5dbf718c2dff60e8baba6a623bb004a3a8 Mon Sep 17 00:00:00 2001 From: JaiminBhojani Date: Fri, 20 Mar 2026 23:00:19 +0530 Subject: [PATCH 2/4] =?UTF-8?q?fix:=20address=20PR=20review=20feedback=20?= =?UTF-8?q?=E2=80=94=20test=20coverage,=20formatting,=20and=20config=20fal?= =?UTF-8?q?lback?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix oxfmt formatting issue in image.ts (CI check failure) - Add toHaveBeenCalledWith assertions to Google model normalization tests to verify normalized model IDs are passed to resolveModelWithRegistry - Copy api field from configured model in config fallback, not just input, so custom providers with explicit api configs route through correct adapter - Add dedicated tests for !imageProvider fallback in image-tool.test.ts covering both single-image and multi-image branches - Fix type errors in test configs (missing reasoning/cost fields) Co-Authored-By: Claude Opus 4.6 --- src/agents/tools/image-tool.test.ts | 116 ++++++++++++++++++ .../providers/image.test.ts | 12 +- src/media-understanding/providers/image.ts | 10 +- 3 files changed, 131 insertions(+), 7 deletions(-) diff --git a/src/agents/tools/image-tool.test.ts b/src/agents/tools/image-tool.test.ts index c48a705dc01..b2dae2dd14e 100644 --- a/src/agents/tools/image-tool.test.ts +++ b/src/agents/tools/image-tool.test.ts @@ -883,3 +883,119 @@ describe("image tool response validation", () => { expect(text).toBe("hello"); }); }); + +describe("image tool custom provider fallback (#33185)", () => { + const pngB64 = ONE_PIXEL_PNG_B64; + const priorFetch = global.fetch; + registerImageToolEnvReset(priorFetch, [ + "OPENAI_API_KEY", + "ANTHROPIC_API_KEY", + "COPILOT_GITHUB_TOKEN", + "GH_TOKEN", + "GITHUB_TOKEN", + ]); + + it("falls back to describeImageWithModel for single image when no media-understanding provider is registered", async () => { + await withTempAgentDir(async (agentDir) => { + await writeAuthProfiles(agentDir, { + version: 1, + profiles: { + "vllm:default": { type: "api_key", provider: "vllm", key: "sk-vllm-test" }, + }, + }); + const cfg: OpenClawConfig = { + agents: { + defaults: { + model: { primary: "vllm/Qwen3.5" }, + imageModel: { primary: "vllm/Qwen3.5" }, + }, + }, + models: { + providers: { + vllm: { + baseUrl: "http://127.0.0.1:1234/v1", + models: [makeModelDefinition("Qwen3.5", ["text", "image"])], + }, + }, + }, + }; + const tool = createRequiredImageTool({ config: cfg, agentDir }); + + // Mock the fallback function at module level + const spy = vi + .spyOn( + await import("../../media-understanding/providers/image.js"), + "describeImageWithModel", + ) + .mockResolvedValue({ text: "custom fallback ok", model: "Qwen3.5" }); + + const res = await tool.execute("t1", { + prompt: "Describe the image.", + image: `data:image/png;base64,${pngB64}`, + }); + + expect(spy).toHaveBeenCalledWith( + expect.objectContaining({ + provider: "vllm", + model: "Qwen3.5", + prompt: "Describe the image.", + }), + ); + const text = + (res.content?.find((b: { type: string }) => b.type === "text") as { text?: string }) + ?.text ?? ""; + expect(text).toBe("custom fallback ok"); + spy.mockRestore(); + }); + }); + + it("falls back to describeImagesWithModel for multiple images when no media-understanding provider is registered", async () => { + await withTempAgentDir(async (agentDir) => { + await writeAuthProfiles(agentDir, { + version: 1, + profiles: { + "vllm:default": { type: "api_key", provider: "vllm", key: "sk-vllm-test" }, + }, + }); + const cfg: OpenClawConfig = { + agents: { + defaults: { + model: { primary: "vllm/Qwen3.5" }, + imageModel: { primary: "vllm/Qwen3.5" }, + }, + }, + models: { + providers: { + vllm: { + baseUrl: "http://127.0.0.1:1234/v1", + models: [makeModelDefinition("Qwen3.5", ["text", "image"])], + }, + }, + }, + }; + const tool = createRequiredImageTool({ config: cfg, agentDir }); + + const spy = vi + .spyOn( + await import("../../media-understanding/providers/image.js"), + "describeImagesWithModel", + ) + .mockResolvedValue({ text: "Image 1:\nfirst\n\nImage 2:\nsecond", model: "Qwen3.5" }); + + const res = await tool.execute("t1", { + prompt: "Compare these images.", + images: [`data:image/png;base64,${pngB64}`, `data:image/png;base64,${pngB64}`], + }); + + expect(spy).toHaveBeenCalledWith( + expect.objectContaining({ + provider: "vllm", + model: "Qwen3.5", + prompt: "Compare these images.", + }), + ); + expect(spy.mock.calls[0][0].images).toHaveLength(2); + spy.mockRestore(); + }); + }); +}); diff --git a/src/media-understanding/providers/image.test.ts b/src/media-understanding/providers/image.test.ts index 96bd467ce84..d78361f23bd 100644 --- a/src/media-understanding/providers/image.test.ts +++ b/src/media-understanding/providers/image.test.ts @@ -168,7 +168,9 @@ describe("describeImageWithModel", () => { text: "flash ok", model: "gemini-3-flash-preview", }); - expect(resolveModelWithRegistryMock).toHaveBeenCalledOnce(); + expect(resolveModelWithRegistryMock).toHaveBeenCalledWith( + expect.objectContaining({ provider: "google", modelId: "gemini-3-flash-preview" }), + ); expect(getApiKeyForModelMock).toHaveBeenCalledWith( expect.objectContaining({ profileId: "google:default", @@ -211,7 +213,9 @@ describe("describeImageWithModel", () => { text: "flash lite ok", model: "gemini-3.1-flash-lite-preview", }); - expect(resolveModelWithRegistryMock).toHaveBeenCalledOnce(); + expect(resolveModelWithRegistryMock).toHaveBeenCalledWith( + expect.objectContaining({ provider: "google", modelId: "gemini-3.1-flash-lite-preview" }), + ); expect(getApiKeyForModelMock).toHaveBeenCalledWith( expect.objectContaining({ profileId: "google:default", @@ -253,7 +257,9 @@ describe("describeImageWithModel", () => { { id: "vllm/Qwen3.5", name: "Qwen3.5", - input: ["image", "text"] as string[], + reasoning: false, + input: ["image", "text"] as Array<"text" | "image">, + cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, contextWindow: 128000, maxTokens: 8192, }, diff --git a/src/media-understanding/providers/image.ts b/src/media-understanding/providers/image.ts index 85fb754fdaf..630b9644515 100644 --- a/src/media-understanding/providers/image.ts +++ b/src/media-understanding/providers/image.ts @@ -77,12 +77,14 @@ async function resolveImageRuntime(params: { resolvedRef.provider, ); const configuredModel = providerConfig?.models?.find( - (m) => - m.id === resolvedRef.model || - m.id === `${resolvedRef.provider}/${resolvedRef.model}`, + (m) => m.id === resolvedRef.model || m.id === `${resolvedRef.provider}/${resolvedRef.model}`, ); if (configuredModel?.input?.includes("image")) { - model = { ...model, input: configuredModel.input } as Model; + model = { + ...model, + input: configuredModel.input, + ...(configuredModel.api ? { api: configuredModel.api } : {}), + } as Model; } } From 35a29f71f2ef8bf4593afcc07a26d31010abdf50 Mon Sep 17 00:00:00 2001 From: JaiminBhojani Date: Fri, 20 Mar 2026 23:27:45 +0530 Subject: [PATCH 3/4] fix: match provider-prefixed model IDs against original alias and resolve lint errors Match config model IDs using the original (pre-normalization) provider name (e.g. nvidia-api/meta-llama) so the image input fallback works for aliased providers. Remove unnecessary type assertion in image.ts and rename unused variable in image-tool.test.ts to satisfy oxlint. Co-Authored-By: Claude Opus 4.6 --- src/agents/tools/image-tool.test.ts | 2 +- .../providers/image.test.ts | 65 +++++++++++++++++++ src/media-understanding/providers/image.ts | 23 ++++--- 3 files changed, 81 insertions(+), 9 deletions(-) diff --git a/src/agents/tools/image-tool.test.ts b/src/agents/tools/image-tool.test.ts index b2dae2dd14e..56a48298154 100644 --- a/src/agents/tools/image-tool.test.ts +++ b/src/agents/tools/image-tool.test.ts @@ -982,7 +982,7 @@ describe("image tool custom provider fallback (#33185)", () => { ) .mockResolvedValue({ text: "Image 1:\nfirst\n\nImage 2:\nsecond", model: "Qwen3.5" }); - const res = await tool.execute("t1", { + const _res = await tool.execute("t1", { prompt: "Compare these images.", images: [`data:image/png;base64,${pngB64}`, `data:image/png;base64,${pngB64}`], }); diff --git a/src/media-understanding/providers/image.test.ts b/src/media-understanding/providers/image.test.ts index d78361f23bd..30bf6f49bbe 100644 --- a/src/media-understanding/providers/image.test.ts +++ b/src/media-understanding/providers/image.test.ts @@ -294,6 +294,71 @@ describe("describeImageWithModel", () => { expect(completeMock).toHaveBeenCalledOnce(); }); + it("matches provider-prefixed model IDs against the original provider alias (#33185)", async () => { + // When provider is "nvidia-api", resolvedRef.provider becomes "nvidia" after + // normalization, but the user's config stores "nvidia-api/meta-llama". The + // lookup must also try the original params.provider prefix. + resolveModelWithRegistryMock.mockReturnValue({ + provider: "nvidia", + id: "meta-llama", + api: "openai-completions", + baseUrl: "https://integrate.api.nvidia.com/v1", + input: ["text"], + contextWindow: 128000, + maxTokens: 4096, + }); + completeMock.mockResolvedValue({ + role: "assistant", + api: "openai-completions", + provider: "nvidia", + model: "meta-llama", + stopReason: "stop", + timestamp: Date.now(), + content: [{ type: "text", text: "nvidia vision ok" }], + }); + + const cfg = { + models: { + providers: { + "nvidia-api": { + baseUrl: "https://integrate.api.nvidia.com/v1", + apiKey: "nvidia-key", // pragma: allowlist secret + api: "openai-completions" as const, + models: [ + { + id: "nvidia-api/meta-llama", + name: "meta-llama", + reasoning: false, + input: ["image", "text"] as Array<"text" | "image">, + cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, + contextWindow: 128000, + maxTokens: 4096, + }, + ], + }, + }, + }, + }; + + const result = await describeImageWithModel({ + cfg, + agentDir: "/tmp/openclaw-agent", + provider: "nvidia-api", + model: "meta-llama", + buffer: Buffer.from("png-bytes"), + fileName: "image.png", + mime: "image/png", + prompt: "Describe the image.", + timeoutMs: 1000, + }); + + expect(result).toEqual({ + text: "nvidia vision ok", + model: "meta-llama", + }); + expect(completeMock).toHaveBeenCalledOnce(); + }); + it("throws Unknown model when custom provider model is not resolvable at all (#33185)", async () => { resolveModelWithRegistryMock.mockReturnValue(undefined); diff --git a/src/media-understanding/providers/image.ts b/src/media-understanding/providers/image.ts index 630b9644515..0dac7295f89 100644 --- a/src/media-understanding/providers/image.ts +++ b/src/media-understanding/providers/image.ts @@ -54,13 +54,14 @@ async function resolveImageRuntime(params: { // Use the full model resolution stack (registry → inline config → plugin → // ad-hoc provider config) instead of bare modelRegistry.find(), which misses // user-configured custom provider models (e.g. vllm, nvidia-api, iflow). - let model = (resolveModelWithRegistry({ - provider: resolvedRef.provider, - modelId: resolvedRef.model, - modelRegistry, - cfg: params.cfg, - agentDir: params.agentDir, - }) ?? null) as Model | null; + let model: Model | null = + resolveModelWithRegistry({ + provider: resolvedRef.provider, + modelId: resolvedRef.model, + modelRegistry, + cfg: params.cfg, + agentDir: params.agentDir, + }) ?? null; if (!model) { throw new Error(`Unknown model: ${resolvedRef.provider}/${resolvedRef.model}`); @@ -71,13 +72,19 @@ async function resolveImageRuntime(params: { // ID matching which can miss provider-prefixed IDs (e.g. "vllm/Qwen3.5" in // config vs "Qwen3.5" after model ref parsing). Check the user's configured // model definition for explicit image support so the tool works correctly. + // We also match against the original params.provider (pre-normalization) since + // configs may use aliases like "nvidia-api/meta/..." while resolvedRef.provider + // is normalized to "nvidia". if (!model.input?.includes("image")) { const providerConfig = findNormalizedProviderValue( params.cfg?.models?.providers, resolvedRef.provider, ); const configuredModel = providerConfig?.models?.find( - (m) => m.id === resolvedRef.model || m.id === `${resolvedRef.provider}/${resolvedRef.model}`, + (m) => + m.id === resolvedRef.model || + m.id === `${resolvedRef.provider}/${resolvedRef.model}` || + m.id === `${params.provider}/${resolvedRef.model}`, ); if (configuredModel?.input?.includes("image")) { model = { From 8d0db12885433b5dbdef2e52393ea2aa857e1e7b Mon Sep 17 00:00:00 2001 From: JaiminBhojani Date: Fri, 20 Mar 2026 23:47:27 +0530 Subject: [PATCH 4/4] fix: prefer exact provider alias over normalized lookup for image config fallback Use params.provider (exact key, e.g. "nvidia-api") for direct config lookup before falling back to findNormalizedProviderValue. This prevents ambiguity when configs contain both an alias and its canonical name (e.g. "nvidia-api" and "nvidia"), which could cause the wrong provider block to be selected and miss the model definition. Co-Authored-By: Claude Opus 4.6 --- .../providers/image.test.ts | 15 +++++++++++---- src/media-understanding/providers/image.ts | 18 +++++++++--------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/src/media-understanding/providers/image.test.ts b/src/media-understanding/providers/image.test.ts index 30bf6f49bbe..854b4f7c067 100644 --- a/src/media-understanding/providers/image.test.ts +++ b/src/media-understanding/providers/image.test.ts @@ -294,10 +294,11 @@ describe("describeImageWithModel", () => { expect(completeMock).toHaveBeenCalledOnce(); }); - it("matches provider-prefixed model IDs against the original provider alias (#33185)", async () => { - // When provider is "nvidia-api", resolvedRef.provider becomes "nvidia" after - // normalization, but the user's config stores "nvidia-api/meta-llama". The - // lookup must also try the original params.provider prefix. + it("prefers exact provider alias over normalized lookup for config fallback (#33185)", async () => { + // When provider is "nvidia-api", resolvedRef.provider normalizes to "nvidia". + // If the config contains both "nvidia" and "nvidia-api" entries, the exact + // params.provider key must be used so the nvidia-api/ definition is + // found rather than falling into the "nvidia" block. resolveModelWithRegistryMock.mockReturnValue({ provider: "nvidia", id: "meta-llama", @@ -320,6 +321,12 @@ describe("describeImageWithModel", () => { const cfg = { models: { providers: { + nvidia: { + baseUrl: "https://integrate.api.nvidia.com/v1", + apiKey: "nvidia-key", // pragma: allowlist secret + api: "openai-completions" as const, + models: [], + }, "nvidia-api": { baseUrl: "https://integrate.api.nvidia.com/v1", apiKey: "nvidia-key", // pragma: allowlist secret diff --git a/src/media-understanding/providers/image.ts b/src/media-understanding/providers/image.ts index 0dac7295f89..21afa58795c 100644 --- a/src/media-understanding/providers/image.ts +++ b/src/media-understanding/providers/image.ts @@ -72,19 +72,19 @@ async function resolveImageRuntime(params: { // ID matching which can miss provider-prefixed IDs (e.g. "vllm/Qwen3.5" in // config vs "Qwen3.5" after model ref parsing). Check the user's configured // model definition for explicit image support so the tool works correctly. - // We also match against the original params.provider (pre-normalization) since - // configs may use aliases like "nvidia-api/meta/..." while resolvedRef.provider - // is normalized to "nvidia". + // We prefer the exact params.provider key first so that configs containing + // both an alias (e.g. "nvidia-api") and the canonical name ("nvidia") resolve + // to the correct block — findNormalizedProviderValue would pick whichever + // entry normalizes first, which may be the wrong one. if (!model.input?.includes("image")) { - const providerConfig = findNormalizedProviderValue( - params.cfg?.models?.providers, - resolvedRef.provider, - ); + const providers = params.cfg?.models?.providers; + const providerConfig = + providers?.[params.provider] ?? findNormalizedProviderValue(providers, resolvedRef.provider); const configuredModel = providerConfig?.models?.find( (m) => m.id === resolvedRef.model || - m.id === `${resolvedRef.provider}/${resolvedRef.model}` || - m.id === `${params.provider}/${resolvedRef.model}`, + m.id === `${params.provider}/${resolvedRef.model}` || + m.id === `${resolvedRef.provider}/${resolvedRef.model}`, ); if (configuredModel?.input?.includes("image")) { model = {