fix(image): respect imageModel config for OpenRouter vision models

Fix OpenRouter models being hardcoded with `input: ["text"]`, which caused
image tool validation to fail even when `imageModel.primary` was correctly
configured to point at an OpenRouter vision model.

Changes:
- Check the provider config for model input capabilities before falling back to heuristics
- Add heuristic detection of vision models by model ID patterns
- Propagate configured model metadata (reasoning, cost, context window, max tokens)

Fixes #44648
Author: 忻役 2026-03-13 13:30:16 +08:00, committed by Jerry-Xin
parent c08f2aa21a
commit 0a59973284
3 changed files with 262 additions and 7 deletions
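
For illustration, a config of the following shape now resolves the image tool's model with image input instead of failing validation. This sketch mirrors the regression test added in this commit; the model entry is abbreviated here, where the tests build the full shape via a makeModelDefinition helper:

const cfg: OpenClawConfig = {
  agents: {
    defaults: {
      model: { primary: "openrouter/deepseek/deepseek-chat" },
      imageModel: { primary: "openrouter/openai/gpt-4o" },
    },
  },
  models: {
    providers: {
      openrouter: {
        api: "openai-completions",
        baseUrl: "https://openrouter.ai/api/v1",
        // Abbreviated model entry; the tests build this via makeModelDefinition.
        models: [{ id: "openai/gpt-4o", input: ["text", "image"] }],
      },
    },
  },
};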

File 1 of 3: resolveModel tests

@@ -458,16 +458,24 @@ describe("resolveModel", () => {
     });
   });
-  it("falls back to text-only when OpenRouter API cache is empty", () => {
+  it("falls back to heuristic vision detection when OpenRouter API cache is empty", () => {
     mockGetOpenRouterModelCapabilities.mockReturnValue(undefined);
-    const result = resolveModel("openrouter", "openrouter/healer-alpha", "/tmp/agent");
-    expect(result.error).toBeUndefined();
-    expect(result.model).toMatchObject({
+    // Vision model detected by heuristic
+    const visionResult = resolveModel("openrouter", "openai/gpt-4o", "/tmp/agent");
+    expect(visionResult.error).toBeUndefined();
+    expect(visionResult.model).toMatchObject({
       provider: "openrouter",
-      id: "openrouter/healer-alpha",
-      reasoning: false,
+      id: "openai/gpt-4o",
+      input: ["text", "image"],
     });
+    // Non-vision model defaults to text-only
+    const textResult = resolveModel("openrouter", "deepseek/deepseek-chat", "/tmp/agent");
+    expect(textResult.error).toBeUndefined();
+    expect(textResult.model).toMatchObject({
+      provider: "openrouter",
+      id: "deepseek/deepseek-chat",
+      input: ["text"],
+    });
   });
@@ -535,6 +543,110 @@ describe("resolveModel", () => {
     });
   });
+  it("resolves OpenRouter vision models with image input based on configured model", () => {
+    const cfg = {
+      models: {
+        providers: {
+          openrouter: {
+            baseUrl: "https://openrouter.ai/api/v1",
+            api: "openai-completions",
+            models: [
+              {
+                ...makeModel("anthropic/claude-opus-4-6"),
+                input: ["text", "image"],
+              },
+            ],
+          },
+        },
+      },
+    } as OpenClawConfig;
+    const result = resolveModel("openrouter", "anthropic/claude-opus-4-6", "/tmp/agent", cfg);
+    expect(result.error).toBeUndefined();
+    expect(result.model).toMatchObject({
+      provider: "openrouter",
+      id: "anthropic/claude-opus-4-6",
+      input: ["text", "image"],
+    });
+  });
+
+  it("resolves OpenRouter vision models by model ID pattern heuristic", () => {
+    // Without explicit config, vision models are detected by ID pattern
+    mockGetOpenRouterModelCapabilities.mockReturnValue(undefined);
+    const visionModelIds = [
+      "openai/gpt-4o",
+      "openai/gpt-4-turbo",
+      "anthropic/claude-3-opus",
+      "anthropic/claude-sonnet-4-5",
+      "google/gemini-1.5-pro",
+      "google/gemini-flash-2.0",
+      "qwen/qwen2-vl-72b",
+      "mistralai/pixtral-large",
+      "meta-llama/llama-3.2-90b-vision",
+    ];
+    for (const modelId of visionModelIds) {
+      const result = resolveModel("openrouter", modelId, "/tmp/agent");
+      expect(result.error).toBeUndefined();
+      expect(result.model?.input).toContain("image");
+      expect(result.model?.input).toContain("text");
+    }
+  });
+
+  it("resolves OpenRouter text-only models without image input", () => {
+    // Models without vision patterns default to text-only
+    mockGetOpenRouterModelCapabilities.mockReturnValue(undefined);
+    const textOnlyModelIds = [
+      "deepseek/deepseek-chat",
+      "meta-llama/llama-3.1-70b-instruct",
+      "mistralai/mistral-large",
+    ];
+    for (const modelId of textOnlyModelIds) {
+      const result = resolveModel("openrouter", modelId, "/tmp/agent");
+      expect(result.error).toBeUndefined();
+      expect(result.model?.input).toEqual(["text"]);
+    }
+  });
+
+  it("uses explicitly configured imageModel.primary with OpenRouter (#44648)", () => {
+    // Regression test: when imageModel.primary is configured to an OpenRouter vision model,
+    // the model should resolve with image input capability
+    const cfg = {
+      agents: {
+        defaults: {
+          imageModel: { primary: "openrouter/openai/gpt-4o" },
+        },
+      },
+      models: {
+        providers: {
+          openrouter: {
+            baseUrl: "https://openrouter.ai/api/v1",
+            api: "openai-completions",
+            models: [
+              {
+                ...makeModel("openai/gpt-4o"),
+                input: ["text", "image"],
+              },
+            ],
+          },
+        },
+      },
+    } as unknown as OpenClawConfig;
+    const result = resolveModel("openrouter", "openai/gpt-4o", "/tmp/agent", cfg);
+    expect(result.error).toBeUndefined();
+    expect(result.model).toMatchObject({
+      provider: "openrouter",
+      id: "openai/gpt-4o",
+      input: ["text", "image"],
+    });
+  });
+
   it("prefers configured provider api metadata over discovered registry model", () => {
     mockDiscoveredModel({
       provider: "onehub",
File 2 of 3: model resolution (isLikelyVisionModel, resolveModelWithRegistry)

@@ -33,6 +33,47 @@ type InlineProviderConfig = {
   headers?: unknown;
 };
+
+/**
+ * Heuristic to detect vision-capable models based on their model ID.
+ * OpenRouter models often follow naming patterns that indicate vision support.
+ */
+function isLikelyVisionModel(modelId: string): boolean {
+  const lower = modelId.toLowerCase();
+  // Common vision model patterns:
+  // - Models with "vision", "vl", or "visual" in the name
+  // - Claude 3+ models (claude-3-*, claude-opus-4-*, claude-sonnet-4-*) except claude-2
+  // - GPT-4 vision variants (gpt-4-vision, gpt-4o, gpt-4-turbo)
+  // - Gemini models (gemini-1.5-*, gemini-2-*, gemini-pro-vision)
+  // - Llama vision models (llava, llama-3.2-*-vision)
+  // - Qwen vision models (qwen-vl, qwen2-vl)
+  // - Pixtral models
+  const visionPatterns = [
+    /vision/,
+    /\bvl\b/, // "vl" as a separate segment (e.g., qwen-vl, MiniMax-VL-01)
+    /-vl-/, // "vl" as a segment (e.g., qwen2-vl-72b)
+    /visual/,
+    /claude-3/,
+    /claude-opus-4/,
+    /claude-sonnet-4/,
+    /claude-haiku-4/,
+    /gpt-4o/,
+    /gpt-4-turbo/,
+    /gpt-4-vision/,
+    /gpt-5/,
+    /gemini-1\.5/,
+    /gemini-2/,
+    /gemini-pro-vision/,
+    /gemini-flash/,
+    /llava/,
+    /llama-3\.2.*vision/,
+    /pixtral/,
+    /qwen-vl/,
+    /qwen2-vl/,
+    /qwen2\.5-vl/,
+  ];
+  return visionPatterns.some((pattern) => pattern.test(lower));
+}
+
 function sanitizeModelHeaders(
   headers: unknown,
   opts?: { stripSecretRefMarkers?: boolean },
@@ -248,6 +289,7 @@ export function resolveModelWithRegistry(params: {
   }
   const { provider, modelId, cfg, modelRegistry, agentDir } = params;
   const normalizedProvider = normalizeProviderId(provider);
+  const providerConfig = resolveConfiguredProviderConfig(cfg, provider);
   const pluginDynamicModel = runProviderDynamicModel({
     provider,
@@ -270,6 +312,40 @@ export function resolveModelWithRegistry(params: {
     });
   }
+  // OpenRouter is a pass-through proxy - any model ID available on OpenRouter
+  // should work without being pre-registered in the local catalog.
+  // This fallback uses heuristics when the plugin-based capability lookup returns nothing.
+  if (normalizedProvider === "openrouter") {
+    // Check if the provider config specifies input capabilities for this model
+    const configuredModel = providerConfig?.models?.find((candidate) => candidate.id === modelId);
+    const configuredInput = configuredModel?.input;
+    // Use configured input if available, otherwise detect vision models by ID pattern
+    const resolvedInput: Array<"text" | "image"> =
+      Array.isArray(configuredInput) && configuredInput.length > 0
+        ? configuredInput.filter((item): item is "text" | "image" => item === "text" || item === "image")
+        : isLikelyVisionModel(modelId)
+          ? ["text", "image"]
+          : ["text"];
+    return normalizeResolvedModel({
+      provider,
+      cfg,
+      agentDir,
+      model: {
+        id: modelId,
+        name: modelId,
+        api: "openai-completions",
+        provider,
+        baseUrl: "https://openrouter.ai/api/v1",
+        reasoning: configuredModel?.reasoning ?? false,
+        input: resolvedInput,
+        cost: configuredModel?.cost ?? { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
+        contextWindow: configuredModel?.contextWindow ?? DEFAULT_CONTEXT_TOKENS,
+        // Align with OPENROUTER_DEFAULT_MAX_TOKENS in models-config.providers.ts
+        maxTokens: configuredModel?.maxTokens ?? 8192,
+      } as Model<Api>,
+    });
+  }
+
   const configuredModel = providerConfig?.models?.find((candidate) => candidate.id === modelId);
   const providerHeaders = sanitizeModelHeaders(providerConfig?.headers, {
     stripSecretRefMarkers: true,
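
As a reading aid (not part of the diff), the fallback's decisions for the model IDs exercised by the tests in this commit can be read directly off the patterns above. Explicit `input` entries in the provider config always win over the heuristic:

// Illustration only: expected isLikelyVisionModel results for test IDs.
isLikelyVisionModel("openai/gpt-4o");                   // true  -> input: ["text", "image"]
isLikelyVisionModel("qwen/qwen2-vl-72b");               // true  (matches /-vl-/ and /qwen2-vl/)
isLikelyVisionModel("meta-llama/llama-3.2-90b-vision"); // true  (matches /vision/)
isLikelyVisionModel("deepseek/deepseek-chat");          // false -> input: ["text"]
isLikelyVisionModel("mistralai/mistral-large");         // false -> input: ["text"]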

File 3 of 3: image tool tests

@@ -465,6 +465,73 @@ describe("image tool implicit imageModel config", () => {
     });
   });
+  it("uses explicitly configured openrouter imageModel.primary with image input (#44648)", async () => {
+    await withTempAgentDir(async (agentDir) => {
+      vi.stubEnv("OPENROUTER_API_KEY", "openrouter-test");
+      const fetch = stubOpenAiCompletionsOkFetch("ok openrouter");
+      const cfg: OpenClawConfig = {
+        agents: {
+          defaults: {
+            model: { primary: "openrouter/deepseek/deepseek-chat" },
+            imageModel: { primary: "openrouter/openai/gpt-4o" },
+          },
+        },
+        models: {
+          providers: {
+            openrouter: {
+              api: "openai-completions",
+              baseUrl: "https://openrouter.ai/api/v1",
+              models: [
+                makeModelDefinition("openai/gpt-4o", ["text", "image"]),
+                makeModelDefinition("deepseek/deepseek-chat", ["text"]),
+              ],
+            },
+          },
+        },
+      };
+      const tool = requireImageTool(createImageTool({ config: cfg, agentDir }));
+      const result = await tool.execute("t1", {
+        prompt: "Describe this image.",
+        image: `data:image/png;base64,${ONE_PIXEL_PNG_B64}`,
+      });
+      expect(fetch).toHaveBeenCalledTimes(1);
+      const [url] = fetch.mock.calls[0] as [unknown];
+      expect(String(url)).toBe("https://openrouter.ai/api/v1/chat/completions");
+      expect(result.content).toEqual(
+        expect.arrayContaining([expect.objectContaining({ type: "text", text: "ok openrouter" })]),
+      );
+    });
+  });
+
+  it("resolves OpenRouter vision models by ID pattern heuristic (#44648)", async () => {
+    await withTempAgentDir(async (agentDir) => {
+      vi.stubEnv("OPENROUTER_API_KEY", "openrouter-test");
+      const fetch = stubOpenAiCompletionsOkFetch("ok vision");
+      // No explicit model config - relies on ID pattern detection
+      const cfg: OpenClawConfig = {
+        agents: {
+          defaults: {
+            model: { primary: "openrouter/some-text-model" },
+            imageModel: { primary: "openrouter/anthropic/claude-3-opus" },
+          },
+        },
+      };
+      const tool = requireImageTool(createImageTool({ config: cfg, agentDir }));
+      const result = await tool.execute("t1", {
+        prompt: "Describe this image.",
+        image: `data:image/png;base64,${ONE_PIXEL_PNG_B64}`,
+      });
+      expect(fetch).toHaveBeenCalledTimes(1);
+      expect(result.content).toEqual(
+        expect.arrayContaining([expect.objectContaining({ type: "text", text: "ok vision" })]),
+      );
+    });
+  });
+
   it("exposes an Anthropic-safe image schema without union keywords", async () => {
     await withMinimaxImageToolFromTempAgentDir(async (tool) => {
       const violations = findSchemaUnionKeywords(tool.parameters, "image.parameters");