Merge 2ce93382dd96dae787e426d7d097a809ceeeac5b into 8a05c05596ca9ba0735dafd8e359885de4c2c969

This commit is contained in:
Jerry-Xin 2026-03-21 14:03:48 +08:00 committed by GitHub
commit 354344da53
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 275 additions and 7 deletions

View File

@ -459,16 +459,24 @@ describe("resolveModel", () => {
});
});
it("falls back to text-only when OpenRouter API cache is empty", () => {
it("falls back to heuristic vision detection when OpenRouter API cache is empty", () => {
mockGetOpenRouterModelCapabilities.mockReturnValue(undefined);
const result = resolveModel("openrouter", "openrouter/healer-alpha", "/tmp/agent");
expect(result.error).toBeUndefined();
expect(result.model).toMatchObject({
// Vision model detected by heuristic
const visionResult = resolveModel("openrouter", "openai/gpt-4o", "/tmp/agent");
expect(visionResult.error).toBeUndefined();
expect(visionResult.model).toMatchObject({
provider: "openrouter",
id: "openrouter/healer-alpha",
reasoning: false,
id: "openai/gpt-4o",
input: ["text", "image"],
});
// Non-vision model defaults to text-only
const textResult = resolveModel("openrouter", "deepseek/deepseek-chat", "/tmp/agent");
expect(textResult.error).toBeUndefined();
expect(textResult.model).toMatchObject({
provider: "openrouter",
id: "deepseek/deepseek-chat",
input: ["text"],
});
});
@ -536,6 +544,110 @@ describe("resolveModel", () => {
});
});
it("resolves OpenRouter vision models with image input based on configured model", () => {
const cfg = {
models: {
providers: {
openrouter: {
baseUrl: "https://openrouter.ai/api/v1",
api: "openai-completions",
models: [
{
...makeModel("anthropic/claude-opus-4-6"),
input: ["text", "image"],
},
],
},
},
},
} as OpenClawConfig;
const result = resolveModel("openrouter", "anthropic/claude-opus-4-6", "/tmp/agent", cfg);
expect(result.error).toBeUndefined();
expect(result.model).toMatchObject({
provider: "openrouter",
id: "anthropic/claude-opus-4-6",
input: ["text", "image"],
});
});
it("resolves OpenRouter vision models by model ID pattern heuristic", () => {
// Without explicit config, vision models are detected by ID pattern
mockGetOpenRouterModelCapabilities.mockReturnValue(undefined);
const visionModelIds = [
"openai/gpt-4o",
"openai/gpt-4-turbo",
"anthropic/claude-3-opus",
"anthropic/claude-sonnet-4-5",
"google/gemini-1.5-pro",
"google/gemini-flash-2.0",
"qwen/qwen2-vl-72b",
"mistralai/pixtral-large",
"meta-llama/llama-3.2-90b-vision",
];
for (const modelId of visionModelIds) {
const result = resolveModel("openrouter", modelId, "/tmp/agent");
expect(result.error).toBeUndefined();
expect(result.model?.input).toContain("image");
expect(result.model?.input).toContain("text");
}
});
it("resolves OpenRouter text-only models without image input", () => {
// Models without vision patterns default to text-only
mockGetOpenRouterModelCapabilities.mockReturnValue(undefined);
const textOnlyModelIds = [
"deepseek/deepseek-chat",
"meta-llama/llama-3.1-70b-instruct",
"mistralai/mistral-large",
];
for (const modelId of textOnlyModelIds) {
const result = resolveModel("openrouter", modelId, "/tmp/agent");
expect(result.error).toBeUndefined();
expect(result.model?.input).toEqual(["text"]);
}
});
it("uses explicitly configured imageModel.primary with OpenRouter (#44648)", () => {
// Regression test: when imageModel.primary is configured to an OpenRouter vision model,
// the model should resolve with image input capability
const cfg = {
agents: {
defaults: {
imageModel: { primary: "openrouter/openai/gpt-4o" },
},
},
models: {
providers: {
openrouter: {
baseUrl: "https://openrouter.ai/api/v1",
api: "openai-completions",
models: [
{
...makeModel("openai/gpt-4o"),
input: ["text", "image"],
},
],
},
},
},
} as unknown as OpenClawConfig;
const result = resolveModel("openrouter", "openai/gpt-4o", "/tmp/agent", cfg);
expect(result.error).toBeUndefined();
expect(result.model).toMatchObject({
provider: "openrouter",
id: "openai/gpt-4o",
input: ["text", "image"],
});
});
it("prefers configured provider api metadata over discovered registry model", () => {
mockDiscoveredModel({
provider: "onehub",

View File

@ -33,6 +33,42 @@ type InlineProviderConfig = {
headers?: unknown;
};
// Hoisted to module scope so the 17 regexes are compiled once rather than
// being re-allocated on every call (this runs per model resolution).
// Patterns covered:
// - Models with "vision", "vl", or "visual" in the name
// - Claude 3+ models (claude-3-*, claude-opus-4-*, claude-sonnet-4-*, claude-haiku-4-*)
// - GPT-4 vision variants (gpt-4-vision, gpt-4o, gpt-4-turbo)
// - Gemini models (gemini-1.5-*, gemini-2-*, gemini-pro-vision, gemini-flash-*)
// - Llama vision models (llava, llama-3.2-*-vision)
// - Qwen vision models (qwen-vl, qwen2-vl)
// - Pixtral models
const VISION_MODEL_ID_PATTERNS: readonly RegExp[] = [
  /vision/,
  /\bvl\b/, // "vl" as a separate segment (e.g., qwen-vl, qwen2-vl-72b, MiniMax-VL-01)
  /visual/,
  /claude-3/,
  /claude-opus-4/,
  /claude-sonnet-4/,
  /claude-haiku-4/,
  /gpt-4o/,
  /gpt-4-turbo/,
  /gpt-4-vision/,
  /gemini-1\.5/,
  /gemini-2/,
  /gemini-pro-vision/,
  /gemini-flash/,
  /llava/,
  /llama-3\.2.*vision/,
  /pixtral/,
];

/**
 * Heuristic to detect vision-capable models based on their model ID.
 * OpenRouter models often follow naming patterns that indicate vision support.
 *
 * Used as a fallback when no explicit capability information is available,
 * so pass-through OpenRouter model IDs still resolve with a plausible
 * `input` list.
 *
 * @param modelId OpenRouter model ID, e.g. "openai/gpt-4o" (matched case-insensitively)
 * @returns true when the ID matches a known vision-model naming pattern
 */
function isLikelyVisionModel(modelId: string): boolean {
  const lower = modelId.toLowerCase();
  return VISION_MODEL_ID_PATTERNS.some((pattern) => pattern.test(lower));
}
function sanitizeModelHeaders(
headers: unknown,
opts?: { stripSecretRefMarkers?: boolean },
@ -248,6 +284,7 @@ export function resolveModelWithRegistry(params: {
}
const { provider, modelId, cfg, modelRegistry, agentDir } = params;
const normalizedProvider = normalizeProviderId(provider);
const providerConfig = resolveConfiguredProviderConfig(cfg, provider);
const pluginDynamicModel = runProviderDynamicModel({
provider,
@ -270,6 +307,58 @@ export function resolveModelWithRegistry(params: {
});
}
// OpenRouter is a pass-through proxy - any model ID available on OpenRouter
// should work without being pre-registered in the local catalog.
// This fallback uses heuristics when the plugin-based capability lookup returns nothing.
// Note: configured models with provider-level `api` return early via inlineMatch,
// so we rely on heuristic detection for vision support here, unless explicitly configured.
if (normalizedProvider === "openrouter") {
// Honor explicitly configured input from providerConfig.models before applying heuristic.
const configuredOpenRouterModel = providerConfig?.models?.find(
(candidate) => candidate.id === modelId,
);
const configuredInput = configuredOpenRouterModel?.input
? configuredOpenRouterModel.input.filter((item) => item === "text" || item === "image")
: undefined;
const resolvedInput: Array<"text" | "image"> =
configuredInput && configuredInput.length > 0
? configuredInput
: isLikelyVisionModel(modelId)
? ["text", "image"]
: ["text"];
const providerHeaders = sanitizeModelHeaders(providerConfig?.headers, {
stripSecretRefMarkers: true,
});
const modelHeaders = sanitizeModelHeaders(configuredOpenRouterModel?.headers, {
stripSecretRefMarkers: true,
});
return normalizeResolvedModel({
provider,
cfg,
agentDir,
model: {
id: modelId,
name: modelId,
api: configuredOpenRouterModel?.api ?? providerConfig?.api ?? "openai-completions",
provider,
baseUrl: providerConfig?.baseUrl ?? "https://openrouter.ai/api/v1",
reasoning: configuredOpenRouterModel?.reasoning ?? false,
input: resolvedInput,
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
contextWindow: configuredOpenRouterModel?.contextWindow ?? DEFAULT_CONTEXT_TOKENS,
// Align with OPENROUTER_DEFAULT_MAX_TOKENS in models-config.providers.ts
maxTokens: configuredOpenRouterModel?.maxTokens ?? 8192,
...(providerHeaders || modelHeaders
? { headers: { ...providerHeaders, ...modelHeaders } }
: {}),
} as Model<Api>,
});
}
// Fallback for non-OpenRouter providers with custom providerConfig or mock models.
// OpenRouter returns early above using isLikelyVisionModel heuristic.
const configuredModel = providerConfig?.models?.find((candidate) => candidate.id === modelId);
const providerHeaders = sanitizeModelHeaders(providerConfig?.headers, {
stripSecretRefMarkers: true,

View File

@ -466,6 +466,73 @@ describe("image tool implicit imageModel config", () => {
});
});
it("uses explicitly configured openrouter imageModel.primary with image input (#44648)", async () => {
  await withTempAgentDir(async (agentDir) => {
    vi.stubEnv("OPENROUTER_API_KEY", "openrouter-test");
    const fetchMock = stubOpenAiCompletionsOkFetch("ok openrouter");
    // Text-only primary model plus a vision-capable imageModel.primary,
    // both explicitly declared in the provider config.
    const config: OpenClawConfig = {
      agents: {
        defaults: {
          model: { primary: "openrouter/deepseek/deepseek-chat" },
          imageModel: { primary: "openrouter/openai/gpt-4o" },
        },
      },
      models: {
        providers: {
          openrouter: {
            api: "openai-completions",
            baseUrl: "https://openrouter.ai/api/v1",
            models: [
              makeModelDefinition("openai/gpt-4o", ["text", "image"]),
              makeModelDefinition("deepseek/deepseek-chat", ["text"]),
            ],
          },
        },
      },
    };
    const tool = requireImageTool(createImageTool({ config, agentDir }));
    const output = await tool.execute("t1", {
      prompt: "Describe this image.",
      image: `data:image/png;base64,${ONE_PIXEL_PNG_B64}`,
    });
    expect(fetchMock).toHaveBeenCalledTimes(1);
    const [requestUrl] = fetchMock.mock.calls[0] as [unknown];
    expect(String(requestUrl)).toBe("https://openrouter.ai/api/v1/chat/completions");
    expect(output.content).toEqual(
      expect.arrayContaining([expect.objectContaining({ type: "text", text: "ok openrouter" })]),
    );
  });
});
it("resolves OpenRouter vision models by ID pattern heuristic (#44648)", async () => {
  await withTempAgentDir(async (agentDir) => {
    vi.stubEnv("OPENROUTER_API_KEY", "openrouter-test");
    const fetchMock = stubOpenAiCompletionsOkFetch("ok vision");
    // No provider model list here: the imageModel must be recognized as
    // vision-capable purely from its ID naming pattern.
    const config: OpenClawConfig = {
      agents: {
        defaults: {
          model: { primary: "openrouter/some-text-model" },
          imageModel: { primary: "openrouter/anthropic/claude-3-opus" },
        },
      },
    };
    const tool = requireImageTool(createImageTool({ config, agentDir }));
    const output = await tool.execute("t1", {
      prompt: "Describe this image.",
      image: `data:image/png;base64,${ONE_PIXEL_PNG_B64}`,
    });
    expect(fetchMock).toHaveBeenCalledTimes(1);
    expect(output.content).toEqual(
      expect.arrayContaining([expect.objectContaining({ type: "text", text: "ok vision" })]),
    );
  });
});
it("exposes an Anthropic-safe image schema without union keywords", async () => {
await withMinimaxImageToolFromTempAgentDir(async (tool) => {
const violations = findSchemaUnionKeywords(tool.parameters, "image.parameters");