diff --git a/src/agents/pi-embedded-runner/model.test.ts b/src/agents/pi-embedded-runner/model.test.ts
index b733e3a3f5f..f007c9e7475 100644
--- a/src/agents/pi-embedded-runner/model.test.ts
+++ b/src/agents/pi-embedded-runner/model.test.ts
@@ -459,16 +459,24 @@ describe("resolveModel", () => {
     });
   });
 
-  it("falls back to text-only when OpenRouter API cache is empty", () => {
+  it("falls back to heuristic vision detection when OpenRouter API cache is empty", () => {
     mockGetOpenRouterModelCapabilities.mockReturnValue(undefined);
 
-    const result = resolveModel("openrouter", "openrouter/healer-alpha", "/tmp/agent");
-
-    expect(result.error).toBeUndefined();
-    expect(result.model).toMatchObject({
+    // Vision model detected by heuristic
+    const visionResult = resolveModel("openrouter", "openai/gpt-4o", "/tmp/agent");
+    expect(visionResult.error).toBeUndefined();
+    expect(visionResult.model).toMatchObject({
       provider: "openrouter",
-      id: "openrouter/healer-alpha",
-      reasoning: false,
+      id: "openai/gpt-4o",
+      input: ["text", "image"],
+    });
+
+    // Non-vision model defaults to text-only
+    const textResult = resolveModel("openrouter", "deepseek/deepseek-chat", "/tmp/agent");
+    expect(textResult.error).toBeUndefined();
+    expect(textResult.model).toMatchObject({
+      provider: "openrouter",
+      id: "deepseek/deepseek-chat",
       input: ["text"],
     });
   });
@@ -536,6 +544,110 @@ describe("resolveModel", () => {
     });
   });
 
+  it("resolves OpenRouter vision models with image input based on configured model", () => {
+    const cfg = {
+      models: {
+        providers: {
+          openrouter: {
+            baseUrl: "https://openrouter.ai/api/v1",
+            api: "openai-completions",
+            models: [
+              {
+                ...makeModel("anthropic/claude-opus-4-6"),
+                input: ["text", "image"],
+              },
+            ],
+          },
+        },
+      },
+    } as OpenClawConfig;
+
+    const result = resolveModel("openrouter", "anthropic/claude-opus-4-6", "/tmp/agent", cfg);
+
+    expect(result.error).toBeUndefined();
+    expect(result.model).toMatchObject({
+      provider: "openrouter",
+      id: "anthropic/claude-opus-4-6",
+      input: ["text", "image"],
+    });
+  });
+
+  it("resolves OpenRouter vision models by model ID pattern heuristic", () => {
+    // Without explicit config, vision models are detected by ID pattern
+    mockGetOpenRouterModelCapabilities.mockReturnValue(undefined);
+    const visionModelIds = [
+      "openai/gpt-4o",
+      "openai/gpt-4-turbo",
+      "anthropic/claude-3-opus",
+      "anthropic/claude-sonnet-4-5",
+      "google/gemini-1.5-pro",
+      "google/gemini-flash-2.0",
+      "qwen/qwen2-vl-72b",
+      "mistralai/pixtral-large",
+      "meta-llama/llama-3.2-90b-vision",
+    ];
+
+    for (const modelId of visionModelIds) {
+      const result = resolveModel("openrouter", modelId, "/tmp/agent");
+
+      expect(result.error).toBeUndefined();
+      expect(result.model?.input).toContain("image");
+      expect(result.model?.input).toContain("text");
+    }
+  });
+
+  it("resolves OpenRouter text-only models without image input", () => {
+    // Models without vision patterns default to text-only
+    mockGetOpenRouterModelCapabilities.mockReturnValue(undefined);
+    const textOnlyModelIds = [
+      "deepseek/deepseek-chat",
+      "meta-llama/llama-3.1-70b-instruct",
+      "mistralai/mistral-large",
+    ];
+
+    for (const modelId of textOnlyModelIds) {
+      const result = resolveModel("openrouter", modelId, "/tmp/agent");
+
+      expect(result.error).toBeUndefined();
+      expect(result.model?.input).toEqual(["text"]);
+    }
+  });
+
+  it("uses explicitly configured imageModel.primary with OpenRouter (#44648)", () => {
+    // Regression test: when imageModel.primary is configured to an OpenRouter vision model,
+    // the model should resolve with image input capability
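+    // (Illustrative note: "openrouter/openai/gpt-4o" is the provider-prefixed
+    // form, i.e. provider "openrouter" with model ID "openai/gpt-4o".)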
+    const cfg = {
+      agents: {
+        defaults: {
+          imageModel: { primary: "openrouter/openai/gpt-4o" },
+        },
+      },
+      models: {
+        providers: {
+          openrouter: {
+            baseUrl: "https://openrouter.ai/api/v1",
+            api: "openai-completions",
+            models: [
+              {
+                ...makeModel("openai/gpt-4o"),
+                input: ["text", "image"],
+              },
+            ],
+          },
+        },
+      },
+    } as unknown as OpenClawConfig;
+
+    const result = resolveModel("openrouter", "openai/gpt-4o", "/tmp/agent", cfg);
+
+    expect(result.error).toBeUndefined();
+    expect(result.model).toMatchObject({
+      provider: "openrouter",
+      id: "openai/gpt-4o",
+      input: ["text", "image"],
+    });
+  });
+
   it("prefers configured provider api metadata over discovered registry model", () => {
     mockDiscoveredModel({
       provider: "onehub",
diff --git a/src/agents/pi-embedded-runner/model.ts b/src/agents/pi-embedded-runner/model.ts
index 5bf97a683d0..544690a3b34 100644
--- a/src/agents/pi-embedded-runner/model.ts
+++ b/src/agents/pi-embedded-runner/model.ts
@@ -33,6 +33,42 @@ type InlineProviderConfig = {
   headers?: unknown;
 };
 
+/**
+ * Heuristic to detect vision-capable models based on their model ID.
+ * OpenRouter models often follow naming patterns that indicate vision support.
+ */
+function isLikelyVisionModel(modelId: string): boolean {
+  const lower = modelId.toLowerCase();
+  // Common vision model patterns:
+  // - Models with "vision", "vl", or "visual" in the name
+  // - Claude 3+ models (claude-3-*, claude-opus-4-*, claude-sonnet-4-*) except claude-2
+  // - GPT-4 vision variants (gpt-4-vision, gpt-4o, gpt-4-turbo)
+  // - Gemini models (gemini-1.5-*, gemini-2-*, gemini-pro-vision)
+  // - Llama vision models (llava, llama-3.2-*-vision)
+  // - Qwen vision models (qwen-vl, qwen2-vl)
+  // - Pixtral models
+  const visionPatterns = [
+    /vision/,
+    /\bvl\b/, // "vl" as a separate segment (e.g., qwen-vl, qwen2-vl-72b, MiniMax-VL-01)
+    /visual/,
+    /claude-3/,
+    /claude-opus-4/,
+    /claude-sonnet-4/,
+    /claude-haiku-4/,
+    /gpt-4o/,
+    /gpt-4-turbo/,
+    /gpt-4-vision/,
+    /gemini-1\.5/,
+    /gemini-2/,
+    /gemini-pro-vision/,
+    /gemini-flash/,
+    /llava/,
+    /llama-3\.2.*vision/,
+    /pixtral/,
+  ];
+  return visionPatterns.some((pattern) => pattern.test(lower));
+}
+
 function sanitizeModelHeaders(
   headers: unknown,
   opts?: { stripSecretRefMarkers?: boolean },
@@ -248,6 +284,7 @@ export function resolveModelWithRegistry(params: {
   }
 
   const { provider, modelId, cfg, modelRegistry, agentDir } = params;
+  const normalizedProvider = normalizeProviderId(provider);
   const providerConfig = resolveConfiguredProviderConfig(cfg, provider);
   const pluginDynamicModel = runProviderDynamicModel({
     provider,
@@ -270,6 +307,58 @@
     });
   }
 
+  // OpenRouter is a pass-through proxy - any model ID available on OpenRouter
+  // should work without being pre-registered in the local catalog.
+  // This fallback uses heuristics when the plugin-based capability lookup returns nothing.
+  // Note: configured models with provider-level `api` return early via inlineMatch,
+  // so we rely on heuristic detection for vision support here, unless explicitly configured.
+  if (normalizedProvider === "openrouter") {
+    // Honor explicitly configured input from providerConfig.models before applying heuristic.
+    const configuredOpenRouterModel = providerConfig?.models?.find(
+      (candidate) => candidate.id === modelId,
+    );
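+    // Resolution order (illustrative): an explicit `input` on the configured
+    // model wins; otherwise the ID heuristic decides, e.g. "openai/gpt-4o"
+    // resolves to ["text", "image"] while "deepseek/deepseek-chat" stays ["text"].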
+    const configuredInput = configuredOpenRouterModel?.input
+      ? configuredOpenRouterModel.input.filter((item) => item === "text" || item === "image")
+      : undefined;
+    const resolvedInput: Array<"text" | "image"> =
+      configuredInput && configuredInput.length > 0
+        ? configuredInput
+        : isLikelyVisionModel(modelId)
+          ? ["text", "image"]
+          : ["text"];
+
+    const providerHeaders = sanitizeModelHeaders(providerConfig?.headers, {
+      stripSecretRefMarkers: true,
+    });
+    const modelHeaders = sanitizeModelHeaders(configuredOpenRouterModel?.headers, {
+      stripSecretRefMarkers: true,
+    });
+
+    return normalizeResolvedModel({
+      provider,
+      cfg,
+      agentDir,
+      model: {
+        id: modelId,
+        name: modelId,
+        api: configuredOpenRouterModel?.api ?? providerConfig?.api ?? "openai-completions",
+        provider,
+        baseUrl: providerConfig?.baseUrl ?? "https://openrouter.ai/api/v1",
+        reasoning: configuredOpenRouterModel?.reasoning ?? false,
+        input: resolvedInput,
+        cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
+        contextWindow: configuredOpenRouterModel?.contextWindow ?? DEFAULT_CONTEXT_TOKENS,
+        // Align with OPENROUTER_DEFAULT_MAX_TOKENS in models-config.providers.ts
+        maxTokens: configuredOpenRouterModel?.maxTokens ?? 8192,
+        ...(providerHeaders || modelHeaders
+          ? { headers: { ...providerHeaders, ...modelHeaders } }
+          : {}),
+      } as Model,
+    });
+  }
+
+  // Fallback for non-OpenRouter providers with custom providerConfig or mock models.
+  // OpenRouter returns early above using isLikelyVisionModel heuristic.
   const configuredModel = providerConfig?.models?.find((candidate) => candidate.id === modelId);
   const providerHeaders = sanitizeModelHeaders(providerConfig?.headers, {
     stripSecretRefMarkers: true,
diff --git a/src/agents/tools/image-tool.test.ts b/src/agents/tools/image-tool.test.ts
index c48a705dc01..27d662db01f 100644
--- a/src/agents/tools/image-tool.test.ts
+++ b/src/agents/tools/image-tool.test.ts
@@ -466,6 +466,73 @@ describe("image tool implicit imageModel config", () => {
     });
   });
 
+  it("uses explicitly configured openrouter imageModel.primary with image input (#44648)", async () => {
+    await withTempAgentDir(async (agentDir) => {
+      vi.stubEnv("OPENROUTER_API_KEY", "openrouter-test");
+      const fetch = stubOpenAiCompletionsOkFetch("ok openrouter");
+      const cfg: OpenClawConfig = {
+        agents: {
+          defaults: {
+            model: { primary: "openrouter/deepseek/deepseek-chat" },
+            imageModel: { primary: "openrouter/openai/gpt-4o" },
+          },
+        },
+        models: {
+          providers: {
+            openrouter: {
+              api: "openai-completions",
+              baseUrl: "https://openrouter.ai/api/v1",
+              models: [
+                makeModelDefinition("openai/gpt-4o", ["text", "image"]),
+                makeModelDefinition("deepseek/deepseek-chat", ["text"]),
+              ],
+            },
+          },
+        },
+      };
+
+      const tool = requireImageTool(createImageTool({ config: cfg, agentDir }));
+      const result = await tool.execute("t1", {
+        prompt: "Describe this image.",
+        image: `data:image/png;base64,${ONE_PIXEL_PNG_B64}`,
+      });
+
+      expect(fetch).toHaveBeenCalledTimes(1);
+      const [url] = fetch.mock.calls[0] as [unknown];
+      expect(String(url)).toBe("https://openrouter.ai/api/v1/chat/completions");
+      expect(result.content).toEqual(
+        expect.arrayContaining([expect.objectContaining({ type: "text", text: "ok openrouter" })]),
+      );
+    });
+  });
+
+  it("resolves OpenRouter vision models by ID pattern heuristic (#44648)", async () => {
+    await withTempAgentDir(async (agentDir) => {
+      vi.stubEnv("OPENROUTER_API_KEY", "openrouter-test");
+      const fetch = stubOpenAiCompletionsOkFetch("ok vision");
+      // No explicit model config - relies on ID pattern detection
+      const cfg: OpenClawConfig = {
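+        // Assumed here: "anthropic/claude-3-opus" matches the /claude-3/ vision
+        // pattern in isLikelyVisionModel, so no models entry is required.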
+        agents: {
+          defaults: {
+            model: { primary: "openrouter/some-text-model" },
+            imageModel: { primary: "openrouter/anthropic/claude-3-opus" },
+          },
+        },
+      };
+
+      const tool = requireImageTool(createImageTool({ config: cfg, agentDir }));
+      const result = await tool.execute("t1", {
+        prompt: "Describe this image.",
+        image: `data:image/png;base64,${ONE_PIXEL_PNG_B64}`,
+      });
+
+      expect(fetch).toHaveBeenCalledTimes(1);
+      expect(result.content).toEqual(
+        expect.arrayContaining([expect.objectContaining({ type: "text", text: "ok vision" })]),
+      );
+    });
+  });
+
   it("exposes an Anthropic-safe image schema without union keywords", async () => {
     await withMinimaxImageToolFromTempAgentDir(async (tool) => {
       const violations = findSchemaUnionKeywords(tool.parameters, "image.parameters");