diff --git a/src/media-understanding/providers/index.test.ts b/src/media-understanding/providers/index.test.ts
index 31bc041a608..d40e4ac6082 100644
--- a/src/media-understanding/providers/index.test.ts
+++ b/src/media-understanding/providers/index.test.ts
@@ -60,4 +60,32 @@ describe("media-understanding provider registry", () => {
 
     expect(provider?.id).toBe("google");
   });
+
+  it("auto-registers media-understanding for config providers with image-capable models (#51392)", () => {
+    const cfg = {
+      models: {
+        providers: {
+          glm: {
+            models: [{ id: "glm-4.6v", input: ["text", "image"] }],
+          },
+          textOnly: {
+            models: [{ id: "text-model", input: ["text"] }],
+          },
+          malformed: {
+            models: "glm-4.6v",
+          },
+        },
+      },
+    } as never;
+    const registry = buildMediaUnderstandingRegistry(undefined, cfg);
+    const glmProvider = getMediaUnderstandingProvider("glm", registry);
+    const textOnlyProvider = getMediaUnderstandingProvider("textOnly", registry);
+
+    expect(glmProvider?.id).toBe("glm");
+    expect(glmProvider?.capabilities).toEqual(["image"]);
+    expect(glmProvider?.describeImage).toBeDefined();
+    expect(glmProvider?.describeImages).toBeDefined();
+    expect(textOnlyProvider).toBeUndefined();
+    expect(getMediaUnderstandingProvider("malformed", registry)).toBeUndefined();
+  });
 });
diff --git a/src/media-understanding/providers/index.ts b/src/media-understanding/providers/index.ts
index 521d55caee1..09bf033d5dc 100644
--- a/src/media-understanding/providers/index.ts
+++ b/src/media-understanding/providers/index.ts
@@ -5,6 +5,10 @@ import { getActivePluginRegistry } from "../../plugins/runtime.js";
 import type { MediaUnderstandingProvider } from "../types.js";
 import { deepgramProvider } from "./deepgram/index.js";
 import { groqProvider } from "./groq/index.js";
+import {
+  describeImageWithModel,
+  describeImagesWithModel,
+} from "./image.js";
 
 const PROVIDERS: MediaUnderstandingProvider[] = [groqProvider, deepgramProvider];
 
@@ -48,6 +52,32 @@ export function buildMediaUnderstandingRegistry(
   for (const entry of pluginRegistry?.mediaUnderstandingProviders ?? []) {
     mergeProviderIntoRegistry(registry, entry.provider);
   }
+  // Auto-register an image-only media-understanding provider for every config
+  // provider that declares at least one model accepting "image" input (#51392).
+  // Keys already present in the registry (e.g. plugin providers merged above)
+  // are left untouched; `overrides` are still applied after this step.
+  const configProviders = cfg?.models?.providers;
+  if (configProviders && typeof configProviders === "object") {
+    for (const [providerKey, providerCfg] of Object.entries(configProviders)) {
+      if (!providerKey?.trim()) continue;
+      const normalizedKey = normalizeMediaProviderId(providerKey);
+      if (registry.has(normalizedKey)) continue;
+      // Config is user-supplied: skip a missing or non-array `models` instead of throwing.
+      const models = (providerCfg as { models?: Array<{ input?: string[] }> })?.models;
+      const hasImageModel =
+        Array.isArray(models) &&
+        models.some((m) => Array.isArray(m?.input) && m.input.includes("image"));
+      if (hasImageModel) {
+        const autoProvider: MediaUnderstandingProvider = {
+          id: normalizedKey,
+          capabilities: ["image"],
+          describeImage: describeImageWithModel,
+          describeImages: describeImagesWithModel,
+        };
+        mergeProviderIntoRegistry(registry, autoProvider);
+      }
+    }
+  }
   if (overrides) {
     for (const [key, provider] of Object.entries(overrides)) {
       const normalizedKey = normalizeMediaProviderId(key);