From d4cb9170923021b3bad69517ce90fb40a7b786c2 Mon Sep 17 00:00:00 2001
From: Jinhao Dong
Date: Wed, 18 Mar 2026 20:23:23 +0800
Subject: [PATCH] feat(xiaomi): add media understanding provider for image and video

Add Xiaomi media understanding provider with image and video
capabilities using MiMo V2 Omni. Enables automatic media description
when users send image or video attachments.
---
 extensions/xiaomi/index.ts                    |   2 +
 .../xiaomi/media-understanding-provider.ts    | 158 ++++++++++++++++++
 2 files changed, 160 insertions(+)
 create mode 100644 extensions/xiaomi/media-understanding-provider.ts

diff --git a/extensions/xiaomi/index.ts b/extensions/xiaomi/index.ts
index def263b1cda..8abce6866d1 100644
--- a/extensions/xiaomi/index.ts
+++ b/extensions/xiaomi/index.ts
@@ -2,6 +2,7 @@ import { definePluginEntry } from "openclaw/plugin-sdk/core";
 import { createProviderApiKeyAuthMethod } from "openclaw/plugin-sdk/provider-auth";
 import { buildSingleProviderApiKeyCatalog } from "openclaw/plugin-sdk/provider-catalog";
 import { PROVIDER_LABELS } from "openclaw/plugin-sdk/provider-usage";
+import { xiaomiMediaUnderstandingProvider } from "./media-understanding-provider.js";
 import { applyXiaomiConfig, XIAOMI_DEFAULT_MODEL_REF } from "./onboard.js";
 import { buildXiaomiProvider } from "./provider-catalog.js";
 
@@ -60,5 +61,6 @@ export default definePluginEntry({
         windows: [],
       }),
     });
+    api.registerMediaUnderstandingProvider(xiaomiMediaUnderstandingProvider);
   },
 });
diff --git a/extensions/xiaomi/media-understanding-provider.ts b/extensions/xiaomi/media-understanding-provider.ts
new file mode 100644
index 00000000000..4ec060e039a
--- /dev/null
+++ b/extensions/xiaomi/media-understanding-provider.ts
@@ -0,0 +1,158 @@
+import {
+  describeImageWithModel,
+  describeImagesWithModel,
+  type MediaUnderstandingProvider,
+  type VideoDescriptionRequest,
+  type VideoDescriptionResult,
+  assertOkOrThrowHttpError,
+  normalizeBaseUrl,
+  postJsonRequest,
+} from "openclaw/plugin-sdk/media-understanding";
+
+/** Base URL used for Xiaomi video requests when the caller does not supply one. */
+export const DEFAULT_XIAOMI_VIDEO_BASE_URL = "https://api.xiaomimimo.com/v1";
+/** Model id used when the request carries no (or a blank) model. */
+const DEFAULT_XIAOMI_VIDEO_MODEL = "mimo-v2-omni";
+/** Prompt used when the request carries no (or a blank) prompt. */
+const DEFAULT_XIAOMI_VIDEO_PROMPT = "Describe the video.";
+
+/**
+ * The subset of the chat-completions response body that this module reads.
+ * Every field is optional because the payload is cast from untyped JSON.
+ */
+type XiaomiVideoPayload = {
+  choices?: Array<{
+    message?: {
+      content?: string | Array<{ text?: string }>;
+      reasoning_content?: string;
+    };
+  }>;
+};
+
+/** Returns the trimmed model id, or the default model when it is missing or blank. */
+function resolveModel(model?: string): string {
+  const trimmed = model?.trim();
+  return trimmed || DEFAULT_XIAOMI_VIDEO_MODEL;
+}
+
+/** Returns the trimmed prompt, or the default prompt when it is missing or blank. */
+function resolvePrompt(prompt?: string): string {
+  const trimmed = prompt?.trim();
+  return trimmed || DEFAULT_XIAOMI_VIDEO_PROMPT;
+}
+
+/**
+ * Extracts the description text from the first choice of a response payload.
+ *
+ * Sources are tried in order: a non-blank string `content`, then the non-blank
+ * `text` parts of an array `content` joined with newlines, then a non-blank
+ * `reasoning_content`.
+ *
+ * @returns The trimmed text, or `null` when none of the sources yields any.
+ */
+function coerceResponseText(payload: XiaomiVideoPayload): string | null {
+  const message = payload.choices?.[0]?.message;
+  if (!message) {
+    return null;
+  }
+  if (typeof message.content === "string" && message.content.trim()) {
+    return message.content.trim();
+  }
+  if (Array.isArray(message.content)) {
+    const text = message.content
+      .map((part) => (typeof part.text === "string" ? part.text.trim() : ""))
+      .filter(Boolean)
+      .join("\n")
+      .trim();
+    if (text) {
+      return text;
+    }
+  }
+  // Last resort: use the reasoning text when no regular content came back.
+  if (typeof message.reasoning_content === "string" && message.reasoning_content.trim()) {
+    return message.reasoning_content.trim();
+  }
+  return null;
+}
+
+/**
+ * Describes a video by POSTing it, inlined as a base64 data URL, to the
+ * `/chat/completions` endpoint under the resolved base URL.
+ *
+ * @param params - Video buffer plus optional model, prompt, MIME type, base URL,
+ *   headers, timeout and fetch implementation. Caller-supplied `content-type`
+ *   and `authorization` headers win; otherwise JSON and `Bearer <apiKey>` are set.
+ * @returns The description text together with the model id that was requested.
+ * @throws Error when the response contains no usable text. A non-OK response is
+ *   handed to `assertOkOrThrowHttpError`, which presumably throws as well.
+ */
+export async function describeXiaomiVideo(
+  params: VideoDescriptionRequest,
+): Promise<VideoDescriptionResult> {
+  const fetchFn = params.fetchFn ?? fetch;
+  const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_XIAOMI_VIDEO_BASE_URL);
+  const model = resolveModel(params.model);
+  const mime = params.mime ?? "video/mp4";
+  const prompt = resolvePrompt(params.prompt);
+  const url = `${baseUrl}/chat/completions`;
+
+  const headers = new Headers(params.headers);
+  if (!headers.has("content-type")) {
+    headers.set("content-type", "application/json");
+  }
+  if (!headers.has("authorization")) {
+    headers.set("authorization", `Bearer ${params.apiKey}`);
+  }
+
+  // The video is sent inline as a base64 data URL rather than by reference.
+  const body = {
+    model,
+    messages: [
+      {
+        role: "user",
+        content: [
+          { type: "text", text: prompt },
+          {
+            type: "video_url",
+            video_url: {
+              url: `data:${mime};base64,${params.buffer.toString("base64")}`,
+            },
+          },
+        ],
+      },
+    ],
+  };
+
+  const { response: res, release } = await postJsonRequest({
+    url,
+    headers,
+    body,
+    timeoutMs: params.timeoutMs,
+    fetchFn,
+  });
+
+  try {
+    await assertOkOrThrowHttpError(res, "Xiaomi video description failed");
+    const payload = (await res.json()) as XiaomiVideoPayload;
+    const text = coerceResponseText(payload);
+    if (!text) {
+      throw new Error("Xiaomi video description response missing content");
+    }
+    return { text, model };
+  } finally {
+    // NOTE(review): release() presumably frees what postJsonRequest holds open — confirm.
+    await release();
+  }
+}
+
+/**
+ * Xiaomi media-understanding provider: image requests are delegated to the SDK's
+ * `describeImageWithModel` / `describeImagesWithModel`, video requests to
+ * {@link describeXiaomiVideo}.
+ */
+export const xiaomiMediaUnderstandingProvider: MediaUnderstandingProvider = {
+  id: "xiaomi",
+  capabilities: ["image", "video"],
+  describeImage: describeImageWithModel,
+  describeImages: describeImagesWithModel,
+  describeVideo: describeXiaomiVideo,
+};