diff --git a/extensions/xiaomi/index.ts b/extensions/xiaomi/index.ts
index def263b1cda..8abce6866d1 100644
--- a/extensions/xiaomi/index.ts
+++ b/extensions/xiaomi/index.ts
@@ -2,6 +2,7 @@ import { definePluginEntry } from "openclaw/plugin-sdk/core";
 import { createProviderApiKeyAuthMethod } from "openclaw/plugin-sdk/provider-auth";
 import { buildSingleProviderApiKeyCatalog } from "openclaw/plugin-sdk/provider-catalog";
 import { PROVIDER_LABELS } from "openclaw/plugin-sdk/provider-usage";
+import { xiaomiMediaUnderstandingProvider } from "./media-understanding-provider.js";
 import { applyXiaomiConfig, XIAOMI_DEFAULT_MODEL_REF } from "./onboard.js";
 import { buildXiaomiProvider } from "./provider-catalog.js";
 
@@ -60,5 +61,6 @@ export default definePluginEntry({
         windows: [],
       }),
     });
+    api.registerMediaUnderstandingProvider(xiaomiMediaUnderstandingProvider);
   },
 });
diff --git a/extensions/xiaomi/media-understanding-provider.ts b/extensions/xiaomi/media-understanding-provider.ts
new file mode 100644
index 00000000000..4ec060e039a
--- /dev/null
+++ b/extensions/xiaomi/media-understanding-provider.ts
@@ -0,0 +1,152 @@
+import {
+  describeImageWithModel,
+  describeImagesWithModel,
+  type MediaUnderstandingProvider,
+  type VideoDescriptionRequest,
+  type VideoDescriptionResult,
+  assertOkOrThrowHttpError,
+  normalizeBaseUrl,
+  postJsonRequest,
+} from "openclaw/plugin-sdk/media-understanding";
+
+/** Default Xiaomi API base URL, handed to `normalizeBaseUrl` with `params.baseUrl`. */
+export const DEFAULT_XIAOMI_VIDEO_BASE_URL = "https://api.xiaomimimo.com/v1";
+/** Model id used when the request's model is missing or blank. */
+const DEFAULT_XIAOMI_VIDEO_MODEL = "mimo-v2-omni";
+/** Prompt used when the request's prompt is missing or blank. */
+const DEFAULT_XIAOMI_VIDEO_PROMPT = "Describe the video.";
+
+/** Subset of the chat-completions response body that this module reads. */
+type XiaomiVideoPayload = {
+  choices?: Array<{
+    message?: {
+      content?: string | Array<{ text?: string }>;
+      reasoning_content?: string;
+    };
+  }>;
+};
+
+/** Returns the trimmed model id, or the default when it is missing or blank. */
+function resolveModel(model?: string): string {
+  const trimmed = model?.trim();
+  return trimmed || DEFAULT_XIAOMI_VIDEO_MODEL;
+}
+
+/** Returns the trimmed prompt, or the default when it is missing or blank. */
+function resolvePrompt(prompt?: string): string {
+  const trimmed = prompt?.trim();
+  return trimmed || DEFAULT_XIAOMI_VIDEO_PROMPT;
+}
+
+/**
+ * Extracts the description text from the first choice of a response.
+ *
+ * Tries, in order: a non-blank string `content`, the non-blank `text` parts of
+ * an array `content` joined with newlines, then a non-blank `reasoning_content`.
+ *
+ * @returns The trimmed text, or `null` when none of those yields any text.
+ */
+function coerceResponseText(payload: XiaomiVideoPayload): string | null {
+  const message = payload.choices?.[0]?.message;
+  if (!message) {
+    return null;
+  }
+  if (typeof message.content === "string" && message.content.trim()) {
+    return message.content.trim();
+  }
+  if (Array.isArray(message.content)) {
+    const text = message.content
+      .map((part) => (typeof part.text === "string" ? part.text.trim() : ""))
+      .filter(Boolean)
+      .join("\n")
+      .trim();
+    if (text) {
+      return text;
+    }
+  }
+  if (typeof message.reasoning_content === "string" && message.reasoning_content.trim()) {
+    return message.reasoning_content.trim();
+  }
+  return null;
+}
+
+/**
+ * Describes a video by posting it to `${baseUrl}/chat/completions`.
+ *
+ * The video buffer is embedded in the request body as a base64 `data:` URL
+ * whose MIME type defaults to `video/mp4`. Caller-supplied `content-type` and
+ * `authorization` headers are kept; otherwise JSON and a bearer token built
+ * from `params.apiKey` are used.
+ *
+ * @param params - Video buffer plus optional model, prompt, MIME type, base
+ *   URL, headers, timeout and fetch implementation.
+ * @returns The extracted description text and the model id that was sent.
+ * @throws Error when the response contains no usable text. HTTP status
+ *   handling is delegated to `assertOkOrThrowHttpError`.
+ */
+export async function describeXiaomiVideo(
+  params: VideoDescriptionRequest,
+): Promise<VideoDescriptionResult> {
+  const fetchFn = params.fetchFn ?? fetch;
+  const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_XIAOMI_VIDEO_BASE_URL);
+  const model = resolveModel(params.model);
+  const mime = params.mime ?? "video/mp4";
+  const prompt = resolvePrompt(params.prompt);
+  const url = `${baseUrl}/chat/completions`;
+
+  const headers = new Headers(params.headers);
+  if (!headers.has("content-type")) {
+    headers.set("content-type", "application/json");
+  }
+  if (!headers.has("authorization")) {
+    headers.set("authorization", `Bearer ${params.apiKey}`);
+  }
+
+  const body = {
+    model,
+    messages: [
+      {
+        role: "user",
+        content: [
+          { type: "text", text: prompt },
+          {
+            type: "video_url",
+            video_url: {
+              url: `data:${mime};base64,${params.buffer.toString("base64")}`,
+            },
+          },
+        ],
+      },
+    ],
+  };
+
+  const { response: res, release } = await postJsonRequest({
+    url,
+    headers,
+    body,
+    timeoutMs: params.timeoutMs,
+    fetchFn,
+  });
+
+  try {
+    await assertOkOrThrowHttpError(res, "Xiaomi video description failed");
+    const payload = (await res.json()) as XiaomiVideoPayload;
+    const text = coerceResponseText(payload);
+    if (!text) {
+      throw new Error("Xiaomi video description response missing content");
+    }
+    return { text, model };
+  } finally {
+    // Call `release` on every exit path, including the throws above.
+    await release();
+  }
+}
+
+/** Xiaomi provider: SDK image describers plus `describeXiaomiVideo` for video. */
+export const xiaomiMediaUnderstandingProvider: MediaUnderstandingProvider = {
+  id: "xiaomi",
+  capabilities: ["image", "video"],
+  describeImage: describeImageWithModel,
+  describeImages: describeImagesWithModel,
+  describeVideo: describeXiaomiVideo,
+};