2026-03-16 22:58:55 -07:00

147 lines
4.2 KiB
TypeScript

import fs from "node:fs/promises";
import path from "node:path";
import type { MsgContext } from "../auto-reply/templating.js";
import type { OpenClawConfig } from "../config/config.js";
import { getMediaUnderstandingProvider } from "./providers/index.js";
import {
buildProviderRegistry,
createMediaAttachmentCache,
normalizeMediaAttachments,
runCapability,
type ActiveMediaModel,
} from "./runner.js";
import type { MediaUnderstandingCapability, MediaUnderstandingOutput } from "./types.js";
const KIND_BY_CAPABILITY: Record<MediaUnderstandingCapability, MediaUnderstandingOutput["kind"]> = {
audio: "audio.transcription",
image: "image.description",
video: "video.description",
};
export type RunMediaUnderstandingFileParams = {
capability: MediaUnderstandingCapability;
filePath: string;
cfg: OpenClawConfig;
agentDir?: string;
mime?: string;
activeModel?: ActiveMediaModel;
};
export type RunMediaUnderstandingFileResult = {
text: string | undefined;
provider?: string;
model?: string;
output?: MediaUnderstandingOutput;
};
function buildFileContext(params: { filePath: string; mime?: string }): MsgContext {
return {
MediaPath: params.filePath,
MediaType: params.mime,
};
}
export async function runMediaUnderstandingFile(
params: RunMediaUnderstandingFileParams,
): Promise<RunMediaUnderstandingFileResult> {
const ctx = buildFileContext(params);
const attachments = normalizeMediaAttachments(ctx);
if (attachments.length === 0) {
return { text: undefined };
}
const providerRegistry = buildProviderRegistry(undefined, params.cfg);
const cache = createMediaAttachmentCache(attachments, {
localPathRoots: [path.dirname(params.filePath)],
});
try {
const result = await runCapability({
capability: params.capability,
cfg: params.cfg,
ctx,
attachments: cache,
media: attachments,
agentDir: params.agentDir,
providerRegistry,
config: params.cfg.tools?.media?.[params.capability],
activeModel: params.activeModel,
});
const output = result.outputs.find(
(entry) => entry.kind === KIND_BY_CAPABILITY[params.capability],
);
const text = output?.text?.trim();
return {
text: text || undefined,
provider: output?.provider,
model: output?.model,
output,
};
} finally {
await cache.cleanup();
}
}
export async function describeImageFile(params: {
filePath: string;
cfg: OpenClawConfig;
agentDir?: string;
mime?: string;
activeModel?: ActiveMediaModel;
}): Promise<RunMediaUnderstandingFileResult> {
return await runMediaUnderstandingFile({ ...params, capability: "image" });
}
export async function describeImageFileWithModel(params: {
filePath: string;
cfg: OpenClawConfig;
agentDir?: string;
mime?: string;
provider: string;
model: string;
prompt: string;
maxTokens?: number;
timeoutMs?: number;
}) {
const timeoutMs = params.timeoutMs ?? 30_000;
const providerRegistry = buildProviderRegistry(undefined, params.cfg);
const provider = getMediaUnderstandingProvider(params.provider, providerRegistry);
if (!provider?.describeImage) {
throw new Error(`Provider does not support image analysis: ${params.provider}`);
}
const buffer = await fs.readFile(params.filePath);
return await provider.describeImage({
buffer,
fileName: path.basename(params.filePath),
mime: params.mime,
provider: params.provider,
model: params.model,
prompt: params.prompt,
maxTokens: params.maxTokens,
timeoutMs,
cfg: params.cfg,
agentDir: params.agentDir ?? "",
});
}
export async function describeVideoFile(params: {
filePath: string;
cfg: OpenClawConfig;
agentDir?: string;
mime?: string;
activeModel?: ActiveMediaModel;
}): Promise<RunMediaUnderstandingFileResult> {
return await runMediaUnderstandingFile({ ...params, capability: "video" });
}
export async function transcribeAudioFile(params: {
filePath: string;
cfg: OpenClawConfig;
agentDir?: string;
mime?: string;
activeModel?: ActiveMediaModel;
}): Promise<{ text: string | undefined }> {
const result = await runMediaUnderstandingFile({ ...params, capability: "audio" });
return { text: result.text };
}