diff --git a/src/extension-host/media-runtime-execution.ts b/src/extension-host/media-runtime-execution.ts
new file mode 100644
index 00000000000..83cdbdbb87f
--- /dev/null
+++ b/src/extension-host/media-runtime-execution.ts
@@ -0,0 +1,678 @@
+import fs from "node:fs/promises";
+import path from "node:path";
+import {
+  collectProviderApiKeysForExecution,
+  executeWithApiKeyRotation,
+} from "../agents/api-key-rotation.js";
+import { requireApiKey, resolveApiKeyForProvider } from "../agents/model-auth.js";
+import type { MsgContext } from "../auto-reply/templating.js";
+import { applyTemplate } from "../auto-reply/templating.js";
+import type { OpenClawConfig } from "../config/config.js";
+import type {
+  MediaUnderstandingConfig,
+  MediaUnderstandingModelConfig,
+} from "../config/types.tools.js";
+import {
+  getExtensionHostMediaUnderstandingProvider,
+  normalizeExtensionHostMediaProviderId,
+} from "../extension-host/media-runtime-registry.js";
+import { logVerbose, shouldLogVerbose } from "../globals.js";
+import { resolveProxyFetchFromEnv } from "../infra/net/proxy-fetch.js";
+import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
+import { MediaAttachmentCache } from "../media-understanding/attachments.js";
+import {
+  CLI_OUTPUT_MAX_BUFFER,
+  DEFAULT_AUDIO_MODELS,
+  DEFAULT_TIMEOUT_SECONDS,
+  MIN_AUDIO_FILE_BYTES,
+} from "../media-understanding/defaults.js";
+import { MediaUnderstandingSkipError } from "../media-understanding/errors.js";
+import { fileExists } from "../media-understanding/fs.js";
+import { extractGeminiResponse } from "../media-understanding/output-extract.js";
+import {
+  resolveMaxBytes,
+  resolveMaxChars,
+  resolvePrompt,
+  resolveTimeoutMs,
+} from "../media-understanding/resolve.js";
+import type {
+  MediaUnderstandingCapability,
+  MediaUnderstandingDecision,
+  MediaUnderstandingModelDecision,
+  MediaUnderstandingOutput,
+  MediaUnderstandingProvider,
+} from "../media-understanding/types.js";
+import { estimateBase64Size, resolveVideoMaxBase64Bytes } from "../media-understanding/video.js";
+import { runExec } from "../process/exec.js";
+
+export type ProviderRegistry = Map<string, MediaUnderstandingProvider>;
+
+function sanitizeProviderHeaders(
+  headers: Record<string, unknown> | undefined,
+): Record<string, string> | undefined {
+  if (!headers) {
+    return undefined;
+  }
+  const next: Record<string, string> = {};
+  for (const [key, value] of Object.entries(headers)) {
+    if (typeof value !== "string") {
+      continue;
+    }
+    // Intentionally preserve marker-shaped values here. This path handles
+    // explicit config/runtime provider headers, where literal values may
+    // legitimately match marker patterns; discovered models.json entries are
+    // sanitized separately in the model registry path.
+    next[key] = value;
+  }
+  return Object.keys(next).length > 0 ? next : undefined;
+}
+
+function trimOutput(text: string, maxChars?: number): string {
+  const trimmed = text.trim();
+  if (!maxChars || trimmed.length <= maxChars) {
+    return trimmed;
+  }
+  return trimmed.slice(0, maxChars).trim();
+}
+
+function extractSherpaOnnxText(raw: string): string | null {
+  const tryParse = (value: string): string | null => {
+    const trimmed = value.trim();
+    if (!trimmed) {
+      return null;
+    }
+    const head = trimmed[0];
+    if (head !== "{" && head !== '"') {
+      return null;
+    }
+    try {
+      const parsed = JSON.parse(trimmed) as unknown;
+      if (typeof parsed === "string") {
+        return tryParse(parsed);
+      }
+      if (parsed && typeof parsed === "object") {
+        const text = (parsed as { text?: unknown }).text;
+        if (typeof text === "string" && text.trim()) {
+          return text.trim();
+        }
+      }
+    } catch {}
+    return null;
+  };
+
+  const direct = tryParse(raw);
+  if (direct) {
+    return direct;
+  }
+
+  const lines = raw
+    .split("\n")
+    .map((line) => line.trim())
+    .filter(Boolean);
+  for (let i = lines.length - 1; i >= 0; i -= 1) {
+    const parsed = tryParse(lines[i] ?? "");
+    if (parsed) {
+      return parsed;
+    }
+  }
+  return null;
+}
+
+function commandBase(command: string): string {
+  return path.parse(command).name;
+}
+
+function findArgValue(args: string[], keys: string[]): string | undefined {
+  for (let i = 0; i < args.length; i += 1) {
+    if (keys.includes(args[i] ?? "")) {
+      const value = args[i + 1];
+      if (value) {
+        return value;
+      }
+    }
+  }
+  return undefined;
+}
+
+function hasArg(args: string[], keys: string[]): boolean {
+  return args.some((arg) => keys.includes(arg));
+}
+
+function resolveWhisperOutputPath(args: string[], mediaPath: string): string | null {
+  const outputDir = findArgValue(args, ["--output_dir", "-o"]);
+  const outputFormat = findArgValue(args, ["--output_format"]);
+  if (!outputDir || !outputFormat) {
+    return null;
+  }
+  const formats = outputFormat.split(",").map((value) => value.trim());
+  if (!formats.includes("txt")) {
+    return null;
+  }
+  const base = path.parse(mediaPath).name;
+  return path.join(outputDir, `${base}.txt`);
+}
+
+function resolveWhisperCppOutputPath(args: string[]): string | null {
+  if (!hasArg(args, ["-otxt", "--output-txt"])) {
+    return null;
+  }
+  const outputBase = findArgValue(args, ["-of", "--output-file"]);
+  if (!outputBase) {
+    return null;
+  }
+  return `${outputBase}.txt`;
+}
+
+function resolveParakeetOutputPath(args: string[], mediaPath: string): string | null {
+  const outputDir = findArgValue(args, ["--output-dir"]);
+  const outputFormat = findArgValue(args, ["--output-format"]);
+  if (!outputDir) {
+    return null;
+  }
+  if (outputFormat && outputFormat !== "txt") {
+    return null;
+  }
+  const base = path.parse(mediaPath).name;
+  return path.join(outputDir, `${base}.txt`);
+}
+
+async function resolveCliOutput(params: {
+  command: string;
+  args: string[];
+  stdout: string;
+  mediaPath: string;
+}): Promise<string> {
+  const commandId = commandBase(params.command);
+  const fileOutput =
+    commandId === "whisper-cli"
+      ? resolveWhisperCppOutputPath(params.args)
+      : commandId === "whisper"
+        ? resolveWhisperOutputPath(params.args, params.mediaPath)
+        : commandId === "parakeet-mlx"
+          ? resolveParakeetOutputPath(params.args, params.mediaPath)
+          : null;
+  if (fileOutput && (await fileExists(fileOutput))) {
+    try {
+      const content = await fs.readFile(fileOutput, "utf8");
+      if (content.trim()) {
+        return content.trim();
+      }
+    } catch {}
+  }
+
+  if (commandId === "gemini") {
+    const response = extractGeminiResponse(params.stdout);
+    if (response) {
+      return response;
+    }
+  }
+
+  if (commandId === "sherpa-onnx-offline") {
+    const response = extractSherpaOnnxText(params.stdout);
+    if (response) {
+      return response;
+    }
+  }
+
+  return params.stdout.trim();
+}
+
+type ProviderQuery = Record<string, string | number | boolean>;
+
+function normalizeProviderQuery(
+  options?: Record<string, string | number | boolean>,
+): ProviderQuery | undefined {
+  if (!options) {
+    return undefined;
+  }
+  const query: ProviderQuery = {};
+  for (const [key, value] of Object.entries(options)) {
+    if (value === undefined) {
+      continue;
+    }
+    query[key] = value;
+  }
+  return Object.keys(query).length > 0 ? query : undefined;
+}
+
+function buildDeepgramCompatQuery(options?: {
+  detectLanguage?: boolean;
+  punctuate?: boolean;
+  smartFormat?: boolean;
+}): ProviderQuery | undefined {
+  if (!options) {
+    return undefined;
+  }
+  const query: ProviderQuery = {};
+  if (typeof options.detectLanguage === "boolean") {
+    query.detect_language = options.detectLanguage;
+  }
+  if (typeof options.punctuate === "boolean") {
+    query.punctuate = options.punctuate;
+  }
+  if (typeof options.smartFormat === "boolean") {
+    query.smart_format = options.smartFormat;
+  }
+  return Object.keys(query).length > 0 ? query : undefined;
+}
+
+function normalizeDeepgramQueryKeys(query: ProviderQuery): ProviderQuery {
+  const normalized = { ...query };
+  if ("detectLanguage" in normalized) {
+    normalized.detect_language = normalized.detectLanguage as boolean;
+    delete normalized.detectLanguage;
+  }
+  if ("smartFormat" in normalized) {
+    normalized.smart_format = normalized.smartFormat as boolean;
+    delete normalized.smartFormat;
+  }
+  return normalized;
+}
+
+function resolveProviderQuery(params: {
+  providerId: string;
+  config?: MediaUnderstandingConfig;
+  entry: MediaUnderstandingModelConfig;
+}): ProviderQuery | undefined {
+  const { providerId, config, entry } = params;
+  const mergedOptions = normalizeProviderQuery({
+    ...config?.providerOptions?.[providerId],
+    ...entry.providerOptions?.[providerId],
+  });
+  if (providerId !== "deepgram") {
+    return mergedOptions;
+  }
+  const query = normalizeDeepgramQueryKeys(mergedOptions ?? {});
+  const compat = buildDeepgramCompatQuery({ ...config?.deepgram, ...entry.deepgram });
+  for (const [key, value] of Object.entries(compat ?? {})) {
+    if (query[key] === undefined) {
+      query[key] = value;
+    }
+  }
+  return Object.keys(query).length > 0 ? query : undefined;
+}
+
+export function buildModelDecision(params: {
+  entry: MediaUnderstandingModelConfig;
+  entryType: "provider" | "cli";
+  outcome: MediaUnderstandingModelDecision["outcome"];
+  reason?: string;
+}): MediaUnderstandingModelDecision {
+  if (params.entryType === "cli") {
+    const command = params.entry.command?.trim();
+    return {
+      type: "cli",
+      provider: command ?? "cli",
+      model: params.entry.model ?? command,
+      outcome: params.outcome,
+      reason: params.reason,
+    };
+  }
+  const providerIdRaw = params.entry.provider?.trim();
+  const providerId = providerIdRaw
+    ? normalizeExtensionHostMediaProviderId(providerIdRaw)
+    : undefined;
+  return {
+    type: "provider",
+    provider: providerId ?? providerIdRaw,
+    model: params.entry.model,
+    outcome: params.outcome,
+    reason: params.reason,
+  };
+}
+
+function resolveEntryRunOptions(params: {
+  capability: MediaUnderstandingCapability;
+  entry: MediaUnderstandingModelConfig;
+  cfg: OpenClawConfig;
+  config?: MediaUnderstandingConfig;
+}): { maxBytes: number; maxChars?: number; timeoutMs: number; prompt: string } {
+  const { capability, entry, cfg } = params;
+  const maxBytes = resolveMaxBytes({ capability, entry, cfg, config: params.config });
+  const maxChars = resolveMaxChars({ capability, entry, cfg, config: params.config });
+  const timeoutMs = resolveTimeoutMs(
+    entry.timeoutSeconds ??
+      params.config?.timeoutSeconds ??
+      cfg.tools?.media?.[capability]?.timeoutSeconds,
+    DEFAULT_TIMEOUT_SECONDS[capability],
+  );
+  const prompt = resolvePrompt(
+    capability,
+    entry.prompt ?? params.config?.prompt ?? cfg.tools?.media?.[capability]?.prompt,
+    maxChars,
+  );
+  return { maxBytes, maxChars, timeoutMs, prompt };
+}
+
+async function resolveProviderExecutionAuth(params: {
+  providerId: string;
+  cfg: OpenClawConfig;
+  entry: MediaUnderstandingModelConfig;
+  agentDir?: string;
+}) {
+  const auth = await resolveApiKeyForProvider({
+    provider: params.providerId,
+    cfg: params.cfg,
+    profileId: params.entry.profile,
+    preferredProfile: params.entry.preferredProfile,
+    agentDir: params.agentDir,
+  });
+  return {
+    apiKeys: collectProviderApiKeysForExecution({
+      provider: params.providerId,
+      primaryApiKey: requireApiKey(auth, params.providerId),
+    }),
+    providerConfig: params.cfg.models?.providers?.[params.providerId],
+  };
+}
+
+async function resolveProviderExecutionContext(params: {
+  providerId: string;
+  cfg: OpenClawConfig;
+  entry: MediaUnderstandingModelConfig;
+  config?: MediaUnderstandingConfig;
+  agentDir?: string;
+}) {
+  const { apiKeys, providerConfig } = await resolveProviderExecutionAuth({
+    providerId: params.providerId,
+    cfg: params.cfg,
+    entry: params.entry,
+    agentDir: params.agentDir,
+  });
+  const baseUrl = params.entry.baseUrl ?? params.config?.baseUrl ?? providerConfig?.baseUrl;
+  const mergedHeaders = {
+    ...sanitizeProviderHeaders(providerConfig?.headers as Record<string, unknown> | undefined),
+    ...sanitizeProviderHeaders(params.config?.headers as Record<string, unknown> | undefined),
+    ...sanitizeProviderHeaders(params.entry.headers as Record<string, unknown> | undefined),
+  };
+  const headers = Object.keys(mergedHeaders).length > 0 ? mergedHeaders : undefined;
+  return { apiKeys, baseUrl, headers };
+}
+
+export function formatDecisionSummary(decision: MediaUnderstandingDecision): string {
+  const attachments = Array.isArray(decision.attachments) ? decision.attachments : [];
+  const total = attachments.length;
+  const success = attachments.filter((entry) => entry?.chosen?.outcome === "success").length;
+  const chosen = attachments.find((entry) => entry?.chosen)?.chosen;
+  const provider = typeof chosen?.provider === "string" ? chosen.provider.trim() : undefined;
+  const model = typeof chosen?.model === "string" ? chosen.model.trim() : undefined;
+  const modelLabel = provider ? (model ? `${provider}/${model}` : provider) : undefined;
+  const reason = attachments
+    .flatMap((entry) => {
+      const attempts = Array.isArray(entry?.attempts) ? entry.attempts : [];
+      return attempts
+        .map((attempt) => (typeof attempt?.reason === "string" ? attempt.reason : undefined))
+        .filter((value): value is string => Boolean(value));
+    })
+    .find((value) => value.trim().length > 0);
+  const shortReason = reason ? reason.split(":")[0]?.trim() : undefined;
+  const countLabel = total > 0 ? ` (${success}/${total})` : "";
+  const viaLabel = modelLabel ? ` via ${modelLabel}` : "";
+  const reasonLabel = shortReason ? ` reason=${shortReason}` : "";
+  return `${decision.capability}: ${decision.outcome}${countLabel}${viaLabel}${reasonLabel}`;
+}
+
+function assertMinAudioSize(params: { size: number; attachmentIndex: number }): void {
+  if (params.size >= MIN_AUDIO_FILE_BYTES) {
+    return;
+  }
+  throw new MediaUnderstandingSkipError(
+    "tooSmall",
+    `Audio attachment ${params.attachmentIndex + 1} is too small (${params.size} bytes, minimum ${MIN_AUDIO_FILE_BYTES})`,
+  );
+}
+
+export async function runProviderEntry(params: {
+  capability: MediaUnderstandingCapability;
+  entry: MediaUnderstandingModelConfig;
+  cfg: OpenClawConfig;
+  ctx: MsgContext;
+  attachmentIndex: number;
+  cache: MediaAttachmentCache;
+  agentDir?: string;
+  providerRegistry: ProviderRegistry;
+  config?: MediaUnderstandingConfig;
+}): Promise<MediaUnderstandingOutput | null> {
+  const { entry, capability, cfg } = params;
+  const providerIdRaw = entry.provider?.trim();
+  if (!providerIdRaw) {
+    throw new Error(`Provider entry missing provider for ${capability}`);
+  }
+  const providerId = normalizeExtensionHostMediaProviderId(providerIdRaw);
+  const { maxBytes, maxChars, timeoutMs, prompt } = resolveEntryRunOptions({
+    capability,
+    entry,
+    cfg,
+    config: params.config,
+  });
+
+  if (capability === "image") {
+    if (!params.agentDir) {
+      throw new Error("Image understanding requires agentDir");
+    }
+    const modelId = entry.model?.trim();
+    if (!modelId) {
+      throw new Error("Image understanding requires model id");
+    }
+    const media = await params.cache.getBuffer({
+      attachmentIndex: params.attachmentIndex,
+      maxBytes,
+      timeoutMs,
+    });
+    const provider = getExtensionHostMediaUnderstandingProvider(
+      providerId,
+      params.providerRegistry,
+    );
+    const imageInput = {
+      buffer: media.buffer,
+      fileName: media.fileName,
+      mime: media.mime,
+      model: modelId,
+      provider: providerId,
+      prompt,
+      timeoutMs,
+      profile: entry.profile,
+      preferredProfile: entry.preferredProfile,
+      agentDir: params.agentDir,
+      cfg: params.cfg,
+    };
+    const { describeImageWithModel } = await import("../media-understanding/providers/image.js");
+    const describeImage = provider?.describeImage ?? describeImageWithModel;
+    const result = await describeImage(imageInput);
+    return {
+      kind: "image.description",
+      attachmentIndex: params.attachmentIndex,
+      text: trimOutput(result.text, maxChars),
+      provider: providerId,
+      model: result.model ?? modelId,
+    };
+  }
+
+  const provider = getExtensionHostMediaUnderstandingProvider(providerId, params.providerRegistry);
+  if (!provider) {
+    throw new Error(`Media provider not available: ${providerId}`);
+  }
+
+  // Resolve proxy-aware fetch from env vars (HTTPS_PROXY, HTTP_PROXY, etc.)
+  // so provider HTTP calls are routed through the proxy when configured.
+  const fetchFn = resolveProxyFetchFromEnv();
+
+  if (capability === "audio") {
+    if (!provider.transcribeAudio) {
+      throw new Error(`Audio transcription provider "${providerId}" not available.`);
+    }
+    const transcribeAudio = provider.transcribeAudio;
+    const media = await params.cache.getBuffer({
+      attachmentIndex: params.attachmentIndex,
+      maxBytes,
+      timeoutMs,
+    });
+    assertMinAudioSize({ size: media.size, attachmentIndex: params.attachmentIndex });
+    const { apiKeys, baseUrl, headers } = await resolveProviderExecutionContext({
+      providerId,
+      cfg,
+      entry,
+      config: params.config,
+      agentDir: params.agentDir,
+    });
+    const providerQuery = resolveProviderQuery({
+      providerId,
+      config: params.config,
+      entry,
+    });
+    const model = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model;
+    const result = await executeWithApiKeyRotation({
+      provider: providerId,
+      apiKeys,
+      execute: async (apiKey) =>
+        transcribeAudio({
+          buffer: media.buffer,
+          fileName: media.fileName,
+          mime: media.mime,
+          apiKey,
+          baseUrl,
+          headers,
+          model,
+          language: entry.language ?? params.config?.language ?? cfg.tools?.media?.audio?.language,
+          prompt,
+          query: providerQuery,
+          timeoutMs,
+          fetchFn,
+        }),
+    });
+    return {
+      kind: "audio.transcription",
+      attachmentIndex: params.attachmentIndex,
+      text: trimOutput(result.text, maxChars),
+      provider: providerId,
+      model: result.model ?? model,
+    };
+  }
+
+  if (!provider.describeVideo) {
+    throw new Error(`Video understanding provider "${providerId}" not available.`);
+  }
+  const describeVideo = provider.describeVideo;
+  const media = await params.cache.getBuffer({
+    attachmentIndex: params.attachmentIndex,
+    maxBytes,
+    timeoutMs,
+  });
+  const estimatedBase64Bytes = estimateBase64Size(media.size);
+  const maxBase64Bytes = resolveVideoMaxBase64Bytes(maxBytes);
+  if (estimatedBase64Bytes > maxBase64Bytes) {
+    throw new MediaUnderstandingSkipError(
+      "maxBytes",
+      `Video attachment ${params.attachmentIndex + 1} base64 payload ${estimatedBase64Bytes} exceeds ${maxBase64Bytes}`,
+    );
+  }
+  const { apiKeys, baseUrl, headers } = await resolveProviderExecutionContext({
+    providerId,
+    cfg,
+    entry,
+    config: params.config,
+    agentDir: params.agentDir,
+  });
+  const result = await executeWithApiKeyRotation({
+    provider: providerId,
+    apiKeys,
+    execute: (apiKey) =>
+      describeVideo({
+        buffer: media.buffer,
+        fileName: media.fileName,
+        mime: media.mime,
+        apiKey,
+        baseUrl,
+        headers,
+        model: entry.model,
+        prompt,
+        timeoutMs,
+        fetchFn,
+      }),
+  });
+  return {
+    kind: "video.description",
+    attachmentIndex: params.attachmentIndex,
+    text: trimOutput(result.text, maxChars),
+    provider: providerId,
+    model: result.model ?? entry.model,
+  };
+}
+
+export async function runCliEntry(params: {
+  capability: MediaUnderstandingCapability;
+  entry: MediaUnderstandingModelConfig;
+  cfg: OpenClawConfig;
+  ctx: MsgContext;
+  attachmentIndex: number;
+  cache: MediaAttachmentCache;
+  config?: MediaUnderstandingConfig;
+}): Promise<MediaUnderstandingOutput | null> {
+  const { entry, capability, cfg, ctx } = params;
+  const command = entry.command?.trim();
+  const args = entry.args ?? [];
+  if (!command) {
+    throw new Error(`CLI entry missing command for ${capability}`);
+  }
+  const { maxBytes, maxChars, timeoutMs, prompt } = resolveEntryRunOptions({
+    capability,
+    entry,
+    cfg,
+    config: params.config,
+  });
+  const pathResult = await params.cache.getPath({
+    attachmentIndex: params.attachmentIndex,
+    maxBytes,
+    timeoutMs,
+  });
+  if (capability === "audio") {
+    const stat = await fs.stat(pathResult.path);
+    assertMinAudioSize({ size: stat.size, attachmentIndex: params.attachmentIndex });
+  }
+  const outputDir = await fs.mkdtemp(
+    path.join(resolvePreferredOpenClawTmpDir(), "openclaw-media-cli-"),
+  );
+  const mediaPath = pathResult.path;
+  const outputBase = path.join(outputDir, path.parse(mediaPath).name);
+
+  const templCtx: MsgContext = {
+    ...ctx,
+    MediaPath: mediaPath,
+    MediaDir: path.dirname(mediaPath),
+    OutputDir: outputDir,
+    OutputBase: outputBase,
+    Prompt: prompt,
+    MaxChars: maxChars,
+  };
+  const argv = [command, ...args].map((part, index) =>
+    index === 0 ? part : applyTemplate(part, templCtx),
+  );
+  try {
+    if (shouldLogVerbose()) {
+      logVerbose(`Media understanding via CLI: ${argv.join(" ")}`);
+    }
+    const { stdout } = await runExec(argv[0], argv.slice(1), {
+      timeoutMs,
+      maxBuffer: CLI_OUTPUT_MAX_BUFFER,
+    });
+    const resolved = await resolveCliOutput({
+      command,
+      args: argv.slice(1),
+      stdout,
+      mediaPath,
+    });
+    const text = trimOutput(resolved, maxChars);
+    if (!text) {
+      return null;
+    }
+    return {
+      kind: capability === "audio" ? "audio.transcription" : `${capability}.description`,
+      attachmentIndex: params.attachmentIndex,
+      text,
+      provider: "cli",
+      model: command,
+    };
+  } finally {
+    await fs.rm(outputDir, { recursive: true, force: true }).catch(() => {});
+  }
+}
diff --git a/src/media-understanding/providers/anthropic/index.ts b/src/media-understanding/providers/anthropic/index.ts
index 35ae04a921e..080bdbbdd48 100644
--- a/src/media-understanding/providers/anthropic/index.ts
+++ b/src/media-understanding/providers/anthropic/index.ts
@@ -1,8 +1,14 @@
 import type { MediaUnderstandingProvider } from "../../types.js";
-import { describeImageWithModel } from "../image.js";
+
+const describeImage = async (
+  ...args: Parameters<NonNullable<MediaUnderstandingProvider["describeImage"]>>
+) => {
+  const { describeImageWithModel } = await import("../image.js");
+  return describeImageWithModel(...args);
+};
 
 export const anthropicProvider: MediaUnderstandingProvider = {
   id: "anthropic",
   capabilities: ["image"],
-  describeImage: describeImageWithModel,
+  describeImage,
 };
diff --git a/src/media-understanding/providers/google/index.ts b/src/media-understanding/providers/google/index.ts
index 50674aac396..2edbdc3c140 100644
--- a/src/media-understanding/providers/google/index.ts
+++ b/src/media-understanding/providers/google/index.ts
@@ -1,12 +1,18 @@
 import type { MediaUnderstandingProvider } from "../../types.js";
-import { describeImageWithModel } from "../image.js";
 import { transcribeGeminiAudio } from "./audio.js";
 import { describeGeminiVideo } from "./video.js";
 
+const describeImage = async (
+  ...args: Parameters<NonNullable<MediaUnderstandingProvider["describeImage"]>>
+) => {
+  const { describeImageWithModel } = await import("../image.js");
+  return describeImageWithModel(...args);
+};
+
 export const googleProvider: MediaUnderstandingProvider = {
   id: "google",
   capabilities: ["image", "audio", "video"],
-  describeImage: describeImageWithModel,
+  describeImage,
   transcribeAudio: transcribeGeminiAudio,
   describeVideo: describeGeminiVideo,
 };
diff --git a/src/media-understanding/providers/minimax/index.ts b/src/media-understanding/providers/minimax/index.ts
index c9a7936f4d3..bc87d4e24d1 100644
--- a/src/media-understanding/providers/minimax/index.ts
+++ b/src/media-understanding/providers/minimax/index.ts
@@ -1,14 +1,20 @@
 import type { MediaUnderstandingProvider } from "../../types.js";
-import { describeImageWithModel } from "../image.js";
+
+const describeImage = async (
+  ...args: Parameters<NonNullable<MediaUnderstandingProvider["describeImage"]>>
+) => {
+  const { describeImageWithModel } = await import("../image.js");
+  return describeImageWithModel(...args);
+};
 
 export const minimaxProvider: MediaUnderstandingProvider = {
   id: "minimax",
   capabilities: ["image"],
-  describeImage: describeImageWithModel,
+  describeImage,
 };
 
 export const minimaxPortalProvider: MediaUnderstandingProvider = {
   id: "minimax-portal",
   capabilities: ["image"],
-  describeImage: describeImageWithModel,
+  describeImage,
 };
diff --git a/src/media-understanding/providers/moonshot/index.ts b/src/media-understanding/providers/moonshot/index.ts
index 78a525129dc..abca7d2bcce 100644
--- a/src/media-understanding/providers/moonshot/index.ts
+++ b/src/media-understanding/providers/moonshot/index.ts
@@ -1,10 +1,16 @@
 import type { MediaUnderstandingProvider } from "../../types.js";
-import { describeImageWithModel } from "../image.js";
 import { describeMoonshotVideo } from "./video.js";
 
+const describeImage = async (
+  ...args: Parameters<NonNullable<MediaUnderstandingProvider["describeImage"]>>
+) => {
+  const { describeImageWithModel } = await import("../image.js");
+  return describeImageWithModel(...args);
+};
+
 export const moonshotProvider: MediaUnderstandingProvider = {
   id: "moonshot",
   capabilities: ["image", "video"],
-  describeImage: describeImageWithModel,
+  describeImage,
   describeVideo: describeMoonshotVideo,
 };
diff --git a/src/media-understanding/providers/openai/index.ts b/src/media-understanding/providers/openai/index.ts
index 24d01964562..149bac441cf 100644
--- a/src/media-understanding/providers/openai/index.ts
+++ b/src/media-understanding/providers/openai/index.ts
@@ -1,10 +1,16 @@
 import type { MediaUnderstandingProvider } from "../../types.js";
-import { describeImageWithModel } from "../image.js";
 import { transcribeOpenAiCompatibleAudio } from "./audio.js";
 
+const describeImage = async (
+  ...args: Parameters<NonNullable<MediaUnderstandingProvider["describeImage"]>>
+) => {
+  const { describeImageWithModel } = await import("../image.js");
+  return describeImageWithModel(...args);
+};
+
 export const openaiProvider: MediaUnderstandingProvider = {
   id: "openai",
   capabilities: ["image", "audio"],
-  describeImage: describeImageWithModel,
+  describeImage,
   transcribeAudio: transcribeOpenAiCompatibleAudio,
 };
diff --git a/src/media-understanding/providers/zai/index.ts b/src/media-understanding/providers/zai/index.ts
index 337ea0a6853..1de22ce6c1f 100644
--- a/src/media-understanding/providers/zai/index.ts
+++ b/src/media-understanding/providers/zai/index.ts
@@ -1,8 +1,14 @@
 import type { MediaUnderstandingProvider } from "../../types.js";
-import { describeImageWithModel } from "../image.js";
+
+const describeImage = async (
+  ...args: Parameters<NonNullable<MediaUnderstandingProvider["describeImage"]>>
+) => {
+  const { describeImageWithModel } = await import("../image.js");
+  return describeImageWithModel(...args);
+};
 
 export const zaiProvider: MediaUnderstandingProvider = {
   id: "zai",
   capabilities: ["image"],
-  describeImage: describeImageWithModel,
+  describeImage,
 };
diff --git a/src/media-understanding/runner.entries.ts b/src/media-understanding/runner.entries.ts
index 1a104714bb8..85c4fd3cf11 100644
--- a/src/media-understanding/runner.entries.ts
+++ b/src/media-understanding/runner.entries.ts
@@ -1,292 +1,19 @@
-import fs from "node:fs/promises";
-import path from "node:path";
-import {
-  collectProviderApiKeysForExecution,
-  executeWithApiKeyRotation,
-} from "../agents/api-key-rotation.js";
-import { requireApiKey, resolveApiKeyForProvider } from "../agents/model-auth.js";
 import type { MsgContext } from "../auto-reply/templating.js";
-import { applyTemplate } from "../auto-reply/templating.js";
 import type { OpenClawConfig } from "../config/config.js";
 import type {
   MediaUnderstandingConfig,
   MediaUnderstandingModelConfig,
 } from "../config/types.tools.js";
-import {
-  getExtensionHostMediaUnderstandingProvider,
-  normalizeExtensionHostMediaProviderId,
-} from "../extension-host/media-runtime-registry.js";
-import { logVerbose, shouldLogVerbose } from "../globals.js";
-import { resolveProxyFetchFromEnv } from "../infra/net/proxy-fetch.js";
-import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
-import { runExec } from "../process/exec.js";
-import { MediaAttachmentCache } from "./attachments.js";
-import {
-  CLI_OUTPUT_MAX_BUFFER,
-  DEFAULT_AUDIO_MODELS,
-  DEFAULT_TIMEOUT_SECONDS,
-  MIN_AUDIO_FILE_BYTES,
-} from "./defaults.js";
-import { MediaUnderstandingSkipError } from "./errors.js";
-import { fileExists } from "./fs.js";
-import { extractGeminiResponse } from "./output-extract.js";
-import { describeImageWithModel } from "./providers/image.js";
-import { resolveMaxBytes, resolveMaxChars, resolvePrompt, resolveTimeoutMs } from "./resolve.js";
+import { normalizeExtensionHostMediaProviderId } from "../extension-host/media-runtime-registry.js";
+import type { MediaAttachmentCache } from "./attachments.js";
 import type {
   MediaUnderstandingCapability,
   MediaUnderstandingDecision,
   MediaUnderstandingModelDecision,
   MediaUnderstandingOutput,
-  MediaUnderstandingProvider,
 } from "./types.js";
-import { estimateBase64Size, resolveVideoMaxBase64Bytes } from "./video.js";
 
-export type ProviderRegistry = Map<string, MediaUnderstandingProvider>;
-
-function sanitizeProviderHeaders(
-  headers: Record<string, unknown> | undefined,
-): Record<string, string> | undefined {
-  if (!headers) {
-    return undefined;
-  }
-  const next: Record<string, string> = {};
-  for (const [key, value] of Object.entries(headers)) {
-    if (typeof value !== "string") {
-      continue;
-    }
-    // Intentionally preserve marker-shaped values here. This path handles
-    // explicit config/runtime provider headers, where literal values may
-    // legitimately match marker patterns; discovered models.json entries are
-    // sanitized separately in the model registry path.
-    next[key] = value;
-  }
-  return Object.keys(next).length > 0 ? next : undefined;
-}
-
-function trimOutput(text: string, maxChars?: number): string {
-  const trimmed = text.trim();
-  if (!maxChars || trimmed.length <= maxChars) {
-    return trimmed;
-  }
-  return trimmed.slice(0, maxChars).trim();
-}
-
-function extractSherpaOnnxText(raw: string): string | null {
-  const tryParse = (value: string): string | null => {
-    const trimmed = value.trim();
-    if (!trimmed) {
-      return null;
-    }
-    const head = trimmed[0];
-    if (head !== "{" && head !== '"') {
-      return null;
-    }
-    try {
-      const parsed = JSON.parse(trimmed) as unknown;
-      if (typeof parsed === "string") {
-        return tryParse(parsed);
-      }
-      if (parsed && typeof parsed === "object") {
-        const text = (parsed as { text?: unknown }).text;
-        if (typeof text === "string" && text.trim()) {
-          return text.trim();
-        }
-      }
-    } catch {}
-    return null;
-  };
-
-  const direct = tryParse(raw);
-  if (direct) {
-    return direct;
-  }
-
-  const lines = raw
-    .split("\n")
-    .map((line) => line.trim())
-    .filter(Boolean);
-  for (let i = lines.length - 1; i >= 0; i -= 1) {
-    const parsed = tryParse(lines[i] ?? "");
-    if (parsed) {
-      return parsed;
-    }
-  }
-  return null;
-}
-
-function commandBase(command: string): string {
-  return path.parse(command).name;
-}
-
-function findArgValue(args: string[], keys: string[]): string | undefined {
-  for (let i = 0; i < args.length; i += 1) {
-    if (keys.includes(args[i] ?? "")) {
-      const value = args[i + 1];
-      if (value) {
-        return value;
-      }
-    }
-  }
-  return undefined;
-}
-
-function hasArg(args: string[], keys: string[]): boolean {
-  return args.some((arg) => keys.includes(arg));
-}
-
-function resolveWhisperOutputPath(args: string[], mediaPath: string): string | null {
-  const outputDir = findArgValue(args, ["--output_dir", "-o"]);
-  const outputFormat = findArgValue(args, ["--output_format"]);
-  if (!outputDir || !outputFormat) {
-    return null;
-  }
-  const formats = outputFormat.split(",").map((value) => value.trim());
-  if (!formats.includes("txt")) {
-    return null;
-  }
-  const base = path.parse(mediaPath).name;
-  return path.join(outputDir, `${base}.txt`);
-}
-
-function resolveWhisperCppOutputPath(args: string[]): string | null {
-  if (!hasArg(args, ["-otxt", "--output-txt"])) {
-    return null;
-  }
-  const outputBase = findArgValue(args, ["-of", "--output-file"]);
-  if (!outputBase) {
-    return null;
-  }
-  return `${outputBase}.txt`;
-}
-
-function resolveParakeetOutputPath(args: string[], mediaPath: string): string | null {
-  const outputDir = findArgValue(args, ["--output-dir"]);
-  const outputFormat = findArgValue(args, ["--output-format"]);
-  if (!outputDir) {
-    return null;
-  }
-  if (outputFormat && outputFormat !== "txt") {
-    return null;
-  }
-  const base = path.parse(mediaPath).name;
-  return path.join(outputDir, `${base}.txt`);
-}
-
-async function resolveCliOutput(params: {
-  command: string;
-  args: string[];
-  stdout: string;
-  mediaPath: string;
-}): Promise<string> {
-  const commandId = commandBase(params.command);
-  const fileOutput =
-    commandId === "whisper-cli"
-      ? resolveWhisperCppOutputPath(params.args)
-      : commandId === "whisper"
-        ? resolveWhisperOutputPath(params.args, params.mediaPath)
-        : commandId === "parakeet-mlx"
-          ? resolveParakeetOutputPath(params.args, params.mediaPath)
-          : null;
-  if (fileOutput && (await fileExists(fileOutput))) {
-    try {
-      const content = await fs.readFile(fileOutput, "utf8");
-      if (content.trim()) {
-        return content.trim();
-      }
-    } catch {}
-  }
-
-  if (commandId === "gemini") {
-    const response = extractGeminiResponse(params.stdout);
-    if (response) {
-      return response;
-    }
-  }
-
-  if (commandId === "sherpa-onnx-offline") {
-    const response = extractSherpaOnnxText(params.stdout);
-    if (response) {
-      return response;
-    }
-  }
-
-  return params.stdout.trim();
-}
-
-type ProviderQuery = Record<string, string | number | boolean>;
-
-function normalizeProviderQuery(
-  options?: Record<string, string | number | boolean>,
-): ProviderQuery | undefined {
-  if (!options) {
-    return undefined;
-  }
-  const query: ProviderQuery = {};
-  for (const [key, value] of Object.entries(options)) {
-    if (value === undefined) {
-      continue;
-    }
-    query[key] = value;
-  }
-  return Object.keys(query).length > 0 ? query : undefined;
-}
-
-function buildDeepgramCompatQuery(options?: {
-  detectLanguage?: boolean;
-  punctuate?: boolean;
-  smartFormat?: boolean;
-}): ProviderQuery | undefined {
-  if (!options) {
-    return undefined;
-  }
-  const query: ProviderQuery = {};
-  if (typeof options.detectLanguage === "boolean") {
-    query.detect_language = options.detectLanguage;
-  }
-  if (typeof options.punctuate === "boolean") {
-    query.punctuate = options.punctuate;
-  }
-  if (typeof options.smartFormat === "boolean") {
-    query.smart_format = options.smartFormat;
-  }
-  return Object.keys(query).length > 0 ? query : undefined;
-}
-
-function normalizeDeepgramQueryKeys(query: ProviderQuery): ProviderQuery {
-  const normalized = { ...query };
-  if ("detectLanguage" in normalized) {
-    normalized.detect_language = normalized.detectLanguage as boolean;
-    delete normalized.detectLanguage;
-  }
-  if ("smartFormat" in normalized) {
-    normalized.smart_format = normalized.smartFormat as boolean;
-    delete normalized.smartFormat;
-  }
-  return normalized;
-}
-
-function resolveProviderQuery(params: {
-  providerId: string;
-  config?: MediaUnderstandingConfig;
-  entry: MediaUnderstandingModelConfig;
-}): ProviderQuery | undefined {
-  const { providerId, config, entry } = params;
-  const mergedOptions = normalizeProviderQuery({
-    ...config?.providerOptions?.[providerId],
-    ...entry.providerOptions?.[providerId],
-  });
-  if (providerId !== "deepgram") {
-    return mergedOptions;
-  }
-  const query = normalizeDeepgramQueryKeys(mergedOptions ?? {});
-  const compat = buildDeepgramCompatQuery({ ...config?.deepgram, ...entry.deepgram });
-  for (const [key, value] of Object.entries(compat ?? {})) {
-    if (query[key] === undefined) {
-      query[key] = value;
-    }
-  }
-  return Object.keys(query).length > 0 ? query : undefined;
-}
+export type ProviderRegistry = Map<string, import("./types.js").MediaUnderstandingProvider>;
 
 export function buildModelDecision(params: {
   entry: MediaUnderstandingModelConfig;
@@ -317,74 +44,6 @@ export function buildModelDecision(params: {
   };
 }
 
-function resolveEntryRunOptions(params: {
-  capability: MediaUnderstandingCapability;
-  entry: MediaUnderstandingModelConfig;
-  cfg: OpenClawConfig;
-  config?: MediaUnderstandingConfig;
-}): { maxBytes: number; maxChars?: number; timeoutMs: number; prompt: string } {
-  const { capability, entry, cfg } = params;
-  const maxBytes = resolveMaxBytes({ capability, entry, cfg, config: params.config });
-  const maxChars = resolveMaxChars({ capability, entry, cfg, config: params.config });
-  const timeoutMs = resolveTimeoutMs(
-    entry.timeoutSeconds ??
-      params.config?.timeoutSeconds ??
-      cfg.tools?.media?.[capability]?.timeoutSeconds,
-    DEFAULT_TIMEOUT_SECONDS[capability],
-  );
-  const prompt = resolvePrompt(
-    capability,
-    entry.prompt ?? params.config?.prompt ?? cfg.tools?.media?.[capability]?.prompt,
-    maxChars,
-  );
-  return { maxBytes, maxChars, timeoutMs, prompt };
-}
-
-async function resolveProviderExecutionAuth(params: {
-  providerId: string;
-  cfg: OpenClawConfig;
-  entry: MediaUnderstandingModelConfig;
-  agentDir?: string;
-}) {
-  const auth = await resolveApiKeyForProvider({
-    provider: params.providerId,
-    cfg: params.cfg,
-    profileId: params.entry.profile,
-    preferredProfile: params.entry.preferredProfile,
-    agentDir: params.agentDir,
-  });
-  return {
-    apiKeys: collectProviderApiKeysForExecution({
-      provider: params.providerId,
-      primaryApiKey: requireApiKey(auth, params.providerId),
-    }),
-    providerConfig: params.cfg.models?.providers?.[params.providerId],
-  };
-}
-
-async function resolveProviderExecutionContext(params: {
-  providerId: string;
-  cfg: OpenClawConfig;
-  entry: MediaUnderstandingModelConfig;
-  config?: MediaUnderstandingConfig;
-  agentDir?: string;
-}) {
-  const { apiKeys, providerConfig } = await resolveProviderExecutionAuth({
-    providerId: params.providerId,
-    cfg: params.cfg,
-    entry: params.entry,
-    agentDir: params.agentDir,
-  });
-  const baseUrl = params.entry.baseUrl ?? params.config?.baseUrl ?? providerConfig?.baseUrl;
-  const mergedHeaders = {
-    ...sanitizeProviderHeaders(providerConfig?.headers as Record<string, unknown> | undefined),
-    ...sanitizeProviderHeaders(params.config?.headers as Record<string, unknown> | undefined),
-    ...sanitizeProviderHeaders(params.entry.headers as Record<string, unknown> | undefined),
-  };
-  const headers = Object.keys(mergedHeaders).length > 0 ? mergedHeaders : undefined;
-  return { apiKeys, baseUrl, headers };
-}
-
 export function formatDecisionSummary(decision: MediaUnderstandingDecision): string {
   const attachments = Array.isArray(decision.attachments) ? decision.attachments : [];
   const total = attachments.length;
@@ -408,16 +67,6 @@ export function formatDecisionSummary(decision: MediaUnderstandingDecision): str
   return `${decision.capability}: ${decision.outcome}${countLabel}${viaLabel}${reasonLabel}`;
 }
 
-function assertMinAudioSize(params: { size: number; attachmentIndex: number }): void {
-  if (params.size >= MIN_AUDIO_FILE_BYTES) {
-    return;
-  }
-  throw new MediaUnderstandingSkipError(
-    "tooSmall",
-    `Audio attachment ${params.attachmentIndex + 1} is too small (${params.size} bytes, minimum ${MIN_AUDIO_FILE_BYTES})`,
-  );
-}
-
 export async function runProviderEntry(params: {
   capability: MediaUnderstandingCapability;
   entry: MediaUnderstandingModelConfig;
@@ -429,169 +78,8 @@ export async function runProviderEntry(params: {
   providerRegistry: ProviderRegistry;
   config?: MediaUnderstandingConfig;
 }): Promise<MediaUnderstandingOutput | null> {
-  const { entry, capability, cfg } = params;
-  const providerIdRaw = entry.provider?.trim();
-  if (!providerIdRaw) {
-    throw new Error(`Provider entry missing provider for ${capability}`);
-  }
-  const providerId = normalizeExtensionHostMediaProviderId(providerIdRaw);
-  const { maxBytes, maxChars, timeoutMs, prompt } = resolveEntryRunOptions({
-    capability,
-    entry,
-    cfg,
-    config: params.config,
-  });
-
-  if (capability === "image") {
-    if (!params.agentDir) {
-      throw new Error("Image understanding requires agentDir");
-    }
-    const modelId = entry.model?.trim();
-    if (!modelId) {
-      throw new Error("Image understanding requires model id");
-    }
-    const media = await params.cache.getBuffer({
-      attachmentIndex: params.attachmentIndex,
-      maxBytes,
-      timeoutMs,
-    });
-    const provider = getExtensionHostMediaUnderstandingProvider(
-      providerId,
-      params.providerRegistry,
-    );
-    const imageInput = {
-      buffer: media.buffer,
-      fileName: media.fileName,
-      mime: media.mime,
-      model: modelId,
-      provider: providerId,
-      prompt,
-      timeoutMs,
-      profile: entry.profile,
-      preferredProfile: entry.preferredProfile,
-      agentDir: params.agentDir,
-      cfg: params.cfg,
-    };
-    const describeImage = provider?.describeImage ?? describeImageWithModel;
-    const result = await describeImage(imageInput);
-    return {
-      kind: "image.description",
-      attachmentIndex: params.attachmentIndex,
-      text: trimOutput(result.text, maxChars),
-      provider: providerId,
-      model: result.model ?? modelId,
-    };
-  }
-
-  const provider = getExtensionHostMediaUnderstandingProvider(providerId, params.providerRegistry);
-  if (!provider) {
-    throw new Error(`Media provider not available: ${providerId}`);
-  }
-
-  // Resolve proxy-aware fetch from env vars (HTTPS_PROXY, HTTP_PROXY, etc.)
-  // so provider HTTP calls are routed through the proxy when configured.
-  const fetchFn = resolveProxyFetchFromEnv();
-
-  if (capability === "audio") {
-    if (!provider.transcribeAudio) {
-      throw new Error(`Audio transcription provider "${providerId}" not available.`);
-    }
-    const transcribeAudio = provider.transcribeAudio;
-    const media = await params.cache.getBuffer({
-      attachmentIndex: params.attachmentIndex,
-      maxBytes,
-      timeoutMs,
-    });
-    assertMinAudioSize({ size: media.size, attachmentIndex: params.attachmentIndex });
-    const { apiKeys, baseUrl, headers } = await resolveProviderExecutionContext({
-      providerId,
-      cfg,
-      entry,
-      config: params.config,
-      agentDir: params.agentDir,
-    });
-    const providerQuery = resolveProviderQuery({
-      providerId,
-      config: params.config,
-      entry,
-    });
-    const model = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model;
-    const result = await executeWithApiKeyRotation({
-      provider: providerId,
-      apiKeys,
-      execute: async (apiKey) =>
-        transcribeAudio({
-          buffer: media.buffer,
-          fileName: media.fileName,
-          mime: media.mime,
-          apiKey,
-          baseUrl,
-          headers,
-          model,
-          language: entry.language ?? params.config?.language ?? cfg.tools?.media?.audio?.language,
-          prompt,
-          query: providerQuery,
-          timeoutMs,
-          fetchFn,
-        }),
-    });
-    return {
-      kind: "audio.transcription",
-      attachmentIndex: params.attachmentIndex,
-      text: trimOutput(result.text, maxChars),
-      provider: providerId,
-      model: result.model ?? model,
-    };
-  }
-
-  if (!provider.describeVideo) {
-    throw new Error(`Video understanding provider "${providerId}" not available.`);
-  }
-  const describeVideo = provider.describeVideo;
-  const media = await params.cache.getBuffer({
-    attachmentIndex: params.attachmentIndex,
-    maxBytes,
-    timeoutMs,
-  });
-  const estimatedBase64Bytes = estimateBase64Size(media.size);
-  const maxBase64Bytes = resolveVideoMaxBase64Bytes(maxBytes);
-  if (estimatedBase64Bytes > maxBase64Bytes) {
-    throw new MediaUnderstandingSkipError(
-      "maxBytes",
-      `Video attachment ${params.attachmentIndex + 1} base64 payload ${estimatedBase64Bytes} exceeds ${maxBase64Bytes}`,
-    );
-  }
-  const { apiKeys, baseUrl, headers } = await resolveProviderExecutionContext({
-    providerId,
-    cfg,
-    entry,
-    config: params.config,
-    agentDir: params.agentDir,
-  });
-  const result = await executeWithApiKeyRotation({
-    provider: providerId,
-    apiKeys,
-    execute: (apiKey) =>
-      describeVideo({
-        buffer: media.buffer,
-        fileName: media.fileName,
-        mime: media.mime,
-        apiKey,
-        baseUrl,
-        headers,
-        model: entry.model,
-        prompt,
-        timeoutMs,
-        fetchFn,
-      }),
-  });
-  return {
-    kind: "video.description",
-    attachmentIndex: params.attachmentIndex,
-    text: trimOutput(result.text, maxChars),
-    provider: providerId,
-    model: result.model ?? entry.model,
-  };
+  const runtime = await import("../extension-host/media-runtime-execution.js");
+  return runtime.runProviderEntry(params);
 }
 
 export async function runCliEntry(params: {
@@ -603,71 +91,6 @@ export async function runCliEntry(params: {
   cache: MediaAttachmentCache;
   config?: MediaUnderstandingConfig;
 }): Promise<MediaUnderstandingOutput | null> {
-  const { entry, capability, cfg, ctx } = params;
-  const command = entry.command?.trim();
-  const args = entry.args ?? [];
-  if (!command) {
-    throw new Error(`CLI entry missing command for ${capability}`);
-  }
-  const { maxBytes, maxChars, timeoutMs, prompt } = resolveEntryRunOptions({
-    capability,
-    entry,
-    cfg,
-    config: params.config,
-  });
-  const pathResult = await params.cache.getPath({
-    attachmentIndex: params.attachmentIndex,
-    maxBytes,
-    timeoutMs,
-  });
-  if (capability === "audio") {
-    const stat = await fs.stat(pathResult.path);
-    assertMinAudioSize({ size: stat.size, attachmentIndex: params.attachmentIndex });
-  }
-  const outputDir = await fs.mkdtemp(
-    path.join(resolvePreferredOpenClawTmpDir(), "openclaw-media-cli-"),
-  );
-  const mediaPath = pathResult.path;
-  const outputBase = path.join(outputDir, path.parse(mediaPath).name);
-
-  const templCtx: MsgContext = {
-    ...ctx,
-    MediaPath: mediaPath,
-    MediaDir: path.dirname(mediaPath),
-    OutputDir: outputDir,
-    OutputBase: outputBase,
-    Prompt: prompt,
-    MaxChars: maxChars,
-  };
-  const argv = [command, ...args].map((part, index) =>
-    index === 0 ? part : applyTemplate(part, templCtx),
-  );
-  try {
-    if (shouldLogVerbose()) {
-      logVerbose(`Media understanding via CLI: ${argv.join(" ")}`);
-    }
-    const { stdout } = await runExec(argv[0], argv.slice(1), {
-      timeoutMs,
-      maxBuffer: CLI_OUTPUT_MAX_BUFFER,
-    });
-    const resolved = await resolveCliOutput({
-      command,
-      args: argv.slice(1),
-      stdout,
-      mediaPath,
-    });
-    const text = trimOutput(resolved, maxChars);
-    if (!text) {
-      return null;
-    }
-    return {
-      kind: capability === "audio" ? "audio.transcription" : `${capability}.description`,
-      attachmentIndex: params.attachmentIndex,
-      text,
-      provider: "cli",
-      model: command,
-    };
-  } finally {
-    await fs.rm(outputDir, { recursive: true, force: true }).catch(() => {});
-  }
+  const runtime = await import("../extension-host/media-runtime-execution.js");
+  return runtime.runCliEntry(params);
 }