2026-03-02 23:31:57 +03:00
|
|
|
|
import fs from "node:fs/promises";
|
|
|
|
|
|
import path from "node:path";
|
|
|
|
|
|
import { afterAll, beforeAll, beforeEach, describe, expect, it, vi } from "vitest";
|
|
|
|
|
|
import type { MsgContext } from "../auto-reply/templating.js";
|
|
|
|
|
|
import type { OpenClawConfig } from "../config/config.js";
|
|
|
|
|
|
import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
|
2026-03-02 22:00:46 +00:00
|
|
|
|
import { createSafeAudioFixtureBuffer } from "./runner.test-utils.js";
|
2026-03-02 23:31:57 +03:00
|
|
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
// Module mocks
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
vi.mock("../agents/model-auth.js", () => ({
|
|
|
|
|
|
resolveApiKeyForProvider: vi.fn(async () => ({
|
2026-03-07 13:06:35 -05:00
|
|
|
|
apiKey: "test-key", // pragma: allowlist secret
|
2026-03-02 23:31:57 +03:00
|
|
|
|
source: "test",
|
|
|
|
|
|
mode: "api-key",
|
|
|
|
|
|
})),
|
|
|
|
|
|
requireApiKey: (auth: { apiKey?: string; mode?: string }, provider: string) => {
|
|
|
|
|
|
if (auth?.apiKey) {
|
|
|
|
|
|
return auth.apiKey;
|
|
|
|
|
|
}
|
|
|
|
|
|
throw new Error(`No API key resolved for provider "${provider}" (auth mode: ${auth?.mode}).`);
|
|
|
|
|
|
},
|
|
|
|
|
|
resolveAwsSdkEnvVarName: vi.fn(() => undefined),
|
|
|
|
|
|
resolveEnvApiKey: vi.fn(() => null),
|
|
|
|
|
|
resolveModelAuthMode: vi.fn(() => "api-key"),
|
|
|
|
|
|
getApiKeyForModel: vi.fn(async () => ({ apiKey: "test-key", source: "test", mode: "api-key" })),
|
|
|
|
|
|
getCustomProviderApiKey: vi.fn(() => undefined),
|
|
|
|
|
|
ensureAuthProfileStore: vi.fn(async () => ({})),
|
|
|
|
|
|
resolveAuthProfileOrder: vi.fn(() => []),
|
|
|
|
|
|
}));
|
|
|
|
|
|
|
2026-03-02 22:00:46 +00:00
|
|
|
|
const { MediaFetchErrorMock } = vi.hoisted(() => {
|
|
|
|
|
|
class MediaFetchErrorMock extends Error {
|
|
|
|
|
|
code: string;
|
|
|
|
|
|
constructor(message: string, code: string) {
|
|
|
|
|
|
super(message);
|
|
|
|
|
|
this.name = "MediaFetchError";
|
|
|
|
|
|
this.code = code;
|
|
|
|
|
|
}
|
2026-03-02 23:31:57 +03:00
|
|
|
|
}
|
2026-03-02 22:00:46 +00:00
|
|
|
|
return { MediaFetchErrorMock };
|
|
|
|
|
|
});
|
2026-03-02 23:31:57 +03:00
|
|
|
|
|
|
|
|
|
|
vi.mock("../media/fetch.js", () => ({
|
|
|
|
|
|
fetchRemoteMedia: vi.fn(),
|
|
|
|
|
|
MediaFetchError: MediaFetchErrorMock,
|
|
|
|
|
|
}));
|
|
|
|
|
|
|
|
|
|
|
|
vi.mock("../process/exec.js", () => ({
|
|
|
|
|
|
runExec: vi.fn(),
|
|
|
|
|
|
runCommandWithTimeout: vi.fn(),
|
|
|
|
|
|
}));
|
|
|
|
|
|
|
|
|
|
|
|
const mockDeliverOutboundPayloads = vi.fn();
|
|
|
|
|
|
|
|
|
|
|
|
vi.mock("../infra/outbound/deliver.js", () => ({
|
|
|
|
|
|
deliverOutboundPayloads: (...args: unknown[]) => mockDeliverOutboundPayloads(...args),
|
|
|
|
|
|
}));
|
|
|
|
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
// Helpers
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
// Module under test, loaded lazily in beforeAll so the vi.mock factories above
// are applied before apply.js is evaluated.
let applyMediaUnderstanding: typeof import("./apply.js").applyMediaUnderstanding;
// Cache-reset hook from runner.js; invoked in beforeEach to isolate tests.
let clearMediaUnderstandingBinaryCacheForTests: () => void;

// Prefix for the suite-level temp directory created in beforeAll.
const TEMP_MEDIA_PREFIX = "openclaw-echo-transcript-test-";
// Root temp dir for all per-case media fixtures; removed in afterAll.
let suiteTempMediaRootDir = "";
|
|
|
|
|
|
|
|
|
|
|
|
async function createTempAudioFile(): Promise<string> {
|
|
|
|
|
|
const dir = await fs.mkdtemp(path.join(suiteTempMediaRootDir, "case-"));
|
|
|
|
|
|
const filePath = path.join(dir, "note.ogg");
|
2026-03-02 22:00:46 +00:00
|
|
|
|
await fs.writeFile(filePath, createSafeAudioFixtureBuffer(2048));
|
2026-03-02 23:31:57 +03:00
|
|
|
|
return filePath;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
function createAudioCtxWithProvider(mediaPath: string, extra?: Partial<MsgContext>): MsgContext {
|
|
|
|
|
|
return {
|
|
|
|
|
|
Body: "<media:audio>",
|
|
|
|
|
|
MediaPath: mediaPath,
|
|
|
|
|
|
MediaType: "audio/ogg",
|
|
|
|
|
|
Provider: "whatsapp",
|
|
|
|
|
|
From: "+10000000001",
|
|
|
|
|
|
AccountId: "acc1",
|
|
|
|
|
|
...extra,
|
|
|
|
|
|
};
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
function createAudioConfigWithEcho(opts?: {
|
|
|
|
|
|
echoTranscript?: boolean;
|
|
|
|
|
|
echoFormat?: string;
|
|
|
|
|
|
transcribedText?: string;
|
|
|
|
|
|
}): {
|
|
|
|
|
|
cfg: OpenClawConfig;
|
|
|
|
|
|
providers: Record<string, { id: string; transcribeAudio: () => Promise<{ text: string }> }>;
|
|
|
|
|
|
} {
|
|
|
|
|
|
const cfg: OpenClawConfig = {
|
|
|
|
|
|
tools: {
|
|
|
|
|
|
media: {
|
|
|
|
|
|
audio: {
|
|
|
|
|
|
enabled: true,
|
|
|
|
|
|
maxBytes: 1024 * 1024,
|
|
|
|
|
|
models: [{ provider: "groq" }],
|
|
|
|
|
|
echoTranscript: opts?.echoTranscript ?? true,
|
|
|
|
|
|
...(opts?.echoFormat !== undefined ? { echoFormat: opts.echoFormat } : {}),
|
|
|
|
|
|
},
|
|
|
|
|
|
},
|
|
|
|
|
|
},
|
|
|
|
|
|
};
|
|
|
|
|
|
const providers = {
|
|
|
|
|
|
groq: {
|
|
|
|
|
|
id: "groq",
|
|
|
|
|
|
transcribeAudio: async () => ({ text: opts?.transcribedText ?? "hello world" }),
|
|
|
|
|
|
},
|
|
|
|
|
|
};
|
|
|
|
|
|
return { cfg, providers };
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-03-03 01:47:26 +00:00
|
|
|
|
function expectSingleEchoDeliveryCall() {
|
|
|
|
|
|
expect(mockDeliverOutboundPayloads).toHaveBeenCalledOnce();
|
|
|
|
|
|
const callArgs = mockDeliverOutboundPayloads.mock.calls[0]?.[0];
|
|
|
|
|
|
expect(callArgs).toBeDefined();
|
|
|
|
|
|
return callArgs as {
|
|
|
|
|
|
to?: string;
|
|
|
|
|
|
channel?: string;
|
|
|
|
|
|
accountId?: string;
|
|
|
|
|
|
payloads: Array<{ text?: string }>;
|
|
|
|
|
|
};
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
function createAudioConfigWithoutEchoFlag() {
|
|
|
|
|
|
const { cfg, providers } = createAudioConfigWithEcho();
|
|
|
|
|
|
const audio = cfg.tools?.media?.audio as { echoTranscript?: boolean } | undefined;
|
|
|
|
|
|
if (audio && "echoTranscript" in audio) {
|
|
|
|
|
|
delete audio.echoTranscript;
|
|
|
|
|
|
}
|
|
|
|
|
|
return { cfg, providers };
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-03-02 23:31:57 +03:00
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
// Tests
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
describe("applyMediaUnderstanding – echo transcript", () => {
|
|
|
|
|
|
beforeAll(async () => {
|
|
|
|
|
|
const baseDir = resolvePreferredOpenClawTmpDir();
|
|
|
|
|
|
await fs.mkdir(baseDir, { recursive: true });
|
|
|
|
|
|
suiteTempMediaRootDir = await fs.mkdtemp(path.join(baseDir, TEMP_MEDIA_PREFIX));
|
|
|
|
|
|
const mod = await import("./apply.js");
|
|
|
|
|
|
applyMediaUnderstanding = mod.applyMediaUnderstanding;
|
|
|
|
|
|
const runner = await import("./runner.js");
|
|
|
|
|
|
clearMediaUnderstandingBinaryCacheForTests = runner.clearMediaUnderstandingBinaryCacheForTests;
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
beforeEach(() => {
|
|
|
|
|
|
mockDeliverOutboundPayloads.mockClear();
|
|
|
|
|
|
mockDeliverOutboundPayloads.mockResolvedValue([{ channel: "whatsapp", messageId: "echo-1" }]);
|
|
|
|
|
|
clearMediaUnderstandingBinaryCacheForTests?.();
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
afterAll(async () => {
|
|
|
|
|
|
if (!suiteTempMediaRootDir) {
|
|
|
|
|
|
return;
|
|
|
|
|
|
}
|
|
|
|
|
|
await fs.rm(suiteTempMediaRootDir, { recursive: true, force: true });
|
|
|
|
|
|
suiteTempMediaRootDir = "";
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
it("does NOT echo when echoTranscript is false (default)", async () => {
|
|
|
|
|
|
const mediaPath = await createTempAudioFile();
|
|
|
|
|
|
const ctx = createAudioCtxWithProvider(mediaPath);
|
|
|
|
|
|
const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: false });
|
|
|
|
|
|
|
|
|
|
|
|
await applyMediaUnderstanding({ ctx, cfg, providers });
|
|
|
|
|
|
|
|
|
|
|
|
expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
it("does NOT echo when echoTranscript is absent (default)", async () => {
|
|
|
|
|
|
const mediaPath = await createTempAudioFile();
|
|
|
|
|
|
const ctx = createAudioCtxWithProvider(mediaPath);
|
2026-03-03 01:47:26 +00:00
|
|
|
|
const { cfg, providers } = createAudioConfigWithoutEchoFlag();
|
2026-03-02 23:31:57 +03:00
|
|
|
|
|
|
|
|
|
|
await applyMediaUnderstanding({ ctx, cfg, providers });
|
|
|
|
|
|
|
|
|
|
|
|
expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
it("echoes transcript with default format when echoTranscript is true", async () => {
|
|
|
|
|
|
const mediaPath = await createTempAudioFile();
|
|
|
|
|
|
const ctx = createAudioCtxWithProvider(mediaPath);
|
|
|
|
|
|
const { cfg, providers } = createAudioConfigWithEcho({
|
|
|
|
|
|
echoTranscript: true,
|
|
|
|
|
|
transcribedText: "hello world",
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
await applyMediaUnderstanding({ ctx, cfg, providers });
|
|
|
|
|
|
|
2026-03-03 01:47:26 +00:00
|
|
|
|
const callArgs = expectSingleEchoDeliveryCall();
|
2026-03-02 23:31:57 +03:00
|
|
|
|
expect(callArgs.channel).toBe("whatsapp");
|
|
|
|
|
|
expect(callArgs.to).toBe("+10000000001");
|
|
|
|
|
|
expect(callArgs.accountId).toBe("acc1");
|
|
|
|
|
|
expect(callArgs.payloads).toHaveLength(1);
|
|
|
|
|
|
expect(callArgs.payloads[0].text).toBe('📝 "hello world"');
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
it("uses custom echoFormat when provided", async () => {
|
|
|
|
|
|
const mediaPath = await createTempAudioFile();
|
|
|
|
|
|
const ctx = createAudioCtxWithProvider(mediaPath);
|
|
|
|
|
|
const { cfg, providers } = createAudioConfigWithEcho({
|
|
|
|
|
|
echoTranscript: true,
|
|
|
|
|
|
echoFormat: "🎙️ Heard: {transcript}",
|
|
|
|
|
|
transcribedText: "custom message",
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
await applyMediaUnderstanding({ ctx, cfg, providers });
|
|
|
|
|
|
|
2026-03-03 01:47:26 +00:00
|
|
|
|
const callArgs = expectSingleEchoDeliveryCall();
|
|
|
|
|
|
expect(callArgs.payloads[0].text).toBe("🎙️ Heard: custom message");
|
2026-03-02 23:31:57 +03:00
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
it("does NOT echo when there are no audio attachments", async () => {
|
|
|
|
|
|
// Image-only context — no audio attachment
|
|
|
|
|
|
const dir = await fs.mkdtemp(path.join(suiteTempMediaRootDir, "img-"));
|
|
|
|
|
|
const imgPath = path.join(dir, "photo.jpg");
|
|
|
|
|
|
await fs.writeFile(imgPath, Buffer.from([0xff, 0xd8, 0xff, 0xe0]));
|
|
|
|
|
|
|
|
|
|
|
|
const ctx: MsgContext = {
|
|
|
|
|
|
Body: "<media:image>",
|
|
|
|
|
|
MediaPath: imgPath,
|
|
|
|
|
|
MediaType: "image/jpeg",
|
|
|
|
|
|
Provider: "whatsapp",
|
|
|
|
|
|
From: "+10000000001",
|
|
|
|
|
|
};
|
|
|
|
|
|
|
2026-03-03 01:47:26 +00:00
|
|
|
|
const { cfg, providers } = createAudioConfigWithEcho({
|
|
|
|
|
|
echoTranscript: true,
|
|
|
|
|
|
transcribedText: "should not appear",
|
|
|
|
|
|
});
|
|
|
|
|
|
cfg.tools!.media!.image = { enabled: false };
|
2026-03-02 23:31:57 +03:00
|
|
|
|
|
|
|
|
|
|
await applyMediaUnderstanding({ ctx, cfg, providers });
|
|
|
|
|
|
|
|
|
|
|
|
// No audio outputs → Transcript not set → no echo
|
|
|
|
|
|
expect(ctx.Transcript).toBeUndefined();
|
|
|
|
|
|
expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
it("does NOT echo when transcription fails", async () => {
|
|
|
|
|
|
const mediaPath = await createTempAudioFile();
|
|
|
|
|
|
const ctx = createAudioCtxWithProvider(mediaPath);
|
2026-03-03 01:47:26 +00:00
|
|
|
|
const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });
|
|
|
|
|
|
providers.groq.transcribeAudio = async () => {
|
|
|
|
|
|
throw new Error("transcription provider failure");
|
2026-03-02 23:31:57 +03:00
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
// Should not throw; transcription failure is swallowed by runner
|
|
|
|
|
|
await applyMediaUnderstanding({ ctx, cfg, providers });
|
|
|
|
|
|
|
|
|
|
|
|
expect(ctx.Transcript).toBeUndefined();
|
|
|
|
|
|
expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
it("does NOT echo when channel is not deliverable", async () => {
|
|
|
|
|
|
const mediaPath = await createTempAudioFile();
|
|
|
|
|
|
// Use an internal/non-deliverable channel
|
|
|
|
|
|
const ctx = createAudioCtxWithProvider(mediaPath, {
|
|
|
|
|
|
Provider: "internal-system",
|
|
|
|
|
|
From: "some-source",
|
|
|
|
|
|
});
|
|
|
|
|
|
const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });
|
|
|
|
|
|
|
|
|
|
|
|
await applyMediaUnderstanding({ ctx, cfg, providers });
|
|
|
|
|
|
|
|
|
|
|
|
// Transcript should be set (transcription succeeded)
|
|
|
|
|
|
expect(ctx.Transcript).toBe("hello world");
|
|
|
|
|
|
// But echo should be skipped
|
|
|
|
|
|
expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
it("does NOT echo when ctx has no From or OriginatingTo", async () => {
|
|
|
|
|
|
const mediaPath = await createTempAudioFile();
|
|
|
|
|
|
const ctx: MsgContext = {
|
|
|
|
|
|
Body: "<media:audio>",
|
|
|
|
|
|
MediaPath: mediaPath,
|
|
|
|
|
|
MediaType: "audio/ogg",
|
|
|
|
|
|
Provider: "whatsapp",
|
|
|
|
|
|
// From and OriginatingTo intentionally absent
|
|
|
|
|
|
};
|
|
|
|
|
|
const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });
|
|
|
|
|
|
|
|
|
|
|
|
await applyMediaUnderstanding({ ctx, cfg, providers });
|
|
|
|
|
|
|
|
|
|
|
|
expect(ctx.Transcript).toBe("hello world");
|
|
|
|
|
|
expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
it("uses OriginatingTo when From is absent", async () => {
|
|
|
|
|
|
const mediaPath = await createTempAudioFile();
|
|
|
|
|
|
const ctx: MsgContext = {
|
|
|
|
|
|
Body: "<media:audio>",
|
|
|
|
|
|
MediaPath: mediaPath,
|
|
|
|
|
|
MediaType: "audio/ogg",
|
|
|
|
|
|
Provider: "whatsapp",
|
|
|
|
|
|
OriginatingTo: "+19999999999",
|
|
|
|
|
|
};
|
|
|
|
|
|
const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });
|
|
|
|
|
|
|
|
|
|
|
|
await applyMediaUnderstanding({ ctx, cfg, providers });
|
|
|
|
|
|
|
2026-03-03 01:47:26 +00:00
|
|
|
|
const callArgs = expectSingleEchoDeliveryCall();
|
|
|
|
|
|
expect(callArgs.to).toBe("+19999999999");
|
2026-03-02 23:31:57 +03:00
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
it("echo delivery failure does not throw or break transcription", async () => {
|
|
|
|
|
|
const mediaPath = await createTempAudioFile();
|
|
|
|
|
|
const ctx = createAudioCtxWithProvider(mediaPath);
|
|
|
|
|
|
const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });
|
|
|
|
|
|
|
|
|
|
|
|
mockDeliverOutboundPayloads.mockRejectedValueOnce(new Error("delivery timeout"));
|
|
|
|
|
|
|
|
|
|
|
|
// Should not throw
|
|
|
|
|
|
const result = await applyMediaUnderstanding({ ctx, cfg, providers });
|
|
|
|
|
|
|
|
|
|
|
|
// Transcription itself succeeded
|
|
|
|
|
|
expect(result.appliedAudio).toBe(true);
|
|
|
|
|
|
expect(ctx.Transcript).toBe("hello world");
|
|
|
|
|
|
// Deliver was attempted
|
|
|
|
|
|
expect(mockDeliverOutboundPayloads).toHaveBeenCalledOnce();
|
|
|
|
|
|
});
|
|
|
|
|
|
});
|