From b36e456b09160b98779826302961c98b0ff09d9d Mon Sep 17 00:00:00 2001 From: Lakshya Agarwal Date: Fri, 20 Mar 2026 01:06:26 -0400 Subject: [PATCH 01/11] feat: add Tavily as a bundled web search plugin with search and extract tools (#49200) Merged via squash. Prepared head SHA: ece9226e886004f1e0536dd5de3ddc2946fc118c Co-authored-by: lakshyaag-tavily <266572148+lakshyaag-tavily@users.noreply.github.com> Co-authored-by: gumadeiras <5599352+gumadeiras@users.noreply.github.com> Reviewed-by: @gumadeiras --- .github/labeler.yml | 4 + CHANGELOG.md | 1 + docs/docs.json | 1 + .../reference/secretref-credential-surface.md | 1 + ...tref-user-supplied-credentials-matrix.json | 7 + docs/tools/index.md | 2 +- docs/tools/tavily.md | 125 ++++++++ docs/tools/web.md | 44 ++- extensions/brave/openclaw.plugin.json | 3 + extensions/firecrawl/openclaw.plugin.json | 3 + extensions/perplexity/openclaw.plugin.json | 3 + extensions/tavily/index.test.ts | 41 +++ extensions/tavily/index.ts | 15 + extensions/tavily/openclaw.plugin.json | 37 +++ extensions/tavily/package.json | 12 + extensions/tavily/skills/tavily/SKILL.md | 94 ++++++ extensions/tavily/src/config.ts | 71 +++++ extensions/tavily/src/tavily-client.ts | 286 ++++++++++++++++++ .../tavily/src/tavily-extract-tool.test.ts | 53 ++++ extensions/tavily/src/tavily-extract-tool.ts | 74 +++++ .../tavily/src/tavily-search-provider.ts | 76 +++++ extensions/tavily/src/tavily-search-tool.ts | 81 +++++ pnpm-lock.yaml | 2 + src/agents/tools/web-search.ts | 112 +------ src/commands/onboard-search.test.ts | 121 +++++++- src/commands/onboard-search.ts | 11 +- src/config/config.web-search-provider.test.ts | 68 +++++ ...undled-provider-auth-env-vars.generated.ts | 4 + .../bundled-provider-auth-env-vars.test.ts | 7 + src/plugins/bundled-web-search.test.ts | 1 + src/plugins/bundled-web-search.ts | 15 + .../contracts/registry.contract.test.ts | 9 + src/plugins/contracts/registry.ts | 3 +- src/plugins/web-search-providers.test.ts | 5 + 
src/secrets/provider-env-vars.test.ts | 22 +- src/secrets/target-registry-data.ts | 11 + src/web-search/runtime.test.ts | 77 +++++ 37 files changed, 1378 insertions(+), 124 deletions(-) create mode 100644 docs/tools/tavily.md create mode 100644 extensions/tavily/index.test.ts create mode 100644 extensions/tavily/index.ts create mode 100644 extensions/tavily/openclaw.plugin.json create mode 100644 extensions/tavily/package.json create mode 100644 extensions/tavily/skills/tavily/SKILL.md create mode 100644 extensions/tavily/src/config.ts create mode 100644 extensions/tavily/src/tavily-client.ts create mode 100644 extensions/tavily/src/tavily-extract-tool.test.ts create mode 100644 extensions/tavily/src/tavily-extract-tool.ts create mode 100644 extensions/tavily/src/tavily-search-provider.ts create mode 100644 extensions/tavily/src/tavily-search-tool.ts diff --git a/.github/labeler.yml b/.github/labeler.yml index 4ee43d5e6fa..67a74985465 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -293,6 +293,10 @@ - changed-files: - any-glob-to-any-file: - "extensions/synthetic/**" +"extensions: tavily": + - changed-files: + - any-glob-to-any-file: + - "extensions/tavily/**" "extensions: talk-voice": - changed-files: - any-glob-to-any-file: diff --git a/CHANGELOG.md b/CHANGELOG.md index e0c87b836a9..37ff9e33f36 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -47,6 +47,7 @@ Docs: https://docs.openclaw.ai - Plugins/Xiaomi: switch the bundled Xiaomi provider to the `/v1` OpenAI-compatible endpoint and add MiMo V2 Pro plus MiMo V2 Omni to the built-in catalog. (#49214) thanks @DJjjjhao. - Plugins/Matrix: add `allowBots` room policy so configured Matrix bot accounts can talk to each other, with optional mention-only gating. Thanks @gumadeiras. - Plugins/Matrix: add per-account `allowPrivateNetwork` opt-in for private/internal homeservers, while keeping public cleartext homeservers blocked. Thanks @gumadeiras. 
+- Web tools/Tavily: add Tavily as a bundled web-search provider with dedicated `tavily_search` and `tavily_extract` tools, using canonical plugin-owned config under `plugins.entries.tavily.config.webSearch.*`. (#49200) thanks @lakshyaag-tavily. ### Fixes diff --git a/docs/docs.json b/docs/docs.json index bd7d01fc43b..a941bec2601 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -1031,6 +1031,7 @@ "tools/exec", "tools/exec-approvals", "tools/firecrawl", + "tools/tavily", "tools/llm-task", "tools/lobster", "tools/loop-detection", diff --git a/docs/reference/secretref-credential-surface.md b/docs/reference/secretref-credential-surface.md index 39420e335bf..d0a11bc68ef 100644 --- a/docs/reference/secretref-credential-surface.md +++ b/docs/reference/secretref-credential-surface.md @@ -38,6 +38,7 @@ Scope intent: - `plugins.entries.moonshot.config.webSearch.apiKey` - `plugins.entries.perplexity.config.webSearch.apiKey` - `plugins.entries.firecrawl.config.webSearch.apiKey` +- `plugins.entries.tavily.config.webSearch.apiKey` - `tools.web.search.apiKey` - `tools.web.search.gemini.apiKey` - `tools.web.search.grok.apiKey` diff --git a/docs/reference/secretref-user-supplied-credentials-matrix.json b/docs/reference/secretref-user-supplied-credentials-matrix.json index d4706e40304..cca7bb38c4b 100644 --- a/docs/reference/secretref-user-supplied-credentials-matrix.json +++ b/docs/reference/secretref-user-supplied-credentials-matrix.json @@ -551,6 +551,13 @@ "path": "tools.web.search.perplexity.apiKey", "secretShape": "secret_input", "optIn": true + }, + { + "id": "plugins.entries.tavily.config.webSearch.apiKey", + "configFile": "openclaw.json", + "path": "plugins.entries.tavily.config.webSearch.apiKey", + "secretShape": "secret_input", + "optIn": true } ] } diff --git a/docs/tools/index.md b/docs/tools/index.md index 55e52bf46da..91297e5775c 100644 --- a/docs/tools/index.md +++ b/docs/tools/index.md @@ -256,7 +256,7 @@ Enable with `tools.loopDetection.enabled: true` (default 
is `false`). ### `web_search` -Search the web using Brave, Firecrawl, Gemini, Grok, Kimi, or Perplexity. +Search the web using Brave, Firecrawl, Gemini, Grok, Kimi, Perplexity, or Tavily. Core parameters: diff --git a/docs/tools/tavily.md b/docs/tools/tavily.md new file mode 100644 index 00000000000..dcf7ce4c1ad --- /dev/null +++ b/docs/tools/tavily.md @@ -0,0 +1,125 @@ +--- +summary: "Tavily search and extract tools" +read_when: + - You want Tavily-backed web search + - You need a Tavily API key + - You want Tavily as a web_search provider + - You want content extraction from URLs +title: "Tavily" +--- + +# Tavily + +OpenClaw can use **Tavily** in two ways: + +- as the `web_search` provider +- as explicit plugin tools: `tavily_search` and `tavily_extract` + +Tavily is a search API designed for AI applications, returning structured results +optimized for LLM consumption. It supports configurable search depth, topic +filtering, domain filters, AI-generated answer summaries, and content extraction +from URLs (including JavaScript-rendered pages). + +## Get an API key + +1. Create a Tavily account at [tavily.com](https://tavily.com/). +2. Generate an API key in the dashboard. +3. Store it in config or set `TAVILY_API_KEY` in the gateway environment. + +## Configure Tavily search + +```json5 +{ + plugins: { + entries: { + tavily: { + enabled: true, + config: { + webSearch: { + apiKey: "tvly-...", // optional if TAVILY_API_KEY is set + baseUrl: "https://api.tavily.com", + }, + }, + }, + }, + }, + tools: { + web: { + search: { + provider: "tavily", + }, + }, + }, +} +``` + +Notes: + +- Choosing Tavily in onboarding or `openclaw configure --section web` enables + the bundled Tavily plugin automatically. +- Store Tavily config under `plugins.entries.tavily.config.webSearch.*`. +- `web_search` with Tavily supports `query` and `count` (up to 20 results). +- For Tavily-specific controls like `search_depth`, `topic`, `include_answer`, + or domain filters, use `tavily_search`. 
+ +## Tavily plugin tools + +### `tavily_search` + +Use this when you want Tavily-specific search controls instead of generic +`web_search`. + +| Parameter | Description | +| ----------------- | --------------------------------------------------------------------- | +| `query` | Search query string (keep under 400 characters) | +| `search_depth` | `basic` (default, balanced) or `advanced` (highest relevance, slower) | +| `topic` | `general` (default), `news` (real-time updates), or `finance` | +| `max_results` | Number of results, 1-20 (default: 5) | +| `include_answer` | Include an AI-generated answer summary (default: false) | +| `time_range` | Filter by recency: `day`, `week`, `month`, or `year` | +| `include_domains` | Array of domains to restrict results to | +| `exclude_domains` | Array of domains to exclude from results | + +**Search depth:** + +| Depth | Speed | Relevance | Best for | +| ---------- | ------ | --------- | ----------------------------------- | +| `basic` | Faster | High | General-purpose queries (default) | +| `advanced` | Slower | Highest | Precision, specific facts, research | + +### `tavily_extract` + +Use this to extract clean content from one or more URLs. Handles +JavaScript-rendered pages and supports query-focused chunking for targeted +extraction. 
+ +| Parameter | Description | +| ------------------- | ---------------------------------------------------------- | +| `urls` | Array of URLs to extract (1-20 per request) | +| `query` | Rerank extracted chunks by relevance to this query | +| `extract_depth` | `basic` (default, fast) or `advanced` (for JS-heavy pages) | +| `chunks_per_source` | Chunks per URL, 1-5 (requires `query`) | +| `include_images` | Include image URLs in results (default: false) | + +**Extract depth:** + +| Depth | When to use | +| ---------- | ----------------------------------------- | +| `basic` | Simple pages - try this first | +| `advanced` | JS-rendered SPAs, dynamic content, tables | + +Tips: + +- Max 20 URLs per request. Batch larger lists into multiple calls. +- Use `query` + `chunks_per_source` to get only relevant content instead of full pages. +- Try `basic` first; fall back to `advanced` if content is missing or incomplete. + +## Choosing the right tool + +| Need | Tool | +| ------------------------------------ | ---------------- | +| Quick web search, no special options | `web_search` | +| Search with depth, topic, AI answers | `tavily_search` | +| Extract content from specific URLs | `tavily_extract` | + +See [Web tools](/tools/web) for the full web tool setup and provider comparison. diff --git a/docs/tools/web.md b/docs/tools/web.md index 313e709c32f..8d5b6bff5f1 100644 --- a/docs/tools/web.md +++ b/docs/tools/web.md @@ -1,5 +1,5 @@ --- -summary: "Web search + fetch tools (Brave, Firecrawl, Gemini, Grok, Kimi, and Perplexity providers)" +summary: "Web search + fetch tools (Brave, Firecrawl, Gemini, Grok, Kimi, Perplexity, and Tavily providers)" read_when: - You want to enable web_search or web_fetch - You need provider API key setup @@ -11,7 +11,7 @@ title: "Web Tools" OpenClaw ships two lightweight web tools: -- `web_search` — Search the web using Brave Search API, Firecrawl Search, Gemini with Google Search grounding, Grok, Kimi, or Perplexity Search API. 
+- `web_search` — Search the web using Brave Search API, Firecrawl Search, Gemini with Google Search grounding, Grok, Kimi, Perplexity Search API, or Tavily Search API. - `web_fetch` — HTTP fetch + readable extraction (HTML → markdown/text). These are **not** browser automation. For JS-heavy sites or logins, use the @@ -25,8 +25,9 @@ These are **not** browser automation. For JS-heavy sites or logins, use the (HTML → markdown/text). It does **not** execute JavaScript. - `web_fetch` is enabled by default (unless explicitly disabled). - The bundled Firecrawl plugin also adds `firecrawl_search` and `firecrawl_scrape` when enabled. +- The bundled Tavily plugin also adds `tavily_search` and `tavily_extract` when enabled. -See [Brave Search setup](/tools/brave-search) and [Perplexity Search setup](/tools/perplexity-search) for provider-specific details. +See [Brave Search setup](/tools/brave-search), [Perplexity Search setup](/tools/perplexity-search), and [Tavily Search setup](/tools/tavily) for provider-specific details. ## Choosing a search provider @@ -38,6 +39,7 @@ See [Brave Search setup](/tools/brave-search) and [Perplexity Search setup](/too | **Grok** | AI-synthesized answers + citations | — | Uses xAI web-grounded responses | `XAI_API_KEY` | | **Kimi** | AI-synthesized answers + citations | — | Uses Moonshot web search | `KIMI_API_KEY` / `MOONSHOT_API_KEY` | | **Perplexity Search API** | Structured results with snippets | `country`, `language`, time, `domain_filter` | Supports content extraction controls; OpenRouter uses Sonar compatibility path | `PERPLEXITY_API_KEY` / `OPENROUTER_API_KEY` | +| **Tavily Search API** | Structured results with snippets | Use `tavily_search` for Tavily-specific search options | Search depth, topic filtering, AI answers, URL extraction via `tavily_extract` | `TAVILY_API_KEY` | ### Auto-detection @@ -49,6 +51,7 @@ The table above is alphabetical. If no `provider` is explicitly set, runtime aut 4. 
**Kimi** — `KIMI_API_KEY` / `MOONSHOT_API_KEY` env var or `plugins.entries.moonshot.config.webSearch.apiKey` 5. **Perplexity** — `PERPLEXITY_API_KEY`, `OPENROUTER_API_KEY`, or `plugins.entries.perplexity.config.webSearch.apiKey` 6. **Firecrawl** — `FIRECRAWL_API_KEY` env var or `plugins.entries.firecrawl.config.webSearch.apiKey` +7. **Tavily** — `TAVILY_API_KEY` env var or `plugins.entries.tavily.config.webSearch.apiKey` If no keys are found, it falls back to Brave (you'll get a missing-key error prompting you to configure one). @@ -97,6 +100,7 @@ See [Perplexity Search API Docs](https://docs.perplexity.ai/guides/search-quicks - Grok: `plugins.entries.xai.config.webSearch.apiKey` - Kimi: `plugins.entries.moonshot.config.webSearch.apiKey` - Perplexity: `plugins.entries.perplexity.config.webSearch.apiKey` +- Tavily: `plugins.entries.tavily.config.webSearch.apiKey` All of these fields also support SecretRef objects. @@ -108,6 +112,7 @@ All of these fields also support SecretRef objects. - Grok: `XAI_API_KEY` - Kimi: `KIMI_API_KEY` or `MOONSHOT_API_KEY` - Perplexity: `PERPLEXITY_API_KEY` or `OPENROUTER_API_KEY` +- Tavily: `TAVILY_API_KEY` For a gateway install, put these in `~/.openclaw/.env` (or your service environment). See [Env vars](/help/faq#how-does-openclaw-load-environment-variables). @@ -176,6 +181,36 @@ For a gateway install, put these in `~/.openclaw/.env` (or your service environm When you choose Firecrawl in onboarding or `openclaw configure --section web`, OpenClaw enables the bundled Firecrawl plugin automatically so `web_search`, `firecrawl_search`, and `firecrawl_scrape` are all available. 
+**Tavily Search:** + +```json5 +{ + plugins: { + entries: { + tavily: { + enabled: true, + config: { + webSearch: { + apiKey: "tvly-...", // optional if TAVILY_API_KEY is set + baseUrl: "https://api.tavily.com", + }, + }, + }, + }, + }, + tools: { + web: { + search: { + enabled: true, + provider: "tavily", + }, + }, + }, +} +``` + +When you choose Tavily in onboarding or `openclaw configure --section web`, OpenClaw enables the bundled Tavily plugin automatically so `web_search`, `tavily_search`, and `tavily_extract` are all available. + **Brave LLM Context mode:** ```json5 @@ -326,6 +361,7 @@ Search the web using your configured provider. - **Grok**: `XAI_API_KEY` or `plugins.entries.xai.config.webSearch.apiKey` - **Kimi**: `KIMI_API_KEY`, `MOONSHOT_API_KEY`, or `plugins.entries.moonshot.config.webSearch.apiKey` - **Perplexity**: `PERPLEXITY_API_KEY`, `OPENROUTER_API_KEY`, or `plugins.entries.perplexity.config.webSearch.apiKey` + - **Tavily**: `TAVILY_API_KEY` or `plugins.entries.tavily.config.webSearch.apiKey` - All provider key fields above support SecretRef objects. ### Config @@ -369,6 +405,8 @@ If you set `plugins.entries.perplexity.config.webSearch.baseUrl` / `model`, use Firecrawl `web_search` supports `query` and `count`. For Firecrawl-specific controls like `sources`, `categories`, result scraping, or scrape timeout, use `firecrawl_search` from the bundled Firecrawl plugin. +Tavily `web_search` supports `query` and `count` (up to 20 results). For Tavily-specific controls like `search_depth`, `topic`, `include_answer`, or domain filters, use `tavily_search` from the bundled Tavily plugin. For URL content extraction, use `tavily_extract`. See [Tavily](/tools/tavily) for details. 
+ **Examples:** ```javascript diff --git a/extensions/brave/openclaw.plugin.json b/extensions/brave/openclaw.plugin.json index 2077f174d62..791a413ec66 100644 --- a/extensions/brave/openclaw.plugin.json +++ b/extensions/brave/openclaw.plugin.json @@ -1,5 +1,8 @@ { "id": "brave", + "providerAuthEnvVars": { + "brave": ["BRAVE_API_KEY"] + }, "uiHints": { "webSearch.apiKey": { "label": "Brave Search API Key", diff --git a/extensions/firecrawl/openclaw.plugin.json b/extensions/firecrawl/openclaw.plugin.json index e9c50c589d2..adbe2a2a9c8 100644 --- a/extensions/firecrawl/openclaw.plugin.json +++ b/extensions/firecrawl/openclaw.plugin.json @@ -1,5 +1,8 @@ { "id": "firecrawl", + "providerAuthEnvVars": { + "firecrawl": ["FIRECRAWL_API_KEY"] + }, "uiHints": { "webSearch.apiKey": { "label": "Firecrawl Search API Key", diff --git a/extensions/perplexity/openclaw.plugin.json b/extensions/perplexity/openclaw.plugin.json index 89c7a0fb902..32567c76cb2 100644 --- a/extensions/perplexity/openclaw.plugin.json +++ b/extensions/perplexity/openclaw.plugin.json @@ -1,5 +1,8 @@ { "id": "perplexity", + "providerAuthEnvVars": { + "perplexity": ["PERPLEXITY_API_KEY", "OPENROUTER_API_KEY"] + }, "uiHints": { "webSearch.apiKey": { "label": "Perplexity API Key", diff --git a/extensions/tavily/index.test.ts b/extensions/tavily/index.test.ts new file mode 100644 index 00000000000..5b71aeb6f7b --- /dev/null +++ b/extensions/tavily/index.test.ts @@ -0,0 +1,41 @@ +import { describe, expect, it } from "vitest"; +import plugin from "./index.js"; + +describe("tavily plugin", () => { + it("exports a valid plugin entry with correct id and name", () => { + expect(plugin.id).toBe("tavily"); + expect(plugin.name).toBe("Tavily Plugin"); + expect(typeof plugin.register).toBe("function"); + }); + + it("registers web search provider and two tools", () => { + const registrations: { + webSearchProviders: unknown[]; + tools: unknown[]; + } = { webSearchProviders: [], tools: [] }; + + const mockApi = { + 
registerWebSearchProvider(provider: unknown) { + registrations.webSearchProviders.push(provider); + }, + registerTool(tool: unknown) { + registrations.tools.push(tool); + }, + config: {}, + }; + + plugin.register(mockApi as never); + + expect(registrations.webSearchProviders).toHaveLength(1); + expect(registrations.tools).toHaveLength(2); + + const provider = registrations.webSearchProviders[0] as Record; + expect(provider.id).toBe("tavily"); + expect(provider.autoDetectOrder).toBe(70); + expect(provider.envVars).toEqual(["TAVILY_API_KEY"]); + + const toolNames = registrations.tools.map((t) => (t as Record).name); + expect(toolNames).toContain("tavily_search"); + expect(toolNames).toContain("tavily_extract"); + }); +}); diff --git a/extensions/tavily/index.ts b/extensions/tavily/index.ts new file mode 100644 index 00000000000..f35fda3129d --- /dev/null +++ b/extensions/tavily/index.ts @@ -0,0 +1,15 @@ +import { definePluginEntry, type AnyAgentTool } from "openclaw/plugin-sdk/core"; +import { createTavilyExtractTool } from "./src/tavily-extract-tool.js"; +import { createTavilyWebSearchProvider } from "./src/tavily-search-provider.js"; +import { createTavilySearchTool } from "./src/tavily-search-tool.js"; + +export default definePluginEntry({ + id: "tavily", + name: "Tavily Plugin", + description: "Bundled Tavily search and extract plugin", + register(api) { + api.registerWebSearchProvider(createTavilyWebSearchProvider()); + api.registerTool(createTavilySearchTool(api) as AnyAgentTool); + api.registerTool(createTavilyExtractTool(api) as AnyAgentTool); + }, +}); diff --git a/extensions/tavily/openclaw.plugin.json b/extensions/tavily/openclaw.plugin.json new file mode 100644 index 00000000000..9ed930bfe63 --- /dev/null +++ b/extensions/tavily/openclaw.plugin.json @@ -0,0 +1,37 @@ +{ + "id": "tavily", + "skills": ["./skills"], + "providerAuthEnvVars": { + "tavily": ["TAVILY_API_KEY"] + }, + "uiHints": { + "webSearch.apiKey": { + "label": "Tavily API Key", + "help": 
"Tavily API key for web search and extraction (fallback: TAVILY_API_KEY env var).", + "sensitive": true, + "placeholder": "tvly-..." + }, + "webSearch.baseUrl": { + "label": "Tavily Base URL", + "help": "Tavily API base URL override." + } + }, + "configSchema": { + "type": "object", + "additionalProperties": false, + "properties": { + "webSearch": { + "type": "object", + "additionalProperties": false, + "properties": { + "apiKey": { + "type": ["string", "object"] + }, + "baseUrl": { + "type": "string" + } + } + } + } + } +} diff --git a/extensions/tavily/package.json b/extensions/tavily/package.json new file mode 100644 index 00000000000..3d693a6ca38 --- /dev/null +++ b/extensions/tavily/package.json @@ -0,0 +1,12 @@ +{ + "name": "@openclaw/tavily-plugin", + "version": "2026.3.17", + "private": true, + "description": "OpenClaw Tavily plugin", + "type": "module", + "openclaw": { + "extensions": [ + "./index.ts" + ] + } +} diff --git a/extensions/tavily/skills/tavily/SKILL.md b/extensions/tavily/skills/tavily/SKILL.md new file mode 100644 index 00000000000..4026537362a --- /dev/null +++ b/extensions/tavily/skills/tavily/SKILL.md @@ -0,0 +1,94 @@ +--- +name: tavily +description: Tavily web search, content extraction, and research tools. +metadata: + { "openclaw": { "emoji": "🔍", "requires": { "config": ["plugins.entries.tavily.enabled"] } } } +--- + +# Tavily Tools + +## When to use which tool + +| Need | Tool | When | +| ---------------------------- | ---------------- | ------------------------------------------------------------- | +| Quick web search | `web_search` | Basic queries, no special options needed | +| Search with advanced options | `tavily_search` | Need depth, topic, domain filters, time ranges, or AI answers | +| Extract content from URLs | `tavily_extract` | Have specific URLs, need their content | + +## web_search + +Tavily powers this automatically when selected as the search provider. 
Use for +straightforward queries where you don't need Tavily-specific options. + +| Parameter | Description | +| --------- | ------------------------ | +| `query` | Search query string | +| `count` | Number of results (1-20) | + +## tavily_search + +Use when you need fine-grained control over search behavior. + +| Parameter | Description | +| ----------------- | --------------------------------------------------------------------- | +| `query` | Search query string (keep under 400 characters) | +| `search_depth` | `basic` (default, balanced) or `advanced` (highest relevance, slower) | +| `topic` | `general` (default), `news` (real-time updates), or `finance` | +| `max_results` | Number of results, 1-20 (default: 5) | +| `include_answer` | Include an AI-generated answer summary (default: false) | +| `time_range` | Filter by recency: `day`, `week`, `month`, or `year` | +| `include_domains` | Array of domains to restrict results to | +| `exclude_domains` | Array of domains to exclude from results | + +### Search depth + +| Depth | Speed | Relevance | Best for | +| ---------- | ------ | --------- | -------------------------------------------- | +| `basic` | Faster | High | General-purpose queries (default) | +| `advanced` | Slower | Highest | Precision, specific facts, detailed research | + +### Tips + +- **Keep queries under 400 characters** — think search query, not prompt. +- **Break complex queries into sub-queries** for better results. +- **Use `include_domains`** to focus on trusted sources. +- **Use `time_range`** for recent information (news, current events). +- **Use `include_answer`** when you need a quick synthesized answer. + +## tavily_extract + +Use when you have specific URLs and need their content. Handles JavaScript-rendered +pages and returns clean markdown. Supports query-focused chunking for targeted +extraction. 
+ +| Parameter | Description | +| ------------------- | ------------------------------------------------------------------ | +| `urls` | Array of URLs to extract (1-20 per request) | +| `query` | Rerank extracted chunks by relevance to this query | +| `extract_depth` | `basic` (default, fast) or `advanced` (for JS-heavy pages, tables) | +| `chunks_per_source` | Chunks per URL, 1-5 (requires `query`) | +| `include_images` | Include image URLs in results (default: false) | + +### Extract depth + +| Depth | When to use | +| ---------- | ----------------------------------------------------------- | +| `basic` | Simple pages — try this first | +| `advanced` | JS-rendered SPAs, dynamic content, tables, embedded content | + +### Tips + +- **Max 20 URLs per request** — batch larger lists into multiple calls. +- **Use `query` + `chunks_per_source`** to get only relevant content instead of full pages. +- **Try `basic` first**, fall back to `advanced` if content is missing or incomplete. +- If `tavily_search` results already contain the snippets you need, skip the extract step. + +## Choosing the right workflow + +Follow this escalation pattern — start simple, escalate only when needed: + +1. **`web_search`** — Quick lookup, no special options needed. +2. **`tavily_search`** — Need depth control, topic filtering, domain filters, time ranges, or AI answers. +3. **`tavily_extract`** — Have specific URLs, need their full content or targeted chunks. + +Combine search + extract when you need to find pages first, then get their full content. 
diff --git a/extensions/tavily/src/config.ts b/extensions/tavily/src/config.ts new file mode 100644 index 00000000000..752a721d17c --- /dev/null +++ b/extensions/tavily/src/config.ts @@ -0,0 +1,71 @@ +import type { OpenClawConfig } from "openclaw/plugin-sdk/config-runtime"; +import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/config-runtime"; +import { normalizeSecretInput } from "openclaw/plugin-sdk/provider-auth"; + +export const DEFAULT_TAVILY_BASE_URL = "https://api.tavily.com"; +export const DEFAULT_TAVILY_SEARCH_TIMEOUT_SECONDS = 30; +export const DEFAULT_TAVILY_EXTRACT_TIMEOUT_SECONDS = 60; + +type TavilySearchConfig = + | { + apiKey?: unknown; + baseUrl?: string; + } + | undefined; + +type PluginEntryConfig = { + webSearch?: { + apiKey?: unknown; + baseUrl?: string; + }; +}; + +export function resolveTavilySearchConfig(cfg?: OpenClawConfig): TavilySearchConfig { + const pluginConfig = cfg?.plugins?.entries?.tavily?.config as PluginEntryConfig; + const pluginWebSearch = pluginConfig?.webSearch; + if (pluginWebSearch && typeof pluginWebSearch === "object" && !Array.isArray(pluginWebSearch)) { + return pluginWebSearch; + } + return undefined; +} + +function normalizeConfiguredSecret(value: unknown, path: string): string | undefined { + return normalizeSecretInput( + normalizeResolvedSecretInputString({ + value, + path, + }), + ); +} + +export function resolveTavilyApiKey(cfg?: OpenClawConfig): string | undefined { + const search = resolveTavilySearchConfig(cfg); + return ( + normalizeConfiguredSecret(search?.apiKey, "plugins.entries.tavily.config.webSearch.apiKey") || + normalizeSecretInput(process.env.TAVILY_API_KEY) || + undefined + ); +} + +export function resolveTavilyBaseUrl(cfg?: OpenClawConfig): string { + const search = resolveTavilySearchConfig(cfg); + const configured = + (typeof search?.baseUrl === "string" ? 
search.baseUrl.trim() : "") || + normalizeSecretInput(process.env.TAVILY_BASE_URL) || + ""; + return configured || DEFAULT_TAVILY_BASE_URL; +} + +export function resolveTavilySearchTimeoutSeconds(override?: number): number { + if (typeof override === "number" && Number.isFinite(override) && override > 0) { + return Math.floor(override); + } + return DEFAULT_TAVILY_SEARCH_TIMEOUT_SECONDS; +} + +export function resolveTavilyExtractTimeoutSeconds(override?: number): number { + if (typeof override === "number" && Number.isFinite(override) && override > 0) { + return Math.floor(override); + } + return DEFAULT_TAVILY_EXTRACT_TIMEOUT_SECONDS; +} diff --git a/extensions/tavily/src/tavily-client.ts b/extensions/tavily/src/tavily-client.ts new file mode 100644 index 00000000000..8308f8b8772 --- /dev/null +++ b/extensions/tavily/src/tavily-client.ts @@ -0,0 +1,286 @@ +import type { OpenClawConfig } from "openclaw/plugin-sdk/config-runtime"; +import { withTrustedWebToolsEndpoint } from "openclaw/plugin-sdk/provider-web-search"; +import { + DEFAULT_CACHE_TTL_MINUTES, + normalizeCacheKey, + readCache, + readResponseText, + resolveCacheTtlMs, + writeCache, +} from "openclaw/plugin-sdk/provider-web-search"; +import { wrapExternalContent, wrapWebContent } from "openclaw/plugin-sdk/security-runtime"; +import { + DEFAULT_TAVILY_BASE_URL, + resolveTavilyApiKey, + resolveTavilyBaseUrl, + resolveTavilyExtractTimeoutSeconds, + resolveTavilySearchTimeoutSeconds, +} from "./config.js"; + +const SEARCH_CACHE = new Map< + string, + { value: Record; expiresAt: number; insertedAt: number } +>(); +const EXTRACT_CACHE = new Map< + string, + { value: Record; expiresAt: number; insertedAt: number } +>(); +const DEFAULT_SEARCH_COUNT = 5; +const DEFAULT_ERROR_MAX_BYTES = 64_000; + +export type TavilySearchParams = { + cfg?: OpenClawConfig; + query: string; + searchDepth?: string; + topic?: string; + maxResults?: number; + includeAnswer?: boolean; + timeRange?: string; + includeDomains?: string[]; + 
excludeDomains?: string[]; + timeoutSeconds?: number; +}; + +export type TavilyExtractParams = { + cfg?: OpenClawConfig; + urls: string[]; + query?: string; + extractDepth?: string; + chunksPerSource?: number; + includeImages?: boolean; + timeoutSeconds?: number; +}; + +function resolveEndpoint(baseUrl: string, pathname: string): string { + const trimmed = baseUrl.trim(); + if (!trimmed) { + return `${DEFAULT_TAVILY_BASE_URL}${pathname}`; + } + try { + const url = new URL(trimmed); + // Always append the endpoint pathname to the base URL path, + // supporting both bare hosts and reverse-proxy path prefixes. + url.pathname = url.pathname.replace(/\/$/, "") + pathname; + return url.toString(); + } catch { + return `${DEFAULT_TAVILY_BASE_URL}${pathname}`; + } +} + +async function postTavilyJson(params: { + baseUrl: string; + pathname: string; + apiKey: string; + body: Record; + timeoutSeconds: number; + errorLabel: string; +}): Promise> { + const endpoint = resolveEndpoint(params.baseUrl, params.pathname); + return await withTrustedWebToolsEndpoint( + { + url: endpoint, + timeoutSeconds: params.timeoutSeconds, + init: { + method: "POST", + headers: { + Accept: "application/json", + Authorization: `Bearer ${params.apiKey}`, + "Content-Type": "application/json", + }, + body: JSON.stringify(params.body), + }, + }, + async ({ response }) => { + if (!response.ok) { + const detail = await readResponseText(response, { maxBytes: DEFAULT_ERROR_MAX_BYTES }); + throw new Error( + `${params.errorLabel} API error (${response.status}): ${detail.text || response.statusText}`, + ); + } + return (await response.json()) as Record; + }, + ); +} + +export async function runTavilySearch( + params: TavilySearchParams, +): Promise> { + const apiKey = resolveTavilyApiKey(params.cfg); + if (!apiKey) { + throw new Error( + "web_search (tavily) needs a Tavily API key. 
Set TAVILY_API_KEY in the Gateway environment, or configure plugins.entries.tavily.config.webSearch.apiKey.", + ); + } + const count = + typeof params.maxResults === "number" && Number.isFinite(params.maxResults) + ? Math.max(1, Math.min(20, Math.floor(params.maxResults))) + : DEFAULT_SEARCH_COUNT; + const timeoutSeconds = resolveTavilySearchTimeoutSeconds(params.timeoutSeconds); + const baseUrl = resolveTavilyBaseUrl(params.cfg); + + const cacheKey = normalizeCacheKey( + JSON.stringify({ + type: "tavily-search", + q: params.query, + count, + baseUrl, + searchDepth: params.searchDepth, + topic: params.topic, + includeAnswer: params.includeAnswer, + timeRange: params.timeRange, + includeDomains: params.includeDomains, + excludeDomains: params.excludeDomains, + }), + ); + const cached = readCache(SEARCH_CACHE, cacheKey); + if (cached) { + return { ...cached.value, cached: true }; + } + + const body: Record = { + query: params.query, + max_results: count, + }; + if (params.searchDepth) body.search_depth = params.searchDepth; + if (params.topic) body.topic = params.topic; + if (params.includeAnswer) body.include_answer = true; + if (params.timeRange) body.time_range = params.timeRange; + if (params.includeDomains?.length) body.include_domains = params.includeDomains; + if (params.excludeDomains?.length) body.exclude_domains = params.excludeDomains; + + const start = Date.now(); + const payload = await postTavilyJson({ + baseUrl, + pathname: "/search", + apiKey, + body, + timeoutSeconds, + errorLabel: "Tavily Search", + }); + + const rawResults = Array.isArray(payload.results) ? payload.results : []; + const results = rawResults.map((r: Record) => ({ + title: typeof r.title === "string" ? wrapWebContent(r.title, "web_search") : "", + url: typeof r.url === "string" ? r.url : "", + snippet: typeof r.content === "string" ? wrapWebContent(r.content, "web_search") : "", + score: typeof r.score === "number" ? r.score : undefined, + ...(typeof r.published_date === "string" ? 
{ published: r.published_date } : {}), + })); + + const result: Record = { + query: params.query, + provider: "tavily", + count: results.length, + tookMs: Date.now() - start, + externalContent: { + untrusted: true, + source: "web_search", + provider: "tavily", + wrapped: true, + }, + results, + }; + if (typeof payload.answer === "string" && payload.answer) { + result.answer = wrapWebContent(payload.answer, "web_search"); + } + + writeCache( + SEARCH_CACHE, + cacheKey, + result, + resolveCacheTtlMs(undefined, DEFAULT_CACHE_TTL_MINUTES), + ); + return result; +} + +export async function runTavilyExtract( + params: TavilyExtractParams, +): Promise> { + const apiKey = resolveTavilyApiKey(params.cfg); + if (!apiKey) { + throw new Error( + "tavily_extract needs a Tavily API key. Set TAVILY_API_KEY in the Gateway environment, or configure plugins.entries.tavily.config.webSearch.apiKey.", + ); + } + const baseUrl = resolveTavilyBaseUrl(params.cfg); + const timeoutSeconds = resolveTavilyExtractTimeoutSeconds(params.timeoutSeconds); + + const cacheKey = normalizeCacheKey( + JSON.stringify({ + type: "tavily-extract", + urls: params.urls, + baseUrl, + query: params.query, + extractDepth: params.extractDepth, + chunksPerSource: params.chunksPerSource, + includeImages: params.includeImages, + }), + ); + const cached = readCache(EXTRACT_CACHE, cacheKey); + if (cached) { + return { ...cached.value, cached: true }; + } + + const body: Record = { urls: params.urls }; + if (params.query) body.query = params.query; + if (params.extractDepth) body.extract_depth = params.extractDepth; + if (params.chunksPerSource) body.chunks_per_source = params.chunksPerSource; + if (params.includeImages) body.include_images = true; + + const start = Date.now(); + const payload = await postTavilyJson({ + baseUrl, + pathname: "/extract", + apiKey, + body, + timeoutSeconds, + errorLabel: "Tavily Extract", + }); + + const rawResults = Array.isArray(payload.results) ? 
payload.results : []; + const results = rawResults.map((r: Record) => ({ + url: typeof r.url === "string" ? r.url : "", + rawContent: + typeof r.raw_content === "string" + ? wrapExternalContent(r.raw_content, { source: "web_fetch", includeWarning: false }) + : "", + ...(typeof r.content === "string" + ? { content: wrapExternalContent(r.content, { source: "web_fetch", includeWarning: false }) } + : {}), + ...(Array.isArray(r.images) + ? { + images: (r.images as string[]).map((img) => + wrapExternalContent(String(img), { source: "web_fetch", includeWarning: false }), + ), + } + : {}), + })); + + const failedResults = Array.isArray(payload.failed_results) ? payload.failed_results : []; + + const result: Record = { + provider: "tavily", + count: results.length, + tookMs: Date.now() - start, + externalContent: { + untrusted: true, + source: "web_fetch", + provider: "tavily", + wrapped: true, + }, + results, + ...(failedResults.length > 0 ? { failedResults } : {}), + }; + + writeCache( + EXTRACT_CACHE, + cacheKey, + result, + resolveCacheTtlMs(undefined, DEFAULT_CACHE_TTL_MINUTES), + ); + return result; +} + +export const __testing = { + postTavilyJson, +}; diff --git a/extensions/tavily/src/tavily-extract-tool.test.ts b/extensions/tavily/src/tavily-extract-tool.test.ts new file mode 100644 index 00000000000..f571e196d0b --- /dev/null +++ b/extensions/tavily/src/tavily-extract-tool.test.ts @@ -0,0 +1,53 @@ +import type { OpenClawPluginApi } from "openclaw/plugin-sdk/plugin-runtime"; +import { beforeEach, describe, expect, it, vi } from "vitest"; + +vi.mock("./tavily-client.js", () => ({ + runTavilyExtract: vi.fn(async (params: unknown) => ({ ok: true, params })), +})); + +import { runTavilyExtract } from "./tavily-client.js"; +import { createTavilyExtractTool } from "./tavily-extract-tool.js"; + +function fakeApi(): OpenClawPluginApi { + return { + config: {}, + } as OpenClawPluginApi; +} + +describe("tavily_extract", () => { + beforeEach(() => { + vi.clearAllMocks(); + 
}); + + it("rejects chunks_per_source without query", async () => { + const tool = createTavilyExtractTool(fakeApi()); + + await expect( + tool.execute("id", { + urls: ["https://example.com"], + chunks_per_source: 2, + }), + ).rejects.toThrow("tavily_extract requires query when chunks_per_source is set."); + + expect(runTavilyExtract).not.toHaveBeenCalled(); + }); + + it("forwards query-scoped chunking when query is provided", async () => { + const tool = createTavilyExtractTool(fakeApi()); + + await tool.execute("id", { + urls: ["https://example.com"], + query: "pricing", + chunks_per_source: 2, + }); + + expect(runTavilyExtract).toHaveBeenCalledWith( + expect.objectContaining({ + cfg: {}, + urls: ["https://example.com"], + query: "pricing", + chunksPerSource: 2, + }), + ); + }); +}); diff --git a/extensions/tavily/src/tavily-extract-tool.ts b/extensions/tavily/src/tavily-extract-tool.ts new file mode 100644 index 00000000000..1a3c381fc64 --- /dev/null +++ b/extensions/tavily/src/tavily-extract-tool.ts @@ -0,0 +1,74 @@ +import { Type } from "@sinclair/typebox"; +import { optionalStringEnum } from "openclaw/plugin-sdk/agent-runtime"; +import { jsonResult, readNumberParam, readStringParam } from "openclaw/plugin-sdk/agent-runtime"; +import type { OpenClawPluginApi } from "openclaw/plugin-sdk/plugin-runtime"; +import { runTavilyExtract } from "./tavily-client.js"; + +const TavilyExtractToolSchema = Type.Object( + { + urls: Type.Array(Type.String(), { + description: "One or more URLs to extract content from (max 20).", + minItems: 1, + maxItems: 20, + }), + query: Type.Optional( + Type.String({ + description: "Rerank extracted chunks by relevance to this query.", + }), + ), + extract_depth: optionalStringEnum(["basic", "advanced"] as const, { + description: '"basic" (default) or "advanced" (for JS-heavy pages).', + }), + chunks_per_source: Type.Optional( + Type.Number({ + description: "Chunks per URL (1-5, requires query).", + minimum: 1, + maximum: 5, + }), + ), + 
include_images: Type.Optional( + Type.Boolean({ + description: "Include image URLs in extraction results.", + }), + ), + }, + { additionalProperties: false }, +); + +export function createTavilyExtractTool(api: OpenClawPluginApi) { + return { + name: "tavily_extract", + label: "Tavily Extract", + description: + "Extract clean content from one or more URLs using Tavily. Handles JS-rendered pages. Supports query-focused chunking.", + parameters: TavilyExtractToolSchema, + execute: async (_toolCallId: string, rawParams: Record<string, unknown>) => { + const urls = Array.isArray(rawParams.urls) + ? (rawParams.urls as string[]).filter(Boolean) + : []; + if (urls.length === 0) { + throw new Error("tavily_extract requires at least one URL."); + } + const query = readStringParam(rawParams, "query") || undefined; + const extractDepth = readStringParam(rawParams, "extract_depth") || undefined; + const chunksPerSource = readNumberParam(rawParams, "chunks_per_source", { + integer: true, + }); + if (chunksPerSource !== undefined && !query) { + throw new Error("tavily_extract requires query when chunks_per_source is set."); + } + const includeImages = rawParams.include_images === true; + + return jsonResult( + await runTavilyExtract({ + cfg: api.config, + urls, + query, + extractDepth, + chunksPerSource, + includeImages, + }), + ); + }, + }; +} diff --git a/extensions/tavily/src/tavily-search-provider.ts b/extensions/tavily/src/tavily-search-provider.ts new file mode 100644 index 00000000000..2ad33362353 --- /dev/null +++ b/extensions/tavily/src/tavily-search-provider.ts @@ -0,0 +1,76 @@ +import { Type } from "@sinclair/typebox"; +import { + enablePluginInConfig, + resolveProviderWebSearchPluginConfig, + setProviderWebSearchPluginConfigValue, + type WebSearchProviderPlugin, +} from "openclaw/plugin-sdk/provider-web-search"; +import { runTavilySearch } from "./tavily-client.js"; + +const GenericTavilySearchSchema = Type.Object( + { + query: Type.String({ description: "Search query string." 
}), + count: Type.Optional( + Type.Number({ + description: "Number of results to return (1-20).", + minimum: 1, + maximum: 20, + }), + ), + }, + { additionalProperties: false }, +); + +function getScopedCredentialValue(searchConfig?: Record<string, unknown>): unknown { + const scoped = searchConfig?.tavily; + if (!scoped || typeof scoped !== "object" || Array.isArray(scoped)) { + return undefined; + } + return (scoped as Record<string, unknown>).apiKey; +} + +function setScopedCredentialValue( + searchConfigTarget: Record<string, unknown>, + value: unknown, +): void { + const scoped = searchConfigTarget.tavily; + if (!scoped || typeof scoped !== "object" || Array.isArray(scoped)) { + searchConfigTarget.tavily = { apiKey: value }; + return; + } + (scoped as Record<string, unknown>).apiKey = value; +} + +export function createTavilyWebSearchProvider(): WebSearchProviderPlugin { + return { + id: "tavily", + label: "Tavily Search", + hint: "Structured results with domain filters and AI answer summaries", + envVars: ["TAVILY_API_KEY"], + placeholder: "tvly-...", + signupUrl: "https://tavily.com/", + docsUrl: "https://docs.openclaw.ai/tools/tavily", + autoDetectOrder: 70, + credentialPath: "plugins.entries.tavily.config.webSearch.apiKey", + inactiveSecretPaths: ["plugins.entries.tavily.config.webSearch.apiKey"], + getCredentialValue: getScopedCredentialValue, + setCredentialValue: setScopedCredentialValue, + getConfiguredCredentialValue: (config) => + resolveProviderWebSearchPluginConfig(config, "tavily")?.apiKey, + setConfiguredCredentialValue: (configTarget, value) => { + setProviderWebSearchPluginConfigValue(configTarget, "tavily", "apiKey", value); + }, + applySelectionConfig: (config) => enablePluginInConfig(config, "tavily").config, + createTool: (ctx) => ({ + description: + "Search the web using Tavily. Returns structured results with snippets. 
Use tavily_search for Tavily-specific options like search depth, topic filtering, or AI answers.", + parameters: GenericTavilySearchSchema, + execute: async (args) => + await runTavilySearch({ + cfg: ctx.config, + query: typeof args.query === "string" ? args.query : "", + maxResults: typeof args.count === "number" ? args.count : undefined, + }), + }), + }; +} diff --git a/extensions/tavily/src/tavily-search-tool.ts b/extensions/tavily/src/tavily-search-tool.ts new file mode 100644 index 00000000000..1d925973fe0 --- /dev/null +++ b/extensions/tavily/src/tavily-search-tool.ts @@ -0,0 +1,81 @@ +import { Type } from "@sinclair/typebox"; +import { optionalStringEnum } from "openclaw/plugin-sdk/agent-runtime"; +import { jsonResult, readNumberParam, readStringParam } from "openclaw/plugin-sdk/agent-runtime"; +import type { OpenClawPluginApi } from "openclaw/plugin-sdk/plugin-runtime"; +import { runTavilySearch } from "./tavily-client.js"; + +const TavilySearchToolSchema = Type.Object( + { + query: Type.String({ description: "Search query string." 
}), + search_depth: optionalStringEnum(["basic", "advanced"] as const, { + description: 'Search depth: "basic" (default, faster) or "advanced" (more thorough).', + }), + topic: optionalStringEnum(["general", "news", "finance"] as const, { + description: 'Search topic: "general" (default), "news", or "finance".', + }), + max_results: Type.Optional( + Type.Number({ + description: "Number of results to return (1-20).", + minimum: 1, + maximum: 20, + }), + ), + include_answer: Type.Optional( + Type.Boolean({ + description: "Include an AI-generated answer summary (default: false).", + }), + ), + time_range: optionalStringEnum(["day", "week", "month", "year"] as const, { + description: "Filter results by recency: 'day', 'week', 'month', or 'year'.", + }), + include_domains: Type.Optional( + Type.Array(Type.String(), { + description: "Only include results from these domains.", + }), + ), + exclude_domains: Type.Optional( + Type.Array(Type.String(), { + description: "Exclude results from these domains.", + }), + ), + }, + { additionalProperties: false }, +); + +export function createTavilySearchTool(api: OpenClawPluginApi) { + return { + name: "tavily_search", + label: "Tavily Search", + description: + "Search the web using Tavily Search API. Supports search depth, topic filtering, domain filters, time ranges, and AI answer summaries.", + parameters: TavilySearchToolSchema, + execute: async (_toolCallId: string, rawParams: Record<string, unknown>) => { + const query = readStringParam(rawParams, "query", { required: true }); + const searchDepth = readStringParam(rawParams, "search_depth") || undefined; + const topic = readStringParam(rawParams, "topic") || undefined; + const maxResults = readNumberParam(rawParams, "max_results", { integer: true }); + const includeAnswer = rawParams.include_answer === true; + const timeRange = readStringParam(rawParams, "time_range") || undefined; + const includeDomains = Array.isArray(rawParams.include_domains) + ? 
(rawParams.include_domains as string[]).filter(Boolean) + : undefined; + const excludeDomains = Array.isArray(rawParams.exclude_domains) + ? (rawParams.exclude_domains as string[]).filter(Boolean) + : undefined; + + return jsonResult( + await runTavilySearch({ + cfg: api.config, + query, + searchDepth, + topic, + maxResults, + includeAnswer, + timeRange, + includeDomains: includeDomains?.length ? includeDomains : undefined, + excludeDomains: excludeDomains?.length ? excludeDomains : undefined, + }), + ); + }, + }; +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index f0d503f2346..f821a4aa3c4 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -519,6 +519,8 @@ importers: extensions/synthetic: {} + extensions/tavily: {} + extensions/telegram: dependencies: '@grammyjs/runner': diff --git a/src/agents/tools/web-search.ts b/src/agents/tools/web-search.ts index 151cfc4e6c4..11955d4a9b0 100644 --- a/src/agents/tools/web-search.ts +++ b/src/agents/tools/web-search.ts @@ -1,123 +1,35 @@ import type { OpenClawConfig } from "../../config/config.js"; -import { normalizeResolvedSecretInputString } from "../../config/types.secrets.js"; -import { logVerbose } from "../../globals.js"; -import type { PluginWebSearchProviderEntry } from "../../plugins/types.js"; -import { resolvePluginWebSearchProviders } from "../../plugins/web-search-providers.js"; import type { RuntimeWebSearchMetadata } from "../../secrets/runtime-web-tools.types.js"; -import { normalizeSecretInput } from "../../utils/normalize-secret-input.js"; +import { + resolveWebSearchDefinition, + resolveWebSearchProviderId, +} from "../../web-search/runtime.js"; import type { AnyAgentTool } from "./common.js"; import { jsonResult } from "./common.js"; import { SEARCH_CACHE } from "./web-search-provider-common.js"; -import { - resolveSearchConfig, - resolveSearchEnabled, - type WebSearchConfig, -} from "./web-search-provider-config.js"; - -function readProviderEnvValue(envVars: string[]): string | undefined { - for (const 
envVar of envVars) { - const value = normalizeSecretInput(process.env[envVar]); - if (value) { - return value; - } - } - return undefined; -} - -function hasProviderCredential( - provider: PluginWebSearchProviderEntry, - search: WebSearchConfig | undefined, -): boolean { - const rawValue = provider.getCredentialValue(search as Record | undefined); - const fromConfig = normalizeSecretInput( - normalizeResolvedSecretInputString({ - value: rawValue, - path: provider.credentialPath, - }), - ); - return Boolean(fromConfig || readProviderEnvValue(provider.envVars)); -} - -function resolveSearchProvider(search?: WebSearchConfig): string { - const providers = resolvePluginWebSearchProviders({ - bundledAllowlistCompat: true, - }); - const raw = - search && "provider" in search && typeof search.provider === "string" - ? search.provider.trim().toLowerCase() - : ""; - - if (raw) { - const explicit = providers.find((provider) => provider.id === raw); - if (explicit) { - return explicit.id; - } - } - - if (!raw) { - for (const provider of providers) { - if (!hasProviderCredential(provider, search)) { - continue; - } - logVerbose( - `web_search: no provider configured, auto-detected "${provider.id}" from available API keys`, - ); - return provider.id; - } - } - - return providers[0]?.id ?? ""; -} export function createWebSearchTool(options?: { config?: OpenClawConfig; sandboxed?: boolean; runtimeWebSearch?: RuntimeWebSearchMetadata; }): AnyAgentTool | null { - const search = resolveSearchConfig(options?.config); - if (!resolveSearchEnabled({ search, sandboxed: options?.sandboxed })) { - return null; - } - - const providers = resolvePluginWebSearchProviders({ - config: options?.config, - bundledAllowlistCompat: true, - }); - if (providers.length === 0) { - return null; - } - - const providerId = - options?.runtimeWebSearch?.selectedProvider ?? - options?.runtimeWebSearch?.providerConfigured ?? 
- resolveSearchProvider(search); - const provider = - providers.find((entry) => entry.id === providerId) ?? - providers.find((entry) => entry.id === resolveSearchProvider(search)) ?? - providers[0]; - if (!provider) { - return null; - } - - const definition = provider.createTool({ - config: options?.config, - searchConfig: search as Record | undefined, - runtimeMetadata: options?.runtimeWebSearch, - }); - if (!definition) { + const resolved = resolveWebSearchDefinition(options); + if (!resolved) { return null; } return { label: "Web Search", name: "web_search", - description: definition.description, - parameters: definition.parameters, - execute: async (_toolCallId, args) => jsonResult(await definition.execute(args)), + description: resolved.definition.description, + parameters: resolved.definition.parameters, + execute: async (_toolCallId, args) => jsonResult(await resolved.definition.execute(args)), }; } export const __testing = { SEARCH_CACHE, - resolveSearchProvider, + resolveSearchProvider: ( + search?: NonNullable["web"]>["search"], + ) => resolveWebSearchProviderId({ search }), }; diff --git a/src/commands/onboard-search.test.ts b/src/commands/onboard-search.test.ts index 00bfd6382a6..c15fdefcf72 100644 --- a/src/commands/onboard-search.test.ts +++ b/src/commands/onboard-search.test.ts @@ -48,6 +48,15 @@ function createPerplexityConfig(apiKey: string, enabled?: boolean): OpenClawConf }; } +function pluginWebSearchApiKey(config: OpenClawConfig, pluginId: string): unknown { + const entry = ( + config.plugins?.entries as + | Record + | undefined + )?.[pluginId]; + return entry?.config?.webSearch?.apiKey; +} + async function runBlankPerplexityKeyEntry( apiKey: string, enabled?: boolean, @@ -88,8 +97,9 @@ describe("setupSearch", () => { }); const result = await setupSearch(cfg, runtime, prompter); expect(result.tools?.web?.search?.provider).toBe("perplexity"); - expect(result.tools?.web?.search?.perplexity?.apiKey).toBe("pplx-test-key"); + 
expect(pluginWebSearchApiKey(result, "perplexity")).toBe("pplx-test-key"); expect(result.tools?.web?.search?.enabled).toBe(true); + expect(result.plugins?.entries?.perplexity?.enabled).toBe(true); }); it("sets provider and key for brave", async () => { @@ -101,7 +111,8 @@ describe("setupSearch", () => { const result = await setupSearch(cfg, runtime, prompter); expect(result.tools?.web?.search?.provider).toBe("brave"); expect(result.tools?.web?.search?.enabled).toBe(true); - expect(result.tools?.web?.search?.apiKey).toBe("BSA-test-key"); + expect(pluginWebSearchApiKey(result, "brave")).toBe("BSA-test-key"); + expect(result.plugins?.entries?.brave?.enabled).toBe(true); }); it("sets provider and key for gemini", async () => { @@ -113,7 +124,8 @@ describe("setupSearch", () => { const result = await setupSearch(cfg, runtime, prompter); expect(result.tools?.web?.search?.provider).toBe("gemini"); expect(result.tools?.web?.search?.enabled).toBe(true); - expect(result.tools?.web?.search?.gemini?.apiKey).toBe("AIza-test"); + expect(pluginWebSearchApiKey(result, "google")).toBe("AIza-test"); + expect(result.plugins?.entries?.google?.enabled).toBe(true); }); it("sets provider and key for firecrawl and enables the plugin", async () => { @@ -125,7 +137,7 @@ describe("setupSearch", () => { const result = await setupSearch(cfg, runtime, prompter); expect(result.tools?.web?.search?.provider).toBe("firecrawl"); expect(result.tools?.web?.search?.enabled).toBe(true); - expect(result.tools?.web?.search?.firecrawl?.apiKey).toBe("fc-test-key"); + expect(pluginWebSearchApiKey(result, "firecrawl")).toBe("fc-test-key"); expect(result.plugins?.entries?.firecrawl?.enabled).toBe(true); }); @@ -150,7 +162,21 @@ describe("setupSearch", () => { const result = await setupSearch(cfg, runtime, prompter); expect(result.tools?.web?.search?.provider).toBe("kimi"); expect(result.tools?.web?.search?.enabled).toBe(true); - expect(result.tools?.web?.search?.kimi?.apiKey).toBe("sk-moonshot"); + 
expect(pluginWebSearchApiKey(result, "moonshot")).toBe("sk-moonshot"); + expect(result.plugins?.entries?.moonshot?.enabled).toBe(true); + }); + + it("sets provider and key for tavily and enables the plugin", async () => { + const cfg: OpenClawConfig = {}; + const { prompter } = createPrompter({ + selectValue: "tavily", + textValue: "tvly-test-key", + }); + const result = await setupSearch(cfg, runtime, prompter); + expect(result.tools?.web?.search?.provider).toBe("tavily"); + expect(result.tools?.web?.search?.enabled).toBe(true); + expect(pluginWebSearchApiKey(result, "tavily")).toBe("tvly-test-key"); + expect(result.plugins?.entries?.tavily?.enabled).toBe(true); }); it("shows missing-key note when no key is provided and no env var", async () => { @@ -198,7 +224,7 @@ describe("setupSearch", () => { "stored-pplx-key", // pragma: allowlist secret ); expect(result.tools?.web?.search?.provider).toBe("perplexity"); - expect(result.tools?.web?.search?.perplexity?.apiKey).toBe("stored-pplx-key"); + expect(pluginWebSearchApiKey(result, "perplexity")).toBe("stored-pplx-key"); expect(result.tools?.web?.search?.enabled).toBe(true); expect(prompter.text).not.toHaveBeenCalled(); }); @@ -209,11 +235,43 @@ describe("setupSearch", () => { false, ); expect(result.tools?.web?.search?.provider).toBe("perplexity"); - expect(result.tools?.web?.search?.perplexity?.apiKey).toBe("stored-pplx-key"); + expect(pluginWebSearchApiKey(result, "perplexity")).toBe("stored-pplx-key"); expect(result.tools?.web?.search?.enabled).toBe(false); expect(prompter.text).not.toHaveBeenCalled(); }); + it("quickstart skips key prompt when canonical plugin config key exists", async () => { + const cfg: OpenClawConfig = { + tools: { + web: { + search: { + provider: "tavily", + }, + }, + }, + plugins: { + entries: { + tavily: { + enabled: true, + config: { + webSearch: { + apiKey: "tvly-existing-key", + }, + }, + }, + }, + }, + }; + const { prompter } = createPrompter({ selectValue: "tavily" }); + const result = 
await setupSearch(cfg, runtime, prompter, { + quickstartDefaults: true, + }); + expect(result.tools?.web?.search?.provider).toBe("tavily"); + expect(pluginWebSearchApiKey(result, "tavily")).toBe("tvly-existing-key"); + expect(result.tools?.web?.search?.enabled).toBe(true); + expect(prompter.text).not.toHaveBeenCalled(); + }); + it("quickstart falls through to key prompt when no key and no env var", async () => { const original = process.env.XAI_API_KEY; delete process.env.XAI_API_KEY; @@ -268,7 +326,7 @@ describe("setupSearch", () => { secretInputMode: "ref", // pragma: allowlist secret }); expect(result.tools?.web?.search?.provider).toBe("perplexity"); - expect(result.tools?.web?.search?.perplexity?.apiKey).toEqual({ + expect(pluginWebSearchApiKey(result, "perplexity")).toEqual({ source: "env", provider: "default", id: "PERPLEXITY_API_KEY", // pragma: allowlist secret @@ -299,7 +357,7 @@ describe("setupSearch", () => { const result = await setupSearch(cfg, runtime, prompter, { secretInputMode: "ref", // pragma: allowlist secret }); - expect(result.tools?.web?.search?.perplexity?.apiKey).toEqual({ + expect(pluginWebSearchApiKey(result, "perplexity")).toEqual({ source: "env", provider: "default", id: "OPENROUTER_API_KEY", // pragma: allowlist secret @@ -326,14 +384,41 @@ describe("setupSearch", () => { secretInputMode: "ref", // pragma: allowlist secret }); expect(result.tools?.web?.search?.provider).toBe("brave"); - expect(result.tools?.web?.search?.apiKey).toEqual({ + expect(pluginWebSearchApiKey(result, "brave")).toEqual({ source: "env", provider: "default", id: "BRAVE_API_KEY", }); + expect(result.plugins?.entries?.brave?.enabled).toBe(true); expect(prompter.text).not.toHaveBeenCalled(); }); + it("stores env-backed SecretRef when secretInputMode=ref for tavily", async () => { + const original = process.env.TAVILY_API_KEY; + delete process.env.TAVILY_API_KEY; + const cfg: OpenClawConfig = {}; + try { + const { prompter } = createPrompter({ selectValue: "tavily" 
}); + const result = await setupSearch(cfg, runtime, prompter, { + secretInputMode: "ref", // pragma: allowlist secret + }); + expect(result.tools?.web?.search?.provider).toBe("tavily"); + expect(pluginWebSearchApiKey(result, "tavily")).toEqual({ + source: "env", + provider: "default", + id: "TAVILY_API_KEY", + }); + expect(result.plugins?.entries?.tavily?.enabled).toBe(true); + expect(prompter.text).not.toHaveBeenCalled(); + } finally { + if (original === undefined) { + delete process.env.TAVILY_API_KEY; + } else { + process.env.TAVILY_API_KEY = original; + } + } + }); + it("stores plaintext key when secretInputMode is unset", async () => { const cfg: OpenClawConfig = {}; const { prompter } = createPrompter({ @@ -341,12 +426,20 @@ describe("setupSearch", () => { textValue: "BSA-plain", }); const result = await setupSearch(cfg, runtime, prompter); - expect(result.tools?.web?.search?.apiKey).toBe("BSA-plain"); + expect(pluginWebSearchApiKey(result, "brave")).toBe("BSA-plain"); }); - it("exports all 6 providers in SEARCH_PROVIDER_OPTIONS", () => { - expect(SEARCH_PROVIDER_OPTIONS).toHaveLength(6); + it("exports all 7 providers in SEARCH_PROVIDER_OPTIONS", () => { + expect(SEARCH_PROVIDER_OPTIONS).toHaveLength(7); const values = SEARCH_PROVIDER_OPTIONS.map((e) => e.value); - expect(values).toEqual(["brave", "gemini", "grok", "kimi", "perplexity", "firecrawl"]); + expect(values).toEqual([ + "brave", + "gemini", + "grok", + "kimi", + "perplexity", + "firecrawl", + "tavily", + ]); }); }); diff --git a/src/commands/onboard-search.ts b/src/commands/onboard-search.ts index 566362f9f03..0d414017c31 100644 --- a/src/commands/onboard-search.ts +++ b/src/commands/onboard-search.ts @@ -53,7 +53,10 @@ function rawKeyValue(config: OpenClawConfig, provider: SearchProvider): unknown config, bundledAllowlistCompat: true, }).find((candidate) => candidate.id === provider); - return entry?.getCredentialValue(search as Record | undefined); + return ( + 
entry?.getConfiguredCredentialValue?.(config) ?? + entry?.getCredentialValue(search as Record | undefined) + ); } /** Returns the plaintext key string, or undefined for SecretRefs/missing. */ @@ -104,7 +107,7 @@ export function applySearchKey( bundledAllowlistCompat: true, }).find((candidate) => candidate.id === provider); const search: MutableSearchConfig = { ...config.tools?.web?.search, provider, enabled: true }; - if (providerEntry) { + if (providerEntry && !providerEntry.setConfiguredCredentialValue) { providerEntry.setCredentialValue(search, key); } const nextBase: OpenClawConfig = { @@ -114,7 +117,9 @@ export function applySearchKey( web: { ...config.tools?.web, search }, }, }; - return providerEntry?.applySelectionConfig?.(nextBase) ?? nextBase; + const next = providerEntry?.applySelectionConfig?.(nextBase) ?? nextBase; + providerEntry?.setConfiguredCredentialValue?.(next, key); + return next; } function applyProviderOnly(config: OpenClawConfig, provider: SearchProvider): OpenClawConfig { diff --git a/src/config/config.web-search-provider.test.ts b/src/config/config.web-search-provider.test.ts index 85ce1c2700a..d89d913fcba 100644 --- a/src/config/config.web-search-provider.test.ts +++ b/src/config/config.web-search-provider.test.ts @@ -59,6 +59,13 @@ vi.mock("../plugins/web-search-providers.js", () => { getCredentialValue: getScoped("perplexity"), getConfiguredCredentialValue: getConfigured("perplexity"), }, + { + id: "tavily", + envVars: ["TAVILY_API_KEY"], + credentialPath: "plugins.entries.tavily.config.webSearch.apiKey", + getCredentialValue: getScoped("tavily"), + getConfiguredCredentialValue: getConfigured("tavily"), + }, ], }; }); @@ -66,6 +73,17 @@ vi.mock("../plugins/web-search-providers.js", () => { const { __testing } = await import("../agents/tools/web-search.js"); const { resolveSearchProvider } = __testing; +function pluginWebSearchApiKey( + config: Record | undefined, + pluginId: string, +): unknown { + return ( + config?.plugins as + | { 
entries?: Record } + | undefined + )?.entries?.[pluginId]?.config?.webSearch?.apiKey; +} + describe("web search provider config", () => { it("accepts perplexity provider and config", () => { const res = validateConfigObjectWithPlugins( @@ -113,6 +131,50 @@ describe("web search provider config", () => { expect(res.ok).toBe(true); }); + it("accepts tavily provider config on the plugin-owned path", () => { + const res = validateConfigObjectWithPlugins( + buildWebSearchProviderConfig({ + enabled: true, + provider: "tavily", + providerConfig: { + apiKey: { + source: "env", + provider: "default", + id: "TAVILY_API_KEY", + }, + baseUrl: "https://api.tavily.com", + }, + }), + ); + + expect(res.ok).toBe(true); + }); + + it("does not migrate the nonexistent legacy Tavily scoped config", () => { + const res = validateConfigObjectWithPlugins({ + tools: { + web: { + search: { + provider: "tavily", + tavily: { + apiKey: "tvly-test-key", + }, + }, + }, + }, + }); + + expect(res.ok).toBe(true); + if (!res.ok) { + return; + } + expect(res.config.tools?.web?.search?.provider).toBe("tavily"); + expect((res.config.tools?.web?.search as Record | undefined)?.tavily).toBe( + undefined, + ); + expect(pluginWebSearchApiKey(res.config as Record, "tavily")).toBe(undefined); + }); + it("accepts gemini provider with no extra config", () => { const res = validateConfigObjectWithPlugins( buildWebSearchProviderConfig({ @@ -161,6 +223,7 @@ describe("web search provider auto-detection", () => { delete process.env.MOONSHOT_API_KEY; delete process.env.PERPLEXITY_API_KEY; delete process.env.OPENROUTER_API_KEY; + delete process.env.TAVILY_API_KEY; delete process.env.XAI_API_KEY; delete process.env.KIMI_API_KEY; delete process.env.MOONSHOT_API_KEY; @@ -185,6 +248,11 @@ describe("web search provider auto-detection", () => { expect(resolveSearchProvider({})).toBe("gemini"); }); + it("auto-detects tavily when only TAVILY_API_KEY is set", () => { + process.env.TAVILY_API_KEY = "tvly-test-key"; // pragma: 
allowlist secret + expect(resolveSearchProvider({})).toBe("tavily"); + }); + it("auto-detects firecrawl when only FIRECRAWL_API_KEY is set", () => { process.env.FIRECRAWL_API_KEY = "fc-test-key"; // pragma: allowlist secret expect(resolveSearchProvider({})).toBe("firecrawl"); diff --git a/src/plugins/bundled-provider-auth-env-vars.generated.ts b/src/plugins/bundled-provider-auth-env-vars.generated.ts index 416036b28ea..80ebcedc2b9 100644 --- a/src/plugins/bundled-provider-auth-env-vars.generated.ts +++ b/src/plugins/bundled-provider-auth-env-vars.generated.ts @@ -2,10 +2,12 @@ export const BUNDLED_PROVIDER_AUTH_ENV_VAR_CANDIDATES = { anthropic: ["ANTHROPIC_OAUTH_TOKEN", "ANTHROPIC_API_KEY"], + brave: ["BRAVE_API_KEY"], byteplus: ["BYTEPLUS_API_KEY"], chutes: ["CHUTES_API_KEY", "CHUTES_OAUTH_TOKEN"], "cloudflare-ai-gateway": ["CLOUDFLARE_AI_GATEWAY_API_KEY"], fal: ["FAL_KEY"], + firecrawl: ["FIRECRAWL_API_KEY"], "github-copilot": ["COPILOT_GITHUB_TOKEN", "GH_TOKEN", "GITHUB_TOKEN"], google: ["GEMINI_API_KEY", "GOOGLE_API_KEY"], huggingface: ["HUGGINGFACE_HUB_TOKEN", "HF_TOKEN"], @@ -23,10 +25,12 @@ export const BUNDLED_PROVIDER_AUTH_ENV_VAR_CANDIDATES = { opencode: ["OPENCODE_API_KEY", "OPENCODE_ZEN_API_KEY"], "opencode-go": ["OPENCODE_API_KEY", "OPENCODE_ZEN_API_KEY"], openrouter: ["OPENROUTER_API_KEY"], + perplexity: ["PERPLEXITY_API_KEY", "OPENROUTER_API_KEY"], qianfan: ["QIANFAN_API_KEY"], "qwen-portal": ["QWEN_OAUTH_TOKEN", "QWEN_PORTAL_API_KEY"], sglang: ["SGLANG_API_KEY"], synthetic: ["SYNTHETIC_API_KEY"], + tavily: ["TAVILY_API_KEY"], together: ["TOGETHER_API_KEY"], venice: ["VENICE_API_KEY"], "vercel-ai-gateway": ["AI_GATEWAY_API_KEY"], diff --git a/src/plugins/bundled-provider-auth-env-vars.test.ts b/src/plugins/bundled-provider-auth-env-vars.test.ts index a41b60d7b6d..bf0d481834b 100644 --- a/src/plugins/bundled-provider-auth-env-vars.test.ts +++ b/src/plugins/bundled-provider-auth-env-vars.test.ts @@ -31,15 +31,22 @@ describe("bundled provider auth env 
vars", () => { }); it("reads bundled provider auth env vars from plugin manifests", () => { + expect(BUNDLED_PROVIDER_AUTH_ENV_VAR_CANDIDATES.brave).toEqual(["BRAVE_API_KEY"]); + expect(BUNDLED_PROVIDER_AUTH_ENV_VAR_CANDIDATES.firecrawl).toEqual(["FIRECRAWL_API_KEY"]); expect(BUNDLED_PROVIDER_AUTH_ENV_VAR_CANDIDATES["github-copilot"]).toEqual([ "COPILOT_GITHUB_TOKEN", "GH_TOKEN", "GITHUB_TOKEN", ]); + expect(BUNDLED_PROVIDER_AUTH_ENV_VAR_CANDIDATES.perplexity).toEqual([ + "PERPLEXITY_API_KEY", + "OPENROUTER_API_KEY", + ]); expect(BUNDLED_PROVIDER_AUTH_ENV_VAR_CANDIDATES["qwen-portal"]).toEqual([ "QWEN_OAUTH_TOKEN", "QWEN_PORTAL_API_KEY", ]); + expect(BUNDLED_PROVIDER_AUTH_ENV_VAR_CANDIDATES.tavily).toEqual(["TAVILY_API_KEY"]); expect(BUNDLED_PROVIDER_AUTH_ENV_VAR_CANDIDATES["minimax-portal"]).toEqual([ "MINIMAX_OAUTH_TOKEN", "MINIMAX_API_KEY", diff --git a/src/plugins/bundled-web-search.test.ts b/src/plugins/bundled-web-search.test.ts index 921bd66868e..b8d5c6142ad 100644 --- a/src/plugins/bundled-web-search.test.ts +++ b/src/plugins/bundled-web-search.test.ts @@ -71,6 +71,7 @@ describe("bundled web search metadata", () => { "google", "moonshot", "perplexity", + "tavily", "xai", ]); }); diff --git a/src/plugins/bundled-web-search.ts b/src/plugins/bundled-web-search.ts index d1f2ce342f8..4b9594caaf8 100644 --- a/src/plugins/bundled-web-search.ts +++ b/src/plugins/bundled-web-search.ts @@ -191,6 +191,21 @@ const BUNDLED_WEB_SEARCH_PROVIDER_DESCRIPTORS = [ credentialScope: { kind: "scoped", key: "firecrawl" }, applySelectionConfig: (config) => enablePluginInConfig(config, "firecrawl").config, }, + { + pluginId: "tavily", + id: "tavily", + label: "Tavily Search", + hint: "Structured results with domain filters and AI answer summaries", + envVars: ["TAVILY_API_KEY"], + placeholder: "tvly-...", + signupUrl: "https://tavily.com/", + docsUrl: "https://docs.openclaw.ai/tools/tavily", + autoDetectOrder: 70, + credentialPath: "plugins.entries.tavily.config.webSearch.apiKey", 
+ inactiveSecretPaths: ["plugins.entries.tavily.config.webSearch.apiKey"], + credentialScope: { kind: "scoped", key: "tavily" }, + applySelectionConfig: (config) => enablePluginInConfig(config, "tavily").config, + }, ] as const satisfies ReadonlyArray; export const BUNDLED_WEB_SEARCH_PLUGIN_IDS = [ diff --git a/src/plugins/contracts/registry.contract.test.ts b/src/plugins/contracts/registry.contract.test.ts index a5214106d52..f2cfd9e1392 100644 --- a/src/plugins/contracts/registry.contract.test.ts +++ b/src/plugins/contracts/registry.contract.test.ts @@ -146,6 +146,7 @@ describe("plugin contract registry", () => { expect(findWebSearchIdsForPlugin("google")).toEqual(["gemini"]); expect(findWebSearchIdsForPlugin("moonshot")).toEqual(["kimi"]); expect(findWebSearchIdsForPlugin("perplexity")).toEqual(["perplexity"]); + expect(findWebSearchIdsForPlugin("tavily")).toEqual(["tavily"]); expect(findWebSearchIdsForPlugin("xai")).toEqual(["grok"]); }); @@ -183,6 +184,14 @@ describe("plugin contract registry", () => { webSearchProviderIds: ["firecrawl"], toolNames: ["firecrawl_search", "firecrawl_scrape"], }); + expect(findRegistrationForPlugin("tavily")).toMatchObject({ + providerIds: [], + speechProviderIds: [], + mediaUnderstandingProviderIds: [], + imageGenerationProviderIds: [], + webSearchProviderIds: ["tavily"], + toolNames: ["tavily_search", "tavily_extract"], + }); }); it("tracks speech registrations on bundled provider plugins", () => { diff --git a/src/plugins/contracts/registry.ts b/src/plugins/contracts/registry.ts index 60d6f96dc3d..cde5b8e8e2d 100644 --- a/src/plugins/contracts/registry.ts +++ b/src/plugins/contracts/registry.ts @@ -29,6 +29,7 @@ import qianfanPlugin from "../../../extensions/qianfan/index.js"; import qwenPortalAuthPlugin from "../../../extensions/qwen-portal-auth/index.js"; import sglangPlugin from "../../../extensions/sglang/index.js"; import syntheticPlugin from "../../../extensions/synthetic/index.js"; +import tavilyPlugin from 
"../../../extensions/tavily/index.js"; import togetherPlugin from "../../../extensions/together/index.js"; import venicePlugin from "../../../extensions/venice/index.js"; import vercelAiGatewayPlugin from "../../../extensions/vercel-ai-gateway/index.js"; @@ -84,9 +85,9 @@ const bundledWebSearchPlugins: Array ({ @@ -96,6 +97,7 @@ describe("resolvePluginWebSearchProviders", () => { "moonshot:kimi", "perplexity:perplexity", "firecrawl:firecrawl", + "tavily:tavily", ]); expect(providers.map((provider) => provider.credentialPath)).toEqual([ "plugins.entries.brave.config.webSearch.apiKey", @@ -104,6 +106,7 @@ describe("resolvePluginWebSearchProviders", () => { "plugins.entries.moonshot.config.webSearch.apiKey", "plugins.entries.perplexity.config.webSearch.apiKey", "plugins.entries.firecrawl.config.webSearch.apiKey", + "plugins.entries.tavily.config.webSearch.apiKey", ]); expect(providers.find((provider) => provider.id === "firecrawl")?.applySelectionConfig).toEqual( expect.any(Function), @@ -130,6 +133,7 @@ describe("resolvePluginWebSearchProviders", () => { "moonshot", "perplexity", "firecrawl", + "tavily", ]); }); @@ -183,6 +187,7 @@ describe("resolvePluginWebSearchProviders", () => { "moonshot:kimi", "perplexity:perplexity", "firecrawl:firecrawl", + "tavily:tavily", ]); expect(loadOpenClawPluginsMock).not.toHaveBeenCalled(); }); diff --git a/src/secrets/provider-env-vars.test.ts b/src/secrets/provider-env-vars.test.ts index 6405d322e2f..63d12fd6c0e 100644 --- a/src/secrets/provider-env-vars.test.ts +++ b/src/secrets/provider-env-vars.test.ts @@ -8,10 +8,28 @@ import { describe("provider env vars", () => { it("keeps the auth scrub list broader than the global secret env list", () => { expect(listKnownProviderAuthEnvVarNames()).toEqual( - expect.arrayContaining(["GITHUB_TOKEN", "GH_TOKEN", "ANTHROPIC_OAUTH_TOKEN"]), + expect.arrayContaining([ + "GITHUB_TOKEN", + "GH_TOKEN", + "ANTHROPIC_OAUTH_TOKEN", + "BRAVE_API_KEY", + "FIRECRAWL_API_KEY", + "PERPLEXITY_API_KEY", + 
"OPENROUTER_API_KEY", + "TAVILY_API_KEY", + ]), ); expect(listKnownSecretEnvVarNames()).toEqual( - expect.arrayContaining(["GITHUB_TOKEN", "GH_TOKEN", "ANTHROPIC_OAUTH_TOKEN"]), + expect.arrayContaining([ + "GITHUB_TOKEN", + "GH_TOKEN", + "ANTHROPIC_OAUTH_TOKEN", + "BRAVE_API_KEY", + "FIRECRAWL_API_KEY", + "PERPLEXITY_API_KEY", + "OPENROUTER_API_KEY", + "TAVILY_API_KEY", + ]), ); expect(listKnownProviderAuthEnvVarNames()).toEqual( expect.arrayContaining(["MINIMAX_CODE_PLAN_KEY"]), diff --git a/src/secrets/target-registry-data.ts b/src/secrets/target-registry-data.ts index 30aa096004b..7d1a7854867 100644 --- a/src/secrets/target-registry-data.ts +++ b/src/secrets/target-registry-data.ts @@ -843,6 +843,17 @@ const SECRET_TARGET_REGISTRY: SecretTargetRegistryEntry[] = [ includeInConfigure: true, includeInAudit: true, }, + { + id: "plugins.entries.tavily.config.webSearch.apiKey", + targetType: "plugins.entries.tavily.config.webSearch.apiKey", + configFile: "openclaw.json", + pathPattern: "plugins.entries.tavily.config.webSearch.apiKey", + secretShape: SECRET_INPUT_SHAPE, + expectedResolvedValue: "string", + includeInPlan: true, + includeInConfigure: true, + includeInAudit: true, + }, ]; export { SECRET_TARGET_REGISTRY }; diff --git a/src/web-search/runtime.test.ts b/src/web-search/runtime.test.ts index 72d1e4ad3f3..ab5a59ca993 100644 --- a/src/web-search/runtime.test.ts +++ b/src/web-search/runtime.test.ts @@ -1,8 +1,15 @@ import { afterEach, describe, expect, it } from "vitest"; +import type { OpenClawConfig } from "../config/config.js"; import { createEmptyPluginRegistry } from "../plugins/registry.js"; import { setActivePluginRegistry } from "../plugins/runtime.js"; import { runWebSearch } from "./runtime.js"; +type TestPluginWebSearchConfig = { + webSearch?: { + apiKey?: unknown; + }; +}; + describe("web search runtime", () => { afterEach(() => { setActivePluginRegistry(createEmptyPluginRegistry()); @@ -44,4 +51,74 @@ describe("web search runtime", () => { result: 
{ query: "hello", ok: true }, }); }); + + it("auto-detects a provider from canonical plugin-owned credentials", async () => { + const registry = createEmptyPluginRegistry(); + registry.webSearchProviders.push({ + pluginId: "custom-search", + pluginName: "Custom Search", + provider: { + id: "custom", + label: "Custom Search", + hint: "Custom runtime provider", + envVars: ["CUSTOM_SEARCH_API_KEY"], + placeholder: "custom-...", + signupUrl: "https://example.com/signup", + credentialPath: "plugins.entries.custom-search.config.webSearch.apiKey", + autoDetectOrder: 1, + getCredentialValue: () => undefined, + setCredentialValue: () => {}, + getConfiguredCredentialValue: (config) => { + const pluginConfig = config?.plugins?.entries?.["custom-search"]?.config as + | TestPluginWebSearchConfig + | undefined; + return pluginConfig?.webSearch?.apiKey; + }, + setConfiguredCredentialValue: (configTarget, value) => { + configTarget.plugins = { + ...configTarget.plugins, + entries: { + ...configTarget.plugins?.entries, + "custom-search": { + enabled: true, + config: { webSearch: { apiKey: value } }, + }, + }, + }; + }, + createTool: () => ({ + description: "custom", + parameters: {}, + execute: async (args) => ({ ...args, ok: true }), + }), + }, + source: "test", + }); + setActivePluginRegistry(registry); + + const config: OpenClawConfig = { + plugins: { + entries: { + "custom-search": { + enabled: true, + config: { + webSearch: { + apiKey: "custom-config-key", + }, + }, + }, + }, + }, + }; + + await expect( + runWebSearch({ + config, + args: { query: "hello" }, + }), + ).resolves.toEqual({ + provider: "custom", + result: { query: "hello", ok: true }, + }); + }); }); From 84ee6fbb76b5b255c6e84ea834d4b2a9562b33d6 Mon Sep 17 00:00:00 2001 From: Ayaan Zaidi Date: Fri, 20 Mar 2026 10:26:24 +0530 Subject: [PATCH 02/11] feat(tts): add in-memory speech synthesis --- src/tts/providers/microsoft.ts | 1 + src/tts/providers/openai.ts | 2 +- src/tts/tts.ts | 64 
+++++++++++++++++++++++++++------- 3 files changed, 54 insertions(+), 13 deletions(-) diff --git a/src/tts/providers/microsoft.ts b/src/tts/providers/microsoft.ts index fef369740cb..ba2511e4de6 100644 --- a/src/tts/providers/microsoft.ts +++ b/src/tts/providers/microsoft.ts @@ -96,6 +96,7 @@ export function buildMicrosoftSpeechProvider(): SpeechProviderPlugin { outputPath, config: { ...req.config.edge, + voice: req.overrides?.microsoft?.voice ?? req.config.edge.voice, outputFormat: format, }, timeoutMs: req.config.timeoutMs, diff --git a/src/tts/providers/openai.ts b/src/tts/providers/openai.ts index 9f96e9ea6e9..01e5997e85c 100644 --- a/src/tts/providers/openai.ts +++ b/src/tts/providers/openai.ts @@ -21,7 +21,7 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin { baseUrl: req.config.openai.baseUrl, model: req.overrides?.openai?.model ?? req.config.openai.model, voice: req.overrides?.openai?.voice ?? req.config.openai.voice, - speed: req.config.openai.speed, + speed: req.overrides?.openai?.speed ?? 
req.config.openai.speed, instructions: req.config.openai.instructions, responseFormat, timeoutMs: req.config.timeoutMs, diff --git a/src/tts/tts.ts b/src/tts/tts.ts index 0a5aa81126e..c64dda83909 100644 --- a/src/tts/tts.ts +++ b/src/tts/tts.ts @@ -162,6 +162,7 @@ export type TtsDirectiveOverrides = { openai?: { voice?: string; model?: string; + speed?: number; }; elevenlabs?: { voiceId?: string; @@ -171,6 +172,9 @@ export type TtsDirectiveOverrides = { languageCode?: string; voiceSettings?: Partial; }; + microsoft?: { + voice?: string; + }; }; export type TtsDirectiveParseResult = { @@ -191,6 +195,17 @@ export type TtsResult = { voiceCompatible?: boolean; }; +export type TtsSynthesisResult = { + success: boolean; + audioBuffer?: Buffer; + error?: string; + latencyMs?: number; + provider?: string; + outputFormat?: string; + voiceCompatible?: boolean; + fileExtension?: string; +}; + export type TtsTelephonyResult = { success: boolean; audioBuffer?: Buffer; @@ -601,6 +616,7 @@ function resolveTtsRequestSetup(params: { cfg: OpenClawConfig; prefsPath?: string; providerOverride?: TtsProvider; + disableFallback?: boolean; }): | { config: ResolvedTtsConfig; @@ -621,7 +637,7 @@ function resolveTtsRequestSetup(params: { const provider = normalizeSpeechProviderId(params.providerOverride) ?? userProvider; return { config, - providers: resolveTtsProviderOrder(provider, params.cfg), + providers: params.disableFallback ? [provider] : resolveTtsProviderOrder(provider, params.cfg), }; } @@ -631,12 +647,44 @@ export async function textToSpeech(params: { prefsPath?: string; channel?: string; overrides?: TtsDirectiveOverrides; + disableFallback?: boolean; }): Promise { + const synthesis = await synthesizeSpeech(params); + if (!synthesis.success || !synthesis.audioBuffer || !synthesis.fileExtension) { + return buildTtsFailureResult([synthesis.error ?? 
"TTS conversion failed"]); + } + + const tempRoot = resolvePreferredOpenClawTmpDir(); + mkdirSync(tempRoot, { recursive: true, mode: 0o700 }); + const tempDir = mkdtempSync(path.join(tempRoot, "tts-")); + const audioPath = path.join(tempDir, `voice-${Date.now()}${synthesis.fileExtension}`); + writeFileSync(audioPath, synthesis.audioBuffer); + scheduleCleanup(tempDir); + + return { + success: true, + audioPath, + latencyMs: synthesis.latencyMs, + provider: synthesis.provider, + outputFormat: synthesis.outputFormat, + voiceCompatible: synthesis.voiceCompatible, + }; +} + +export async function synthesizeSpeech(params: { + text: string; + cfg: OpenClawConfig; + prefsPath?: string; + channel?: string; + overrides?: TtsDirectiveOverrides; + disableFallback?: boolean; +}): Promise { const setup = resolveTtsRequestSetup({ text: params.text, cfg: params.cfg, prefsPath: params.prefsPath, providerOverride: params.overrides?.provider, + disableFallback: params.disableFallback, }); if ("error" in setup) { return { success: false, error: setup.error }; @@ -667,22 +715,14 @@ export async function textToSpeech(params: { target, overrides: params.overrides, }); - const latencyMs = Date.now() - providerStart; - - const tempRoot = resolvePreferredOpenClawTmpDir(); - mkdirSync(tempRoot, { recursive: true, mode: 0o700 }); - const tempDir = mkdtempSync(path.join(tempRoot, "tts-")); - const audioPath = path.join(tempDir, `voice-${Date.now()}${synthesis.fileExtension}`); - writeFileSync(audioPath, synthesis.audioBuffer); - scheduleCleanup(tempDir); - return { success: true, - audioPath, - latencyMs, + audioBuffer: synthesis.audioBuffer, + latencyMs: Date.now() - providerStart, provider, outputFormat: synthesis.outputFormat, voiceCompatible: synthesis.voiceCompatible, + fileExtension: synthesis.fileExtension, }; } catch (err) { errors.push(formatTtsProviderError(provider, err)); From 4ac355babbeffdf133c46f77352829ad23e38eda Mon Sep 17 00:00:00 2001 From: Ayaan Zaidi Date: Fri, 20 Mar 2026 
10:27:05 +0530 Subject: [PATCH 03/11] feat(gateway): add talk speak rpc --- src/gateway/method-scopes.ts | 1 + src/gateway/protocol/index.ts | 10 + src/gateway/protocol/schema/channels.ts | 29 ++ .../protocol/schema/protocol-schemas.ts | 4 + src/gateway/protocol/schema/types.ts | 2 + src/gateway/server-methods-list.ts | 1 + src/gateway/server-methods/talk.ts | 335 +++++++++++++++++- src/gateway/server.talk-config.test.ts | 67 +++- 8 files changed, 447 insertions(+), 2 deletions(-) diff --git a/src/gateway/method-scopes.ts b/src/gateway/method-scopes.ts index c31ff30db7b..f3a969301bf 100644 --- a/src/gateway/method-scopes.ts +++ b/src/gateway/method-scopes.ts @@ -98,6 +98,7 @@ const METHOD_SCOPE_GROUPS: Record = { "agent.wait", "wake", "talk.mode", + "talk.speak", "tts.enable", "tts.disable", "tts.convert", diff --git a/src/gateway/protocol/index.ts b/src/gateway/protocol/index.ts index 408e3239cc1..408074d44e4 100644 --- a/src/gateway/protocol/index.ts +++ b/src/gateway/protocol/index.ts @@ -48,6 +48,10 @@ import { TalkConfigParamsSchema, type TalkConfigResult, TalkConfigResultSchema, + type TalkSpeakParams, + TalkSpeakParamsSchema, + type TalkSpeakResult, + TalkSpeakResultSchema, type ChannelsStatusParams, ChannelsStatusParamsSchema, type ChannelsStatusResult, @@ -375,6 +379,8 @@ export const validateWizardStatusParams = ajv.compile(Wizard export const validateTalkModeParams = ajv.compile(TalkModeParamsSchema); export const validateTalkConfigParams = ajv.compile(TalkConfigParamsSchema); export const validateTalkConfigResult = ajv.compile(TalkConfigResultSchema); +export const validateTalkSpeakParams = ajv.compile(TalkSpeakParamsSchema); +export const validateTalkSpeakResult = ajv.compile(TalkSpeakResultSchema); export const validateChannelsStatusParams = ajv.compile( ChannelsStatusParamsSchema, ); @@ -540,6 +546,8 @@ export { WizardStatusResultSchema, TalkConfigParamsSchema, TalkConfigResultSchema, + TalkSpeakParamsSchema, + TalkSpeakResultSchema, 
ChannelsStatusParamsSchema, ChannelsStatusResultSchema, ChannelsLogoutParamsSchema, @@ -629,6 +637,8 @@ export type { WizardStatusResult, TalkConfigParams, TalkConfigResult, + TalkSpeakParams, + TalkSpeakResult, TalkModeParams, ChannelsStatusParams, ChannelsStatusResult, diff --git a/src/gateway/protocol/schema/channels.ts b/src/gateway/protocol/schema/channels.ts index 041318897ac..923432c7ac8 100644 --- a/src/gateway/protocol/schema/channels.ts +++ b/src/gateway/protocol/schema/channels.ts @@ -16,6 +16,23 @@ export const TalkConfigParamsSchema = Type.Object( { additionalProperties: false }, ); +export const TalkSpeakParamsSchema = Type.Object( + { + text: NonEmptyString, + voiceId: Type.Optional(Type.String()), + modelId: Type.Optional(Type.String()), + speed: Type.Optional(Type.Number()), + stability: Type.Optional(Type.Number()), + similarity: Type.Optional(Type.Number()), + style: Type.Optional(Type.Number()), + speakerBoost: Type.Optional(Type.Boolean()), + seed: Type.Optional(Type.Integer({ minimum: 0 })), + normalize: Type.Optional(Type.String()), + language: Type.Optional(Type.String()), + }, + { additionalProperties: false }, +); + const talkProviderFieldSchemas = { voiceId: Type.Optional(Type.String()), voiceAliases: Type.Optional(Type.Record(Type.String(), Type.String())), @@ -85,6 +102,18 @@ export const TalkConfigResultSchema = Type.Object( { additionalProperties: false }, ); +export const TalkSpeakResultSchema = Type.Object( + { + audioBase64: NonEmptyString, + provider: NonEmptyString, + outputFormat: Type.Optional(Type.String()), + voiceCompatible: Type.Optional(Type.Boolean()), + mimeType: Type.Optional(Type.String()), + fileExtension: Type.Optional(Type.String()), + }, + { additionalProperties: false }, +); + export const ChannelsStatusParamsSchema = Type.Object( { probe: Type.Optional(Type.Boolean()), diff --git a/src/gateway/protocol/schema/protocol-schemas.ts b/src/gateway/protocol/schema/protocol-schemas.ts index 60636e3eb5f..cf14fc44610 
100644 --- a/src/gateway/protocol/schema/protocol-schemas.ts +++ b/src/gateway/protocol/schema/protocol-schemas.ts @@ -44,6 +44,8 @@ import { ChannelsLogoutParamsSchema, TalkConfigParamsSchema, TalkConfigResultSchema, + TalkSpeakParamsSchema, + TalkSpeakResultSchema, ChannelsStatusParamsSchema, ChannelsStatusResultSchema, TalkModeParamsSchema, @@ -238,6 +240,8 @@ export const ProtocolSchemas = { TalkModeParams: TalkModeParamsSchema, TalkConfigParams: TalkConfigParamsSchema, TalkConfigResult: TalkConfigResultSchema, + TalkSpeakParams: TalkSpeakParamsSchema, + TalkSpeakResult: TalkSpeakResultSchema, ChannelsStatusParams: ChannelsStatusParamsSchema, ChannelsStatusResult: ChannelsStatusResultSchema, ChannelsLogoutParams: ChannelsLogoutParamsSchema, diff --git a/src/gateway/protocol/schema/types.ts b/src/gateway/protocol/schema/types.ts index 58ddb142cd5..d74c08ad10b 100644 --- a/src/gateway/protocol/schema/types.ts +++ b/src/gateway/protocol/schema/types.ts @@ -70,6 +70,8 @@ export type WizardStatusResult = SchemaType<"WizardStatusResult">; export type TalkModeParams = SchemaType<"TalkModeParams">; export type TalkConfigParams = SchemaType<"TalkConfigParams">; export type TalkConfigResult = SchemaType<"TalkConfigResult">; +export type TalkSpeakParams = SchemaType<"TalkSpeakParams">; +export type TalkSpeakResult = SchemaType<"TalkSpeakResult">; export type ChannelsStatusParams = SchemaType<"ChannelsStatusParams">; export type ChannelsStatusResult = SchemaType<"ChannelsStatusResult">; export type ChannelsLogoutParams = SchemaType<"ChannelsLogoutParams">; diff --git a/src/gateway/server-methods-list.ts b/src/gateway/server-methods-list.ts index b4de49f1198..e930f8b0517 100644 --- a/src/gateway/server-methods-list.ts +++ b/src/gateway/server-methods-list.ts @@ -34,6 +34,7 @@ const BASE_METHODS = [ "wizard.cancel", "wizard.status", "talk.config", + "talk.speak", "talk.mode", "models.list", "tools.catalog", diff --git a/src/gateway/server-methods/talk.ts 
b/src/gateway/server-methods/talk.ts index 693f3447537..33cb6d7f116 100644 --- a/src/gateway/server-methods/talk.ts +++ b/src/gateway/server-methods/talk.ts @@ -1,23 +1,297 @@ import { readConfigFileSnapshot } from "../../config/config.js"; import { redactConfigObject } from "../../config/redact-snapshot.js"; -import { buildTalkConfigResponse } from "../../config/talk.js"; +import { buildTalkConfigResponse, resolveActiveTalkProviderConfig } from "../../config/talk.js"; +import type { TalkProviderConfig } from "../../config/types.gateway.js"; +import type { OpenClawConfig, TtsConfig } from "../../config/types.js"; +import { normalizeSpeechProviderId } from "../../tts/provider-registry.js"; +import { synthesizeSpeech, type TtsDirectiveOverrides } from "../../tts/tts.js"; import { ErrorCodes, errorShape, formatValidationErrors, validateTalkConfigParams, validateTalkModeParams, + validateTalkSpeakParams, } from "../protocol/index.js"; +import { formatForLog } from "../ws-log.js"; import type { GatewayRequestHandlers } from "./types.js"; const ADMIN_SCOPE = "operator.admin"; const TALK_SECRETS_SCOPE = "operator.talk.secrets"; +type ElevenLabsVoiceSettings = NonNullable["voiceSettings"]>; function canReadTalkSecrets(client: { connect?: { scopes?: string[] } } | null): boolean { const scopes = Array.isArray(client?.connect?.scopes) ? client.connect.scopes : []; return scopes.includes(ADMIN_SCOPE) || scopes.includes(TALK_SECRETS_SCOPE); } +function trimString(value: unknown): string | undefined { + if (typeof value !== "string") { + return undefined; + } + const trimmed = value.trim(); + return trimmed.length > 0 ? trimmed : undefined; +} + +function finiteNumber(value: unknown): number | undefined { + return typeof value === "number" && Number.isFinite(value) ? value : undefined; +} + +function optionalBoolean(value: unknown): boolean | undefined { + return typeof value === "boolean" ? 
value : undefined; +} + +function plainObject(value: unknown): Record | undefined { + return typeof value === "object" && value !== null && !Array.isArray(value) + ? (value as Record) + : undefined; +} + +function normalizeTextNormalization(value: unknown): "auto" | "on" | "off" | undefined { + const normalized = trimString(value)?.toLowerCase(); + return normalized === "auto" || normalized === "on" || normalized === "off" + ? normalized + : undefined; +} + +function normalizeAliasKey(value: string): string { + return value.trim().toLowerCase(); +} + +function resolveTalkVoiceId( + providerConfig: TalkProviderConfig, + requested: string | undefined, +): string | undefined { + if (!requested) { + return undefined; + } + const aliases = providerConfig.voiceAliases; + if (!aliases) { + return requested; + } + return aliases[normalizeAliasKey(requested)] ?? requested; +} + +function readTalkVoiceSettings( + providerConfig: TalkProviderConfig, +): ElevenLabsVoiceSettings | undefined { + const source = plainObject(providerConfig.voiceSettings); + if (!source) { + return undefined; + } + const stability = finiteNumber(source.stability); + const similarityBoost = finiteNumber(source.similarityBoost); + const style = finiteNumber(source.style); + const useSpeakerBoost = optionalBoolean(source.useSpeakerBoost); + const speed = finiteNumber(source.speed); + const voiceSettings = { + ...(stability == null ? {} : { stability }), + ...(similarityBoost == null ? {} : { similarityBoost }), + ...(style == null ? {} : { style }), + ...(useSpeakerBoost == null ? {} : { useSpeakerBoost }), + ...(speed == null ? {} : { speed }), + }; + return Object.keys(voiceSettings).length > 0 ? 
voiceSettings : undefined; +} + +function buildTalkTtsConfig( + config: OpenClawConfig, +): + | { cfg: OpenClawConfig; provider: string; providerConfig: TalkProviderConfig } + | { error: string } { + const resolved = resolveActiveTalkProviderConfig(config.talk); + const provider = normalizeSpeechProviderId(resolved?.provider); + if (!resolved || !provider) { + return { error: "talk.speak unavailable: talk provider not configured" }; + } + + const baseTts = config.messages?.tts ?? {}; + const providerConfig = resolved.config; + const talkTts: TtsConfig = { + ...baseTts, + auto: "always", + provider, + }; + + if (provider === "elevenlabs") { + talkTts.elevenlabs = { + ...baseTts.elevenlabs, + ...(providerConfig.apiKey === undefined ? {} : { apiKey: providerConfig.apiKey }), + ...(trimString(providerConfig.baseUrl) == null + ? {} + : { baseUrl: trimString(providerConfig.baseUrl) }), + ...(trimString(providerConfig.voiceId) == null + ? {} + : { voiceId: trimString(providerConfig.voiceId) }), + ...(trimString(providerConfig.modelId) == null + ? {} + : { modelId: trimString(providerConfig.modelId) }), + ...(finiteNumber(providerConfig.seed) == null + ? {} + : { seed: finiteNumber(providerConfig.seed) }), + ...(normalizeTextNormalization(providerConfig.applyTextNormalization) == null + ? {} + : { + applyTextNormalization: normalizeTextNormalization( + providerConfig.applyTextNormalization, + ), + }), + ...(trimString(providerConfig.languageCode) == null + ? {} + : { languageCode: trimString(providerConfig.languageCode) }), + ...(readTalkVoiceSettings(providerConfig) == null + ? {} + : { voiceSettings: readTalkVoiceSettings(providerConfig) }), + }; + } else if (provider === "openai") { + talkTts.openai = { + ...baseTts.openai, + ...(providerConfig.apiKey === undefined ? {} : { apiKey: providerConfig.apiKey }), + ...(trimString(providerConfig.baseUrl) == null + ? {} + : { baseUrl: trimString(providerConfig.baseUrl) }), + ...(trimString(providerConfig.modelId) == null + ? 
{} + : { model: trimString(providerConfig.modelId) }), + ...(trimString(providerConfig.voiceId) == null + ? {} + : { voice: trimString(providerConfig.voiceId) }), + ...(finiteNumber(providerConfig.speed) == null + ? {} + : { speed: finiteNumber(providerConfig.speed) }), + ...(trimString(providerConfig.instructions) == null + ? {} + : { instructions: trimString(providerConfig.instructions) }), + }; + } else if (provider === "microsoft") { + talkTts.microsoft = { + ...baseTts.microsoft, + enabled: true, + ...(trimString(providerConfig.voiceId) == null + ? {} + : { voice: trimString(providerConfig.voiceId) }), + ...(trimString(providerConfig.languageCode) == null + ? {} + : { lang: trimString(providerConfig.languageCode) }), + ...(trimString(providerConfig.outputFormat) == null + ? {} + : { outputFormat: trimString(providerConfig.outputFormat) }), + ...(trimString(providerConfig.pitch) == null + ? {} + : { pitch: trimString(providerConfig.pitch) }), + ...(trimString(providerConfig.rate) == null ? {} : { rate: trimString(providerConfig.rate) }), + ...(trimString(providerConfig.volume) == null + ? {} + : { volume: trimString(providerConfig.volume) }), + ...(trimString(providerConfig.proxy) == null + ? {} + : { proxy: trimString(providerConfig.proxy) }), + ...(finiteNumber(providerConfig.timeoutMs) == null + ? 
{} + : { timeoutMs: finiteNumber(providerConfig.timeoutMs) }), + }; + } else { + return { error: `talk.speak unavailable: unsupported talk provider '${resolved.provider}'` }; + } + + return { + provider, + providerConfig, + cfg: { + ...config, + messages: { + ...config.messages, + tts: talkTts, + }, + }, + }; +} + +function buildTalkSpeakOverrides( + provider: string, + providerConfig: TalkProviderConfig, + params: Record, +): TtsDirectiveOverrides { + const voiceId = resolveTalkVoiceId(providerConfig, trimString(params.voiceId)); + const modelId = trimString(params.modelId); + const speed = finiteNumber(params.speed); + const seed = finiteNumber(params.seed); + const normalize = normalizeTextNormalization(params.normalize); + const language = trimString(params.language)?.toLowerCase(); + const overrides: TtsDirectiveOverrides = { provider }; + + if (provider === "elevenlabs") { + const voiceSettings = { + ...(speed == null ? {} : { speed }), + ...(finiteNumber(params.stability) == null + ? {} + : { stability: finiteNumber(params.stability) }), + ...(finiteNumber(params.similarity) == null + ? {} + : { similarityBoost: finiteNumber(params.similarity) }), + ...(finiteNumber(params.style) == null ? {} : { style: finiteNumber(params.style) }), + ...(optionalBoolean(params.speakerBoost) == null + ? {} + : { useSpeakerBoost: optionalBoolean(params.speakerBoost) }), + }; + overrides.elevenlabs = { + ...(voiceId == null ? {} : { voiceId }), + ...(modelId == null ? {} : { modelId }), + ...(seed == null ? {} : { seed }), + ...(normalize == null ? {} : { applyTextNormalization: normalize }), + ...(language == null ? {} : { languageCode: language }), + ...(Object.keys(voiceSettings).length === 0 ? {} : { voiceSettings }), + }; + return overrides; + } + + if (provider === "openai") { + overrides.openai = { + ...(voiceId == null ? {} : { voice: voiceId }), + ...(modelId == null ? {} : { model: modelId }), + ...(speed == null ? 
{} : { speed }), + }; + return overrides; + } + + if (provider === "microsoft") { + overrides.microsoft = voiceId == null ? undefined : { voice: voiceId }; + } + + return overrides; +} + +function inferMimeType( + outputFormat: string | undefined, + fileExtension: string | undefined, +): string | undefined { + const normalizedOutput = outputFormat?.trim().toLowerCase(); + const normalizedExtension = fileExtension?.trim().toLowerCase(); + if ( + normalizedOutput === "mp3" || + normalizedOutput?.startsWith("mp3_") || + normalizedOutput?.endsWith("-mp3") || + normalizedExtension === ".mp3" + ) { + return "audio/mpeg"; + } + if ( + normalizedOutput === "opus" || + normalizedOutput?.startsWith("opus_") || + normalizedExtension === ".opus" || + normalizedExtension === ".ogg" + ) { + return "audio/ogg"; + } + if (normalizedOutput?.endsWith("-wav") || normalizedExtension === ".wav") { + return "audio/wav"; + } + if (normalizedOutput?.endsWith("-webm") || normalizedExtension === ".webm") { + return "audio/webm"; + } + return undefined; +} + export const talkHandlers: GatewayRequestHandlers = { "talk.config": async ({ params, respond, client }) => { if (!validateTalkConfigParams(params)) { @@ -65,6 +339,65 @@ export const talkHandlers: GatewayRequestHandlers = { respond(true, { config: configPayload }, undefined); }, + "talk.speak": async ({ params, respond }) => { + if (!validateTalkSpeakParams(params)) { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `invalid talk.speak params: ${formatValidationErrors(validateTalkSpeakParams.errors)}`, + ), + ); + return; + } + + const text = trimString((params as { text?: unknown }).text); + if (!text) { + respond(false, undefined, errorShape(ErrorCodes.INVALID_REQUEST, "talk.speak requires text")); + return; + } + + try { + const snapshot = await readConfigFileSnapshot(); + const setup = buildTalkTtsConfig(snapshot.config); + if ("error" in setup) { + respond(false, undefined, 
errorShape(ErrorCodes.UNAVAILABLE, setup.error)); + return; + } + + const overrides = buildTalkSpeakOverrides(setup.provider, setup.providerConfig, params); + const result = await synthesizeSpeech({ + text, + cfg: setup.cfg, + overrides, + disableFallback: true, + }); + if (!result.success || !result.audioBuffer) { + respond( + false, + undefined, + errorShape(ErrorCodes.UNAVAILABLE, result.error ?? "talk synthesis failed"), + ); + return; + } + + respond( + true, + { + audioBase64: result.audioBuffer.toString("base64"), + provider: result.provider ?? setup.provider, + outputFormat: result.outputFormat, + voiceCompatible: result.voiceCompatible, + mimeType: inferMimeType(result.outputFormat, result.fileExtension), + fileExtension: result.fileExtension, + }, + undefined, + ); + } catch (err) { + respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err))); + } + }, "talk.mode": ({ params, respond, context, client, isWebchatConnect }) => { if (client && isWebchatConnect(client.connect) && !context.hasConnectedMobileNode()) { respond( diff --git a/src/gateway/server.talk-config.test.ts b/src/gateway/server.talk-config.test.ts index a47addbb0e0..eb2925db158 100644 --- a/src/gateway/server.talk-config.test.ts +++ b/src/gateway/server.talk-config.test.ts @@ -1,6 +1,6 @@ import os from "node:os"; import path from "node:path"; -import { describe, expect, it } from "vitest"; +import { describe, expect, it, vi } from "vitest"; import { loadOrCreateDeviceIdentity, publicKeyRawBase64UrlFromPem, @@ -41,6 +41,13 @@ type TalkConfigPayload = { }; }; type TalkConfig = NonNullable["talk"]>; +type TalkSpeakPayload = { + audioBase64?: string; + provider?: string; + outputFormat?: string; + mimeType?: string; + fileExtension?: string; +}; const TALK_CONFIG_DEVICE_PATH = path.join( os.tmpdir(), `openclaw-talk-config-device-${process.pid}.json`, @@ -95,6 +102,10 @@ async function fetchTalkConfig( return rpcReq(ws, "talk.config", params ?? 
{}); } +async function fetchTalkSpeak(ws: GatewaySocket, params: Record) { + return rpcReq(ws, "talk.speak", params); +} + function expectElevenLabsTalkConfig( talk: TalkConfig | undefined, expected: { @@ -236,4 +247,58 @@ describe("gateway talk.config", () => { }); }); }); + + it("synthesizes talk audio via the active talk provider", async () => { + const { writeConfigFile } = await import("../config/config.js"); + await writeConfigFile({ + talk: { + provider: "openai", + providers: { + openai: { + apiKey: "openai-talk-key", // pragma: allowlist secret + voiceId: "alloy", + modelId: "gpt-4o-mini-tts", + }, + }, + }, + }); + + const originalFetch = globalThis.fetch; + const requestInits: RequestInit[] = []; + const fetchMock = vi.fn(async (_input: RequestInfo | URL, init?: RequestInit) => { + if (init) { + requestInits.push(init); + } + return new Response(new Uint8Array([1, 2, 3]), { status: 200 }); + }); + globalThis.fetch = fetchMock as typeof fetch; + + try { + await withServer(async (ws) => { + await connectOperator(ws, ["operator.read", "operator.write"]); + const res = await fetchTalkSpeak(ws, { + text: "Hello from talk mode.", + voiceId: "nova", + modelId: "tts-1", + speed: 1.25, + }); + expect(res.ok).toBe(true); + expect(res.payload?.provider).toBe("openai"); + expect(res.payload?.outputFormat).toBe("mp3"); + expect(res.payload?.mimeType).toBe("audio/mpeg"); + expect(res.payload?.fileExtension).toBe(".mp3"); + expect(res.payload?.audioBase64).toBe(Buffer.from([1, 2, 3]).toString("base64")); + }); + + expect(fetchMock).toHaveBeenCalled(); + const requestInit = requestInits.find((init) => typeof init.body === "string"); + expect(requestInit).toBeDefined(); + const body = JSON.parse(requestInit?.body as string) as Record; + expect(body.model).toBe("tts-1"); + expect(body.voice).toBe("nova"); + expect(body.speed).toBe(1.25); + } finally { + globalThis.fetch = originalFetch; + } + }); }); From f7fe75a68bb28ed2cf8631264991d52f20e219b0 Mon Sep 17 00:00:00 2001 
From: Ayaan Zaidi Date: Fri, 20 Mar 2026 10:27:48 +0530 Subject: [PATCH 04/11] refactor(android): simplify talk config parsing --- .../app/voice/TalkModeGatewayConfig.kt | 119 +---------------- .../app/voice/TalkModeConfigContractTest.kt | 100 --------------- .../app/voice/TalkModeConfigParsingTest.kt | 120 ++---------------- 3 files changed, 15 insertions(+), 324 deletions(-) delete mode 100644 apps/android/app/src/test/java/ai/openclaw/app/voice/TalkModeConfigContractTest.kt diff --git a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeGatewayConfig.kt b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeGatewayConfig.kt index 58208acc0bb..d0545b2baf0 100644 --- a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeGatewayConfig.kt +++ b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeGatewayConfig.kt @@ -4,116 +4,23 @@ import ai.openclaw.app.normalizeMainKey import kotlinx.serialization.json.JsonElement import kotlinx.serialization.json.JsonObject import kotlinx.serialization.json.JsonPrimitive -import kotlinx.serialization.json.buildJsonObject import kotlinx.serialization.json.booleanOrNull import kotlinx.serialization.json.contentOrNull -internal data class TalkProviderConfigSelection( - val provider: String, - val config: JsonObject, - val normalizedPayload: Boolean, -) - internal data class TalkModeGatewayConfigState( - val activeProvider: String, - val normalizedPayload: Boolean, - val missingResolvedPayload: Boolean, val mainSessionKey: String, - val defaultVoiceId: String?, - val voiceAliases: Map, - val defaultModelId: String, - val defaultOutputFormat: String, - val apiKey: String?, val interruptOnSpeech: Boolean?, val silenceTimeoutMs: Long, ) internal object TalkModeGatewayConfigParser { - private const val defaultTalkProvider = "elevenlabs" - - fun parse( - config: JsonObject?, - defaultProvider: String, - defaultModelIdFallback: String, - defaultOutputFormatFallback: String, - envVoice: String?, - 
sagVoice: String?, - envKey: String?, - ): TalkModeGatewayConfigState { + fun parse(config: JsonObject?): TalkModeGatewayConfigState { val talk = config?.get("talk").asObjectOrNull() - val selection = selectTalkProviderConfig(talk) - val activeProvider = selection?.provider ?: defaultProvider - val activeConfig = selection?.config val sessionCfg = config?.get("session").asObjectOrNull() - val mainKey = normalizeMainKey(sessionCfg?.get("mainKey").asStringOrNull()) - val voice = activeConfig?.get("voiceId")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } - val aliases = - activeConfig?.get("voiceAliases").asObjectOrNull()?.entries?.mapNotNull { (key, value) -> - val id = value.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } ?: return@mapNotNull null - normalizeTalkAliasKey(key).takeIf { it.isNotEmpty() }?.let { it to id } - }?.toMap().orEmpty() - val model = activeConfig?.get("modelId")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } - val outputFormat = - activeConfig?.get("outputFormat")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } - val key = activeConfig?.get("apiKey")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } - val interrupt = talk?.get("interruptOnSpeech")?.asBooleanOrNull() - val silenceTimeoutMs = resolvedSilenceTimeoutMs(talk) - return TalkModeGatewayConfigState( - activeProvider = activeProvider, - normalizedPayload = selection?.normalizedPayload == true, - missingResolvedPayload = talk != null && selection == null, - mainSessionKey = mainKey, - defaultVoiceId = - if (activeProvider == defaultProvider) { - voice ?: envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() } - } else { - voice - }, - voiceAliases = aliases, - defaultModelId = model ?: defaultModelIdFallback, - defaultOutputFormat = outputFormat ?: defaultOutputFormatFallback, - apiKey = key ?: envKey?.takeIf { it.isNotEmpty() }, - interruptOnSpeech = interrupt, - silenceTimeoutMs = silenceTimeoutMs, - ) - } - - fun fallback( - 
defaultProvider: String, - defaultModelIdFallback: String, - defaultOutputFormatFallback: String, - envVoice: String?, - sagVoice: String?, - envKey: String?, - ): TalkModeGatewayConfigState = - TalkModeGatewayConfigState( - activeProvider = defaultProvider, - normalizedPayload = false, - missingResolvedPayload = false, - mainSessionKey = "main", - defaultVoiceId = envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() }, - voiceAliases = emptyMap(), - defaultModelId = defaultModelIdFallback, - defaultOutputFormat = defaultOutputFormatFallback, - apiKey = envKey?.takeIf { it.isNotEmpty() }, - interruptOnSpeech = null, - silenceTimeoutMs = TalkDefaults.defaultSilenceTimeoutMs, - ) - - fun selectTalkProviderConfig(talk: JsonObject?): TalkProviderConfigSelection? { - if (talk == null) return null - selectResolvedTalkProviderConfig(talk)?.let { return it } - val rawProvider = talk["provider"].asStringOrNull() - val rawProviders = talk["providers"].asObjectOrNull() - val hasNormalizedPayload = rawProvider != null || rawProviders != null - if (hasNormalizedPayload) { - return null - } - return TalkProviderConfigSelection( - provider = defaultTalkProvider, - config = talk, - normalizedPayload = false, + mainSessionKey = normalizeMainKey(sessionCfg?.get("mainKey").asStringOrNull()), + interruptOnSpeech = talk?.get("interruptOnSpeech").asBooleanOrNull(), + silenceTimeoutMs = resolvedSilenceTimeoutMs(talk), ) } @@ -127,26 +34,8 @@ internal object TalkModeGatewayConfigParser { } return timeout.toLong() } - - private fun selectResolvedTalkProviderConfig(talk: JsonObject): TalkProviderConfigSelection? 
{ - val resolved = talk["resolved"].asObjectOrNull() ?: return null - val providerId = normalizeTalkProviderId(resolved["provider"].asStringOrNull()) ?: return null - return TalkProviderConfigSelection( - provider = providerId, - config = resolved["config"].asObjectOrNull() ?: buildJsonObject {}, - normalizedPayload = true, - ) - } - - private fun normalizeTalkProviderId(raw: String?): String? { - val trimmed = raw?.trim()?.lowercase().orEmpty() - return trimmed.takeIf { it.isNotEmpty() } - } } -private fun normalizeTalkAliasKey(value: String): String = - value.trim().lowercase() - private fun JsonElement?.asStringOrNull(): String? = this?.let { element -> element as? JsonPrimitive diff --git a/apps/android/app/src/test/java/ai/openclaw/app/voice/TalkModeConfigContractTest.kt b/apps/android/app/src/test/java/ai/openclaw/app/voice/TalkModeConfigContractTest.kt deleted file mode 100644 index ca9be8b1280..00000000000 --- a/apps/android/app/src/test/java/ai/openclaw/app/voice/TalkModeConfigContractTest.kt +++ /dev/null @@ -1,100 +0,0 @@ -package ai.openclaw.app.voice - -import java.io.File -import kotlinx.serialization.SerialName -import kotlinx.serialization.Serializable -import kotlinx.serialization.json.Json -import kotlinx.serialization.json.JsonObject -import kotlinx.serialization.json.JsonPrimitive -import org.junit.Assert.assertEquals -import org.junit.Assert.assertNotNull -import org.junit.Assert.assertNull -import org.junit.Test - -@Serializable -private data class TalkConfigContractFixture( - @SerialName("selectionCases") val selectionCases: List, - @SerialName("timeoutCases") val timeoutCases: List, -) { - @Serializable - data class SelectionCase( - val id: String, - val defaultProvider: String, - val payloadValid: Boolean, - val expectedSelection: ExpectedSelection? = null, - val talk: JsonObject, - ) - - @Serializable - data class ExpectedSelection( - val provider: String, - val normalizedPayload: Boolean, - val voiceId: String? 
= null, - val apiKey: String? = null, - ) - - @Serializable - data class TimeoutCase( - val id: String, - val fallback: Long, - val expectedTimeoutMs: Long, - val talk: JsonObject, - ) -} - -class TalkModeConfigContractTest { - private val json = Json { ignoreUnknownKeys = true } - - @Test - fun selectionFixtures() { - for (fixture in loadFixtures().selectionCases) { - val selection = TalkModeGatewayConfigParser.selectTalkProviderConfig(fixture.talk) - val expected = fixture.expectedSelection - if (expected == null) { - assertNull(fixture.id, selection) - continue - } - assertNotNull(fixture.id, selection) - assertEquals(fixture.id, expected.provider, selection?.provider) - assertEquals(fixture.id, expected.normalizedPayload, selection?.normalizedPayload) - assertEquals( - fixture.id, - expected.voiceId, - (selection?.config?.get("voiceId") as? JsonPrimitive)?.content, - ) - assertEquals( - fixture.id, - expected.apiKey, - (selection?.config?.get("apiKey") as? JsonPrimitive)?.content, - ) - assertEquals(fixture.id, true, fixture.payloadValid) - } - } - - @Test - fun timeoutFixtures() { - for (fixture in loadFixtures().timeoutCases) { - val timeout = TalkModeGatewayConfigParser.resolvedSilenceTimeoutMs(fixture.talk) - assertEquals(fixture.id, fixture.expectedTimeoutMs, timeout) - assertEquals(fixture.id, TalkDefaults.defaultSilenceTimeoutMs, fixture.fallback) - } - } - - private fun loadFixtures(): TalkConfigContractFixture { - val fixturePath = findFixtureFile() - return json.decodeFromString(File(fixturePath).readText()) - } - - private fun findFixtureFile(): String { - val startDir = System.getProperty("user.dir") ?: error("user.dir unavailable") - var current = File(startDir).absoluteFile - while (true) { - val candidate = File(current, "test-fixtures/talk-config-contract.json") - if (candidate.exists()) { - return candidate.absolutePath - } - current = current.parentFile ?: break - } - error("talk-config-contract.json not found from $startDir") - } -} diff 
--git a/apps/android/app/src/test/java/ai/openclaw/app/voice/TalkModeConfigParsingTest.kt b/apps/android/app/src/test/java/ai/openclaw/app/voice/TalkModeConfigParsingTest.kt index e9c46231961..79f0cb94074 100644 --- a/apps/android/app/src/test/java/ai/openclaw/app/voice/TalkModeConfigParsingTest.kt +++ b/apps/android/app/src/test/java/ai/openclaw/app/voice/TalkModeConfigParsingTest.kt @@ -2,135 +2,37 @@ package ai.openclaw.app.voice import kotlinx.serialization.json.Json import kotlinx.serialization.json.buildJsonObject -import kotlinx.serialization.json.jsonPrimitive import kotlinx.serialization.json.jsonObject import kotlinx.serialization.json.put import org.junit.Assert.assertEquals -import org.junit.Assert.assertNotNull -import org.junit.Assert.assertTrue import org.junit.Test class TalkModeConfigParsingTest { private val json = Json { ignoreUnknownKeys = true } @Test - fun prefersCanonicalResolvedTalkProviderPayload() { - val talk = + fun readsMainSessionKeyAndInterruptFlag() { + val config = json.parseToJsonElement( """ { - "resolved": { - "provider": "elevenlabs", - "config": { - "voiceId": "voice-resolved" - } + "talk": { + "interruptOnSpeech": true, + "silenceTimeoutMs": 1800 }, - "provider": "elevenlabs", - "providers": { - "elevenlabs": { - "voiceId": "voice-normalized" - } + "session": { + "mainKey": "voice-main" } } """.trimIndent(), ) .jsonObject - val selection = TalkModeGatewayConfigParser.selectTalkProviderConfig(talk) - assertNotNull(selection) - assertEquals("elevenlabs", selection?.provider) - assertTrue(selection?.normalizedPayload == true) - assertEquals("voice-resolved", selection?.config?.get("voiceId")?.jsonPrimitive?.content) - } + val parsed = TalkModeGatewayConfigParser.parse(config) - @Test - fun prefersNormalizedTalkProviderPayload() { - val talk = - json.parseToJsonElement( - """ - { - "provider": "elevenlabs", - "providers": { - "elevenlabs": { - "voiceId": "voice-normalized" - } - }, - "voiceId": "voice-legacy" - } - 
""".trimIndent(), - ) - .jsonObject - - val selection = TalkModeGatewayConfigParser.selectTalkProviderConfig(talk) - assertEquals(null, selection) - } - - @Test - fun rejectsNormalizedTalkProviderPayloadWhenProviderMissingFromProviders() { - val talk = - json.parseToJsonElement( - """ - { - "provider": "acme", - "providers": { - "elevenlabs": { - "voiceId": "voice-normalized" - } - } - } - """.trimIndent(), - ) - .jsonObject - - val selection = TalkModeGatewayConfigParser.selectTalkProviderConfig(talk) - assertEquals(null, selection) - } - - @Test - fun rejectsNormalizedTalkProviderPayloadWhenProviderIsAmbiguous() { - val talk = - json.parseToJsonElement( - """ - { - "providers": { - "acme": { - "voiceId": "voice-acme" - }, - "elevenlabs": { - "voiceId": "voice-normalized" - } - } - } - """.trimIndent(), - ) - .jsonObject - - val selection = TalkModeGatewayConfigParser.selectTalkProviderConfig(talk) - assertEquals(null, selection) - } - - @Test - fun fallsBackToLegacyTalkFieldsWhenNormalizedPayloadMissing() { - val legacyApiKey = "legacy-key" // pragma: allowlist secret - val talk = - buildJsonObject { - put("voiceId", "voice-legacy") - put("apiKey", legacyApiKey) // pragma: allowlist secret - } - - val selection = TalkModeGatewayConfigParser.selectTalkProviderConfig(talk) - assertNotNull(selection) - assertEquals("elevenlabs", selection?.provider) - assertTrue(selection?.normalizedPayload == false) - assertEquals("voice-legacy", selection?.config?.get("voiceId")?.jsonPrimitive?.content) - assertEquals("legacy-key", selection?.config?.get("apiKey")?.jsonPrimitive?.content) - } - - @Test - fun readsConfiguredSilenceTimeoutMs() { - val talk = buildJsonObject { put("silenceTimeoutMs", 1500) } - - assertEquals(1500L, TalkModeGatewayConfigParser.resolvedSilenceTimeoutMs(talk)) + assertEquals("voice-main", parsed.mainSessionKey) + assertEquals(true, parsed.interruptOnSpeech) + assertEquals(1800L, parsed.silenceTimeoutMs) } @Test From 
e3afaca1a61de4a821518024599fee0c9dcff228 Mon Sep 17 00:00:00 2001 From: Ayaan Zaidi Date: Fri, 20 Mar 2026 10:28:28 +0530 Subject: [PATCH 05/11] refactor(android): route talk playback through gateway --- .../ai/openclaw/app/voice/TalkModeManager.kt | 943 ++---------------- 1 file changed, 106 insertions(+), 837 deletions(-) diff --git a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt index 70b6113fc35..4ba2c2ef043 100644 --- a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt +++ b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt @@ -6,9 +6,7 @@ import android.content.Intent import android.content.pm.PackageManager import android.media.AudioAttributes import android.media.AudioFocusRequest -import android.media.AudioFormat import android.media.AudioManager -import android.media.AudioTrack import android.media.MediaPlayer import android.os.Bundle import android.os.Handler @@ -17,16 +15,12 @@ import android.os.SystemClock import android.speech.RecognitionListener import android.speech.RecognizerIntent import android.speech.SpeechRecognizer -import android.speech.tts.TextToSpeech -import android.speech.tts.UtteranceProgressListener +import android.util.Base64 import android.util.Log import androidx.core.content.ContextCompat import ai.openclaw.app.gateway.GatewaySession import ai.openclaw.app.isCanonicalMainSessionKey -import ai.openclaw.app.normalizeMainKey import java.io.File -import java.net.HttpURLConnection -import java.net.URL import java.util.UUID import java.util.concurrent.atomic.AtomicLong import kotlinx.coroutines.CancellationException @@ -46,7 +40,6 @@ import kotlinx.serialization.json.JsonElement import kotlinx.serialization.json.JsonObject import kotlinx.serialization.json.JsonPrimitive import kotlinx.serialization.json.buildJsonObject -import kotlin.math.max class TalkModeManager( private val context: Context, @@ 
-57,9 +50,6 @@ class TalkModeManager( ) { companion object { private const val tag = "TalkMode" - private const val defaultModelIdFallback = "eleven_v3" - private const val defaultOutputFormatFallback = "pcm_24000" - private const val defaultTalkProvider = "elevenlabs" private const val listenWatchdogMs = 12_000L private const val chatFinalWaitWithSubscribeMs = 45_000L private const val chatFinalWaitWithoutSubscribeMs = 6_000L @@ -84,9 +74,6 @@ class TalkModeManager( private val _lastAssistantText = MutableStateFlow(null) val lastAssistantText: StateFlow = _lastAssistantText - private val _usingFallbackTts = MutableStateFlow(false) - val usingFallbackTts: StateFlow = _usingFallbackTts - private var recognizer: SpeechRecognizer? = null private var restartJob: Job? = null private var stopRequested = false @@ -99,21 +86,11 @@ class TalkModeManager( private var lastSpokenText: String? = null private var lastInterruptedAtSeconds: Double? = null - private var defaultVoiceId: String? = null private var currentVoiceId: String? = null - private var fallbackVoiceId: String? = null - private var defaultModelId: String? = null private var currentModelId: String? = null - private var defaultOutputFormat: String? = null - private var apiKey: String? = null - private var voiceAliases: Map = emptyMap() // Interrupt-on-speech is disabled by default: starting a SpeechRecognizer during - // TTS creates an audio session conflict on OxygenOS/OnePlus that causes AudioTrack - // write to return 0 and MediaPlayer to error. Can be enabled via gateway talk config. - private var activeProviderIsElevenLabs: Boolean = true + // TTS creates an audio session conflict on some OEMs. Can be enabled via gateway talk config. private var interruptOnSpeech: Boolean = false - private var voiceOverrideActive = false - private var modelOverrideActive = false private var mainSessionKey: String = "main" @Volatile private var pendingRunId: String? 
= null @@ -128,14 +105,8 @@ class TalkModeManager( private var ttsJob: Job? = null private var player: MediaPlayer? = null - private var streamingSource: StreamingMediaDataSource? = null - private var pcmTrack: AudioTrack? = null - @Volatile private var pcmStopRequested = false @Volatile private var finalizeInFlight = false private var listenWatchdogJob: Job? = null - private var systemTts: TextToSpeech? = null - private var systemTtsPending: CompletableDeferred? = null - private var systemTtsPendingId: String? = null private var audioFocusRequest: AudioFocusRequest? = null private val audioFocusListener = AudioManager.OnAudioFocusChangeListener { focusChange -> @@ -208,118 +179,6 @@ class TalkModeManager( /** When true, play TTS for all final chat responses (even ones we didn't initiate). */ @Volatile var ttsOnAllResponses = false - // Streaming TTS: active session keyed by runId - private var streamingTts: ElevenLabsStreamingTts? = null - private var streamingFullText: String = "" - @Volatile private var lastHandledStreamingRunId: String? = null - private var drainingTts: ElevenLabsStreamingTts? = null - - private fun stopActiveStreamingTts() { - streamingTts?.stop() - streamingTts = null - drainingTts?.stop() - drainingTts = null - streamingFullText = "" - } - - /** Handle agent stream events — only speak assistant text, not tool calls or thinking. */ - private fun handleAgentStreamEvent(payloadJson: String?) { - if (payloadJson.isNullOrBlank()) return - val payload = try { - json.parseToJsonElement(payloadJson).asObjectOrNull() - } catch (_: Throwable) { null } ?: return - - // Only speak events for the active session — prevents TTS leaking from - // concurrent sessions/channels (privacy + correctness). 
- val eventSession = payload["sessionKey"]?.asStringOrNull() - val activeSession = mainSessionKey.ifBlank { "main" } - if (eventSession != null && eventSession != activeSession) return - - val stream = payload["stream"]?.asStringOrNull() ?: return - if (stream != "assistant") return // Only speak assistant text - val data = payload["data"]?.asObjectOrNull() ?: return - if (data["type"]?.asStringOrNull() == "thinking") return // Skip thinking tokens - val text = data["text"]?.asStringOrNull()?.trim() ?: return - if (text.isEmpty()) return - if (!playbackEnabled) { - stopActiveStreamingTts() - return - } - - // Start streaming session if not already active - if (streamingTts == null) { - if (!activeProviderIsElevenLabs) return // Non-ElevenLabs provider — skip streaming TTS - val voiceId = currentVoiceId ?: defaultVoiceId - val apiKey = this.apiKey - if (voiceId == null || apiKey == null) { - Log.w(tag, "streaming TTS: missing voiceId or apiKey") - return - } - val modelId = currentModelId ?: defaultModelId ?: "" - val streamModel = if (ElevenLabsStreamingTts.supportsStreaming(modelId)) { - modelId - } else { - "eleven_flash_v2_5" - } - val tts = ElevenLabsStreamingTts( - scope = scope, - voiceId = voiceId, - apiKey = apiKey, - modelId = streamModel, - outputFormat = "pcm_24000", - sampleRate = 24000, - ) - streamingTts = tts - streamingFullText = "" - _isSpeaking.value = true - _statusText.value = "Speaking…" - tts.start() - Log.d(tag, "streaming TTS started for agent assistant text") - lastHandledStreamingRunId = null // will be set on final - } - - val accepted = streamingTts?.sendText(text) ?: false - if (!accepted && streamingTts != null) { - Log.d(tag, "text diverged, restarting streaming TTS") - streamingTts?.stop() - streamingTts = null - // Restart with the new text - val voiceId2 = currentVoiceId ?: defaultVoiceId - val apiKey2 = this.apiKey - if (voiceId2 != null && apiKey2 != null) { - val modelId2 = currentModelId ?: defaultModelId ?: "" - val 
streamModel2 = if (ElevenLabsStreamingTts.supportsStreaming(modelId2)) modelId2 else "eleven_flash_v2_5" - val newTts = ElevenLabsStreamingTts( - scope = scope, voiceId = voiceId2, apiKey = apiKey2, - modelId = streamModel2, outputFormat = "pcm_24000", sampleRate = 24000, - ) - streamingTts = newTts - streamingFullText = text - newTts.start() - newTts.sendText(streamingFullText) - Log.d(tag, "streaming TTS restarted with new text") - } - } - } - - /** Called when chat final/error/aborted arrives — finish any active streaming TTS. */ - private fun finishStreamingTts() { - streamingFullText = "" - val tts = streamingTts ?: return - // Null out immediately so the next response creates a fresh TTS instance. - // The drain coroutine below holds a reference to this instance for cleanup. - streamingTts = null - drainingTts = tts - tts.finish() - scope.launch { - delay(500) - while (tts.isPlaying.value) { delay(200) } - if (drainingTts === tts) drainingTts = null - _isSpeaking.value = false - _statusText.value = "Ready" - } - } - fun playTtsForText(text: String) { val playbackToken = playbackGeneration.incrementAndGet() ttsJob?.cancel() @@ -338,7 +197,6 @@ class TalkModeManager( Log.d(tag, "gateway event: $event") } if (event == "agent" && ttsOnAllResponses) { - handleAgentStreamEvent(payloadJson) return } if (event != "chat") return @@ -362,27 +220,10 @@ class TalkModeManager( // Otherwise, if ttsOnAllResponses, finish streaming TTS on terminal events. val pending = pendingRunId if (pending == null || runId != pending) { - if (ttsOnAllResponses && state in listOf("final", "error", "aborted")) { - // Skip if we already handled TTS for this run (multiple final events - // can arrive on different threads for the same run). 
- if (lastHandledStreamingRunId == runId) { - if (pending == null || runId != pending) return - } - lastHandledStreamingRunId = runId - val stts = streamingTts - if (stts != null) { - // Finish streaming and let the drain coroutine handle playback completion. - // Don’t check hasReceivedAudio synchronously — audio may still be in flight - // from the WebSocket (EOS was just sent). The drain coroutine in finishStreamingTts - // waits for playback to complete; if ElevenLabs truly fails, the user just won’t - // hear anything (silent failure is better than double-speaking with system TTS). - finishStreamingTts() - } else if (state == "final") { - // No streaming was active — fall back to non-streaming - val text = extractTextFromChatEventMessage(obj["message"]) - if (!text.isNullOrBlank()) { - playTtsForText(text) - } + if (ttsOnAllResponses && state == "final") { + val text = extractTextFromChatEventMessage(obj["message"]) + if (!text.isNullOrBlank()) { + playTtsForText(text) } } if (pending == null || runId != pending) return @@ -419,7 +260,6 @@ class TalkModeManager( playbackEnabled = enabled if (!enabled) { playbackGeneration.incrementAndGet() - stopActiveStreamingTts() stopSpeaking() } } @@ -485,7 +325,6 @@ class TalkModeManager( _isListening.value = false _statusText.value = "Off" stopSpeaking() - _usingFallbackTts.value = false chatSubscribedSessionKey = null pendingRunId = null pendingFinal?.cancel() @@ -500,10 +339,6 @@ class TalkModeManager( recognizer?.destroy() recognizer = null } - systemTts?.stop() - systemTtsPending?.cancel() - systemTtsPending = null - systemTtsPendingId = null } private fun startListeningInternal(markListening: Boolean) { @@ -813,59 +648,19 @@ class TalkModeManager( _lastAssistantText.value = cleaned val requestedVoice = directive?.voiceId?.trim()?.takeIf { it.isNotEmpty() } - val resolvedVoice = TalkModeVoiceResolver.resolveVoiceAlias(requestedVoice, voiceAliases) - if (requestedVoice != null && resolvedVoice == null) { - Log.w(tag, 
"unknown voice alias: $requestedVoice") - } if (directive?.voiceId != null) { if (directive.once != true) { - currentVoiceId = resolvedVoice - voiceOverrideActive = true + currentVoiceId = requestedVoice } } if (directive?.modelId != null) { if (directive.once != true) { - currentModelId = directive.modelId - modelOverrideActive = true + currentModelId = directive.modelId?.trim()?.takeIf { it.isNotEmpty() } } } ensurePlaybackActive(playbackToken) - val apiKey = - apiKey?.trim()?.takeIf { it.isNotEmpty() } - ?: System.getenv("ELEVENLABS_API_KEY")?.trim() - val preferredVoice = resolvedVoice ?: currentVoiceId ?: defaultVoiceId - val resolvedPlaybackVoice = - if (!apiKey.isNullOrEmpty()) { - try { - TalkModeVoiceResolver.resolveVoiceId( - preferred = preferredVoice, - fallbackVoiceId = fallbackVoiceId, - defaultVoiceId = defaultVoiceId, - currentVoiceId = currentVoiceId, - voiceOverrideActive = voiceOverrideActive, - listVoices = { TalkModeVoiceResolver.listVoices(apiKey, json) }, - ) - } catch (err: Throwable) { - Log.w(tag, "list voices failed: ${err.message ?: err::class.simpleName}") - null - } - } else { - null - } - resolvedPlaybackVoice?.let { resolved -> - fallbackVoiceId = resolved.fallbackVoiceId - defaultVoiceId = resolved.defaultVoiceId - currentVoiceId = resolved.currentVoiceId - resolved.selectedVoiceName?.let { name -> - resolved.voiceId?.let { voiceId -> - Log.d(tag, "default voice selected $name ($voiceId)") - } - } - } - val voiceId = resolvedPlaybackVoice?.voiceId - _statusText.value = "Speaking…" _isSpeaking.value = true lastSpokenText = cleaned @@ -873,210 +668,99 @@ class TalkModeManager( requestAudioFocusForTts() try { - val canUseElevenLabs = !voiceId.isNullOrBlank() && !apiKey.isNullOrEmpty() - if (!canUseElevenLabs) { - if (voiceId.isNullOrBlank()) { - Log.w(tag, "missing voiceId; falling back to system voice") - } - if (apiKey.isNullOrEmpty()) { - Log.w(tag, "missing ELEVENLABS_API_KEY; falling back to system voice") - } - 
ensurePlaybackActive(playbackToken) - _usingFallbackTts.value = true - _statusText.value = "Speaking (System)…" - speakWithSystemTts(cleaned, playbackToken) - } else { - _usingFallbackTts.value = false - val ttsStarted = SystemClock.elapsedRealtime() - val modelId = directive?.modelId ?: currentModelId ?: defaultModelId - val request = - ElevenLabsRequest( - text = cleaned, - modelId = modelId, - outputFormat = - TalkModeRuntime.validatedOutputFormat(directive?.outputFormat ?: defaultOutputFormat), - speed = TalkModeRuntime.resolveSpeed(directive?.speed, directive?.rateWpm), - stability = TalkModeRuntime.validatedStability(directive?.stability, modelId), - similarity = TalkModeRuntime.validatedUnit(directive?.similarity), - style = TalkModeRuntime.validatedUnit(directive?.style), - speakerBoost = directive?.speakerBoost, - seed = TalkModeRuntime.validatedSeed(directive?.seed), - normalize = TalkModeRuntime.validatedNormalize(directive?.normalize), - language = TalkModeRuntime.validatedLanguage(directive?.language), - latencyTier = TalkModeRuntime.validatedLatencyTier(directive?.latencyTier), - ) - streamAndPlay(voiceId = voiceId!!, apiKey = apiKey!!, request = request, playbackToken = playbackToken) - Log.d(tag, "elevenlabs stream ok durMs=${SystemClock.elapsedRealtime() - ttsStarted}") - } + val ttsStarted = SystemClock.elapsedRealtime() + val speech = requestTalkSpeak(cleaned, directive) + playGatewaySpeech(speech, playbackToken) + Log.d(tag, "talk.speak ok durMs=${SystemClock.elapsedRealtime() - ttsStarted} provider=${speech.provider}") } catch (err: Throwable) { if (isPlaybackCancelled(err, playbackToken)) { Log.d(tag, "assistant speech cancelled") return } - Log.w(tag, "speak failed: ${err.message ?: err::class.simpleName}; falling back to system voice") - try { - ensurePlaybackActive(playbackToken) - _usingFallbackTts.value = true - _statusText.value = "Speaking (System)…" - speakWithSystemTts(cleaned, playbackToken) - } catch (fallbackErr: Throwable) { - if 
(isPlaybackCancelled(fallbackErr, playbackToken)) { - Log.d(tag, "assistant fallback speech cancelled") - return - } - _statusText.value = "Speak failed: ${fallbackErr.message ?: fallbackErr::class.simpleName}" - Log.w(tag, "system voice failed: ${fallbackErr.message ?: fallbackErr::class.simpleName}") - } + _statusText.value = "Speak failed: ${err.message ?: err::class.simpleName}" + Log.w(tag, "talk.speak failed: ${err.message ?: err::class.simpleName}") } finally { _isSpeaking.value = false } } - private suspend fun streamAndPlay( - voiceId: String, - apiKey: String, - request: ElevenLabsRequest, - playbackToken: Long, - ) { + private data class GatewayTalkSpeech( + val audioBase64: String, + val provider: String, + val outputFormat: String?, + val mimeType: String?, + val fileExtension: String?, + ) + + private suspend fun requestTalkSpeak(text: String, directive: TalkDirective?): GatewayTalkSpeech { + val modelId = + directive?.modelId?.trim()?.takeIf { it.isNotEmpty() } ?: currentModelId?.trim()?.takeIf { it.isNotEmpty() } + val voiceId = + directive?.voiceId?.trim()?.takeIf { it.isNotEmpty() } ?: currentVoiceId?.trim()?.takeIf { it.isNotEmpty() } + val params = + buildJsonObject { + put("text", JsonPrimitive(text)) + voiceId?.let { put("voiceId", JsonPrimitive(it)) } + modelId?.let { put("modelId", JsonPrimitive(it)) } + TalkModeRuntime.resolveSpeed(directive?.speed, directive?.rateWpm)?.let { + put("speed", JsonPrimitive(it)) + } + TalkModeRuntime.validatedStability(directive?.stability, modelId)?.let { + put("stability", JsonPrimitive(it)) + } + TalkModeRuntime.validatedUnit(directive?.similarity)?.let { + put("similarity", JsonPrimitive(it)) + } + TalkModeRuntime.validatedUnit(directive?.style)?.let { + put("style", JsonPrimitive(it)) + } + directive?.speakerBoost?.let { put("speakerBoost", JsonPrimitive(it)) } + TalkModeRuntime.validatedSeed(directive?.seed)?.let { put("seed", JsonPrimitive(it)) } + 
TalkModeRuntime.validatedNormalize(directive?.normalize)?.let { + put("normalize", JsonPrimitive(it)) + } + TalkModeRuntime.validatedLanguage(directive?.language)?.let { + put("language", JsonPrimitive(it)) + } + } + val res = session.request("talk.speak", params.toString()) + val root = json.parseToJsonElement(res).asObjectOrNull() ?: error("talk.speak returned invalid JSON") + val audioBase64 = root["audioBase64"].asStringOrNull()?.trim().orEmpty() + val provider = root["provider"].asStringOrNull()?.trim().orEmpty() + if (audioBase64.isEmpty()) { + error("talk.speak missing audioBase64") + } + if (provider.isEmpty()) { + error("talk.speak missing provider") + } + return GatewayTalkSpeech( + audioBase64 = audioBase64, + provider = provider, + outputFormat = root["outputFormat"].asStringOrNull()?.trim(), + mimeType = root["mimeType"].asStringOrNull()?.trim(), + fileExtension = root["fileExtension"].asStringOrNull()?.trim(), + ) + } + + private suspend fun playGatewaySpeech(speech: GatewayTalkSpeech, playbackToken: Long) { ensurePlaybackActive(playbackToken) stopSpeaking(resetInterrupt = false) ensurePlaybackActive(playbackToken) - pcmStopRequested = false - val pcmSampleRate = TalkModeRuntime.parsePcmSampleRate(request.outputFormat) - if (pcmSampleRate != null) { + val audioBytes = try { - streamAndPlayPcm( - voiceId = voiceId, - apiKey = apiKey, - request = request, - sampleRate = pcmSampleRate, - playbackToken = playbackToken, - ) - return - } catch (err: Throwable) { - if (isPlaybackCancelled(err, playbackToken) || pcmStopRequested) return - Log.w(tag, "pcm playback failed; falling back to mp3: ${err.message ?: err::class.simpleName}") + Base64.decode(speech.audioBase64, Base64.DEFAULT) + } catch (err: IllegalArgumentException) { + throw IllegalStateException("talk.speak returned invalid audio", err) } - } - - // When falling back from PCM, rewrite format to MP3 and download to file. 
- // File-based playback avoids custom DataSource races and is reliable across OEMs. - val mp3Request = if (request.outputFormat?.startsWith("pcm_") == true) { - request.copy(outputFormat = "mp3_44100_128") - } else { - request - } - streamAndPlayMp3(voiceId = voiceId, apiKey = apiKey, request = mp3Request, playbackToken = playbackToken) - } - - private suspend fun streamAndPlayMp3( - voiceId: String, - apiKey: String, - request: ElevenLabsRequest, - playbackToken: Long, - ) { - val dataSource = StreamingMediaDataSource() - streamingSource = dataSource - - val player = MediaPlayer() - this.player = player - - val prepared = CompletableDeferred() - val finished = CompletableDeferred() - - player.setAudioAttributes( - AudioAttributes.Builder() - .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH) - .setUsage(AudioAttributes.USAGE_MEDIA) - .build(), - ) - player.setOnPreparedListener { - it.start() - prepared.complete(Unit) - } - player.setOnCompletionListener { - finished.complete(Unit) - } - player.setOnErrorListener { _, _, _ -> - finished.completeExceptionally(IllegalStateException("MediaPlayer error")) - true - } - - player.setDataSource(dataSource) - withContext(Dispatchers.Main) { - player.prepareAsync() - } - - val fetchError = CompletableDeferred() - val fetchJob = - scope.launch(Dispatchers.IO) { - try { - streamTts(voiceId = voiceId, apiKey = apiKey, request = request, sink = dataSource, playbackToken = playbackToken) - fetchError.complete(null) - } catch (err: Throwable) { - dataSource.fail() - fetchError.complete(err) + val suffix = resolveGatewayAudioSuffix(speech) + val tempFile = + withContext(Dispatchers.IO) { + File.createTempFile("tts_", suffix, context.cacheDir).apply { + writeBytes(audioBytes) } } - - Log.d(tag, "play start") - try { - ensurePlaybackActive(playbackToken) - prepared.await() - ensurePlaybackActive(playbackToken) - finished.await() - ensurePlaybackActive(playbackToken) - fetchError.await()?.let { throw it } - } finally { - 
fetchJob.cancel() - cleanupPlayer() - } - Log.d(tag, "play done") - } - - /** - * Download ElevenLabs audio to a temp file, then play from disk via MediaPlayer. - * Simpler and more reliable than streaming: avoids custom DataSource races and - * AudioTrack underrun issues on OxygenOS/OnePlus. - */ - private suspend fun streamAndPlayViaFile(voiceId: String, apiKey: String, request: ElevenLabsRequest) { - val tempFile = withContext(Dispatchers.IO) { - val file = File.createTempFile("tts_", ".mp3", context.cacheDir) - val conn = openTtsConnection(voiceId = voiceId, apiKey = apiKey, request = request) - try { - val payload = buildRequestPayload(request) - conn.outputStream.use { it.write(payload.toByteArray()) } - val code = conn.responseCode - if (code >= 400) { - val body = conn.errorStream?.readBytes()?.toString(Charsets.UTF_8) ?: "" - file.delete() - throw IllegalStateException("ElevenLabs failed: $code $body") - } - Log.d(tag, "elevenlabs http code=$code voiceId=$voiceId format=${request.outputFormat}") - // Manual loop so cancellation is honoured on every chunk. - // input.copyTo() is a single blocking call with no yield points; if the - // coroutine is cancelled mid-download the entire response would finish - // before cancellation was observed. 
- conn.inputStream.use { input -> - file.outputStream().use { out -> - val buf = ByteArray(8192) - var n: Int - while (input.read(buf).also { n = it } != -1) { - ensureActive() - out.write(buf, 0, n) - } - } - } - } catch (err: Throwable) { - file.delete() - throw err - } finally { - conn.disconnect() - } - file - } try { val player = MediaPlayer() this.player = player @@ -1094,181 +778,45 @@ class TalkModeManager( } player.setDataSource(tempFile.absolutePath) withContext(Dispatchers.IO) { player.prepare() } - Log.d(tag, "file play start bytes=${tempFile.length()}") + ensurePlaybackActive(playbackToken) player.start() finished.await() - Log.d(tag, "file play done") + ensurePlaybackActive(playbackToken) } finally { - try { cleanupPlayer() } catch (_: Throwable) {} + try { + cleanupPlayer() + } catch (_: Throwable) {} tempFile.delete() } } - private suspend fun streamAndPlayPcm( - voiceId: String, - apiKey: String, - request: ElevenLabsRequest, - sampleRate: Int, - playbackToken: Long, - ) { - ensurePlaybackActive(playbackToken) - val minBuffer = - AudioTrack.getMinBufferSize( - sampleRate, - AudioFormat.CHANNEL_OUT_MONO, - AudioFormat.ENCODING_PCM_16BIT, - ) - if (minBuffer <= 0) { - throw IllegalStateException("AudioTrack buffer size invalid: $minBuffer") + private fun resolveGatewayAudioSuffix(speech: GatewayTalkSpeech): String { + val extension = speech.fileExtension?.trim() + if (!extension.isNullOrEmpty()) { + return if (extension.startsWith(".")) extension else ".$extension" } - - val bufferSize = max(minBuffer * 2, 8 * 1024) - val track = - AudioTrack( - AudioAttributes.Builder() - .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH) - .setUsage(AudioAttributes.USAGE_MEDIA) - .build(), - AudioFormat.Builder() - .setSampleRate(sampleRate) - .setChannelMask(AudioFormat.CHANNEL_OUT_MONO) - .setEncoding(AudioFormat.ENCODING_PCM_16BIT) - .build(), - bufferSize, - AudioTrack.MODE_STREAM, - AudioManager.AUDIO_SESSION_ID_GENERATE, - ) - if (track.state != 
AudioTrack.STATE_INITIALIZED) { - track.release() - throw IllegalStateException("AudioTrack init failed") - } - pcmTrack = track - // Don't call track.play() yet — start the track only when the first audio - // chunk arrives from ElevenLabs (see streamPcm). OxygenOS/OnePlus kills an - // AudioTrack that underruns (no data written) for ~1+ seconds, causing - // write() to return 0. Deferring play() until first data avoids the underrun. - - Log.d(tag, "pcm play start sampleRate=$sampleRate bufferSize=$bufferSize") - try { - streamPcm(voiceId = voiceId, apiKey = apiKey, request = request, track = track, playbackToken = playbackToken) - } finally { - cleanupPcmTrack() - } - Log.d(tag, "pcm play done") + val mimeType = speech.mimeType?.trim()?.lowercase() + if (mimeType == "audio/mpeg") return ".mp3" + if (mimeType == "audio/ogg") return ".ogg" + if (mimeType == "audio/wav") return ".wav" + if (mimeType == "audio/webm") return ".webm" + val outputFormat = speech.outputFormat?.trim()?.lowercase().orEmpty() + if (outputFormat == "mp3" || outputFormat.startsWith("mp3_") || outputFormat.endsWith("-mp3")) return ".mp3" + if (outputFormat == "opus" || outputFormat.startsWith("opus_")) return ".ogg" + if (outputFormat.endsWith("-wav")) return ".wav" + if (outputFormat.endsWith("-webm")) return ".webm" + return ".audio" } - private suspend fun speakWithSystemTts(text: String, playbackToken: Long) { - val trimmed = text.trim() - if (trimmed.isEmpty()) return - ensurePlaybackActive(playbackToken) - val ok = ensureSystemTts() - if (!ok) { - throw IllegalStateException("system TTS unavailable") - } - ensurePlaybackActive(playbackToken) - - val tts = systemTts ?: throw IllegalStateException("system TTS unavailable") - val utteranceId = "talk-${UUID.randomUUID()}" - val deferred = CompletableDeferred() - systemTtsPending?.cancel() - systemTtsPending = deferred - systemTtsPendingId = utteranceId - - withContext(Dispatchers.Main) { - ensurePlaybackActive(playbackToken) - val params = 
Bundle() - tts.speak(trimmed, TextToSpeech.QUEUE_FLUSH, params, utteranceId) - } - - withContext(Dispatchers.IO) { - try { - kotlinx.coroutines.withTimeout(180_000) { deferred.await() } - } catch (err: Throwable) { - throw err - } - ensurePlaybackActive(playbackToken) - } - } - - private suspend fun ensureSystemTts(): Boolean { - if (systemTts != null) return true - return withContext(Dispatchers.Main) { - val deferred = CompletableDeferred() - val tts = - try { - TextToSpeech(context) { status -> - deferred.complete(status == TextToSpeech.SUCCESS) - } - } catch (_: Throwable) { - deferred.complete(false) - null - } - if (tts == null) return@withContext false - - tts.setOnUtteranceProgressListener( - object : UtteranceProgressListener() { - override fun onStart(utteranceId: String?) {} - - override fun onDone(utteranceId: String?) { - if (utteranceId == null) return - if (utteranceId != systemTtsPendingId) return - systemTtsPending?.complete(Unit) - systemTtsPending = null - systemTtsPendingId = null - } - - @Suppress("OVERRIDE_DEPRECATION") - @Deprecated("Deprecated in Java") - override fun onError(utteranceId: String?) { - if (utteranceId == null) return - if (utteranceId != systemTtsPendingId) return - systemTtsPending?.completeExceptionally(IllegalStateException("system TTS error")) - systemTtsPending = null - systemTtsPendingId = null - } - - override fun onError(utteranceId: String?, errorCode: Int) { - if (utteranceId == null) return - if (utteranceId != systemTtsPendingId) return - systemTtsPending?.completeExceptionally(IllegalStateException("system TTS error $errorCode")) - systemTtsPending = null - systemTtsPendingId = null - } - }, - ) - - val ok = - try { - deferred.await() - } catch (_: Throwable) { - false - } - if (ok) { - systemTts = tts - } else { - tts.shutdown() - } - ok - } - } - - /** Stop any active TTS immediately — call when user taps mic to barge in. 
*/ fun stopTts() { - stopActiveStreamingTts() stopSpeaking(resetInterrupt = true) _isSpeaking.value = false _statusText.value = "Listening" } private fun stopSpeaking(resetInterrupt: Boolean = true) { - pcmStopRequested = true if (!_isSpeaking.value) { cleanupPlayer() - cleanupPcmTrack() - systemTts?.stop() - systemTtsPending?.cancel() - systemTtsPending = null - systemTtsPendingId = null abandonAudioFocus() return } @@ -1277,11 +825,6 @@ class TalkModeManager( lastInterruptedAtSeconds = currentMs / 1000.0 } cleanupPlayer() - cleanupPcmTrack() - systemTts?.stop() - systemTtsPending?.cancel() - systemTtsPending = null - systemTtsPendingId = null _isSpeaking.value = false abandonAudioFocus() } @@ -1325,22 +868,6 @@ class TalkModeManager( player?.stop() player?.release() player = null - streamingSource?.close() - streamingSource = null - } - - private fun cleanupPcmTrack() { - val track = pcmTrack ?: return - try { - track.pause() - track.flush() - track.stop() - } catch (_: Throwable) { - // ignore cleanup errors - } finally { - track.release() - } - pcmTrack = null } private fun shouldInterrupt(transcript: String): Boolean { @@ -1369,71 +896,18 @@ class TalkModeManager( } private suspend fun reloadConfig() { - val envVoice = System.getenv("ELEVENLABS_VOICE_ID")?.trim() - val sagVoice = System.getenv("SAG_VOICE_ID")?.trim() - val envKey = System.getenv("ELEVENLABS_API_KEY")?.trim() try { - val res = session.request("talk.config", """{"includeSecrets":true}""") + val res = session.request("talk.config", "{}") val root = json.parseToJsonElement(res).asObjectOrNull() - val parsed = - TalkModeGatewayConfigParser.parse( - config = root?.get("config").asObjectOrNull(), - defaultProvider = defaultTalkProvider, - defaultModelIdFallback = defaultModelIdFallback, - defaultOutputFormatFallback = defaultOutputFormatFallback, - envVoice = envVoice, - sagVoice = sagVoice, - envKey = envKey, - ) - if (parsed.missingResolvedPayload) { - Log.w(tag, "talk config ignored: normalized 
payload missing talk.resolved") - } - + val parsed = TalkModeGatewayConfigParser.parse(root?.get("config").asObjectOrNull()) if (!isCanonicalMainSessionKey(mainSessionKey)) { mainSessionKey = parsed.mainSessionKey } - defaultVoiceId = parsed.defaultVoiceId - voiceAliases = parsed.voiceAliases - if (!voiceOverrideActive) currentVoiceId = defaultVoiceId - defaultModelId = parsed.defaultModelId - if (!modelOverrideActive) currentModelId = defaultModelId - defaultOutputFormat = parsed.defaultOutputFormat - apiKey = parsed.apiKey silenceWindowMs = parsed.silenceTimeoutMs - Log.d( - tag, - "reloadConfig apiKey=${if (apiKey != null) "set" else "null"} voiceId=$defaultVoiceId silenceTimeoutMs=${parsed.silenceTimeoutMs}", - ) - if (parsed.interruptOnSpeech != null) interruptOnSpeech = parsed.interruptOnSpeech - activeProviderIsElevenLabs = parsed.activeProvider == defaultTalkProvider - if (!activeProviderIsElevenLabs) { - // Clear ElevenLabs credentials so playAssistant won't attempt ElevenLabs calls - apiKey = null - defaultVoiceId = null - if (!voiceOverrideActive) currentVoiceId = null - Log.w(tag, "talk provider ${parsed.activeProvider} unsupported; using system voice fallback") - } else if (parsed.normalizedPayload) { - Log.d(tag, "talk config provider=elevenlabs") - } + parsed.interruptOnSpeech?.let { interruptOnSpeech = it } configLoaded = true } catch (_: Throwable) { - val fallback = - TalkModeGatewayConfigParser.fallback( - defaultProvider = defaultTalkProvider, - defaultModelIdFallback = defaultModelIdFallback, - defaultOutputFormatFallback = defaultOutputFormatFallback, - envVoice = envVoice, - sagVoice = sagVoice, - envKey = envKey, - ) - silenceWindowMs = fallback.silenceTimeoutMs - defaultVoiceId = fallback.defaultVoiceId - defaultModelId = fallback.defaultModelId - if (!modelOverrideActive) currentModelId = defaultModelId - apiKey = fallback.apiKey - voiceAliases = fallback.voiceAliases - defaultOutputFormat = fallback.defaultOutputFormat - // Keep config 
load retryable after transient fetch failures. + silenceWindowMs = TalkDefaults.defaultSilenceTimeoutMs configLoaded = false } } @@ -1443,189 +917,6 @@ class TalkModeManager( return obj["runId"].asStringOrNull() } - private suspend fun streamTts( - voiceId: String, - apiKey: String, - request: ElevenLabsRequest, - sink: StreamingMediaDataSource, - playbackToken: Long, - ) { - withContext(Dispatchers.IO) { - ensurePlaybackActive(playbackToken) - val conn = openTtsConnection(voiceId = voiceId, apiKey = apiKey, request = request) - try { - val payload = buildRequestPayload(request) - conn.outputStream.use { it.write(payload.toByteArray()) } - - val code = conn.responseCode - Log.d(tag, "elevenlabs http code=$code voiceId=$voiceId format=${request.outputFormat} keyLen=${apiKey.length}") - if (code >= 400) { - val message = conn.errorStream?.readBytes()?.toString(Charsets.UTF_8) ?: "" - Log.w(tag, "elevenlabs error code=$code voiceId=$voiceId body=$message") - sink.fail() - throw IllegalStateException("ElevenLabs failed: $code $message") - } - - val buffer = ByteArray(8 * 1024) - conn.inputStream.use { input -> - while (true) { - ensurePlaybackActive(playbackToken) - val read = input.read(buffer) - if (read <= 0) break - ensurePlaybackActive(playbackToken) - sink.append(buffer.copyOf(read)) - } - } - sink.finish() - } finally { - conn.disconnect() - } - } - } - - private suspend fun streamPcm( - voiceId: String, - apiKey: String, - request: ElevenLabsRequest, - track: AudioTrack, - playbackToken: Long, - ) { - withContext(Dispatchers.IO) { - ensurePlaybackActive(playbackToken) - val conn = openTtsConnection(voiceId = voiceId, apiKey = apiKey, request = request) - try { - val payload = buildRequestPayload(request) - conn.outputStream.use { it.write(payload.toByteArray()) } - - val code = conn.responseCode - if (code >= 400) { - val message = conn.errorStream?.readBytes()?.toString(Charsets.UTF_8) ?: "" - throw IllegalStateException("ElevenLabs failed: $code $message") - 
} - - var totalBytesWritten = 0L - var trackStarted = false - val buffer = ByteArray(8 * 1024) - conn.inputStream.use { input -> - while (true) { - if (pcmStopRequested || isPlaybackCancelled(null, playbackToken)) return@withContext - val read = input.read(buffer) - if (read <= 0) break - // Start the AudioTrack only when the first chunk is ready — avoids - // the ~1.4s underrun window while ElevenLabs prepares audio. - // OxygenOS kills a track that underruns for >1s (write() returns 0). - if (!trackStarted) { - track.play() - trackStarted = true - } - var offset = 0 - while (offset < read) { - if (pcmStopRequested || isPlaybackCancelled(null, playbackToken)) return@withContext - val wrote = - try { - track.write(buffer, offset, read - offset) - } catch (err: Throwable) { - if (pcmStopRequested || isPlaybackCancelled(err, playbackToken)) return@withContext - throw err - } - if (wrote <= 0) { - if (pcmStopRequested || isPlaybackCancelled(null, playbackToken)) return@withContext - throw IllegalStateException("AudioTrack write failed: $wrote") - } - offset += wrote - } - } - } - } finally { - conn.disconnect() - } - } - } - - private suspend fun waitForPcmDrain(track: AudioTrack, totalFrames: Long, sampleRate: Int) { - if (totalFrames <= 0) return - withContext(Dispatchers.IO) { - val drainDeadline = SystemClock.elapsedRealtime() + 15_000 - while (!pcmStopRequested && SystemClock.elapsedRealtime() < drainDeadline) { - val played = track.playbackHeadPosition.toLong().and(0xFFFFFFFFL) - if (played >= totalFrames) break - val remainingFrames = totalFrames - played - val sleepMs = ((remainingFrames * 1000L) / sampleRate.toLong()).coerceIn(12L, 120L) - delay(sleepMs) - } - } - } - - private fun openTtsConnection( - voiceId: String, - apiKey: String, - request: ElevenLabsRequest, - ): HttpURLConnection { - val baseUrl = "https://api.elevenlabs.io/v1/text-to-speech/$voiceId/stream" - val latencyTier = request.latencyTier - val url = - if (latencyTier != null) { - 
URL("$baseUrl?optimize_streaming_latency=$latencyTier") - } else { - URL(baseUrl) - } - val conn = url.openConnection() as HttpURLConnection - conn.requestMethod = "POST" - conn.connectTimeout = 30_000 - conn.readTimeout = 30_000 - conn.setRequestProperty("Content-Type", "application/json") - conn.setRequestProperty("Accept", resolveAcceptHeader(request.outputFormat)) - conn.setRequestProperty("xi-api-key", apiKey) - conn.doOutput = true - return conn - } - - private fun resolveAcceptHeader(outputFormat: String?): String { - val normalized = outputFormat?.trim()?.lowercase().orEmpty() - return if (normalized.startsWith("pcm_")) "audio/pcm" else "audio/mpeg" - } - - private fun buildRequestPayload(request: ElevenLabsRequest): String { - val voiceSettingsEntries = - buildJsonObject { - request.speed?.let { put("speed", JsonPrimitive(it)) } - request.stability?.let { put("stability", JsonPrimitive(it)) } - request.similarity?.let { put("similarity_boost", JsonPrimitive(it)) } - request.style?.let { put("style", JsonPrimitive(it)) } - request.speakerBoost?.let { put("use_speaker_boost", JsonPrimitive(it)) } - } - - val payload = - buildJsonObject { - put("text", JsonPrimitive(request.text)) - request.modelId?.takeIf { it.isNotEmpty() }?.let { put("model_id", JsonPrimitive(it)) } - request.outputFormat?.takeIf { it.isNotEmpty() }?.let { put("output_format", JsonPrimitive(it)) } - request.seed?.let { put("seed", JsonPrimitive(it)) } - request.normalize?.let { put("apply_text_normalization", JsonPrimitive(it)) } - request.language?.let { put("language_code", JsonPrimitive(it)) } - if (voiceSettingsEntries.isNotEmpty()) { - put("voice_settings", voiceSettingsEntries) - } - } - - return payload.toString() - } - - private data class ElevenLabsRequest( - val text: String, - val modelId: String?, - val outputFormat: String?, - val speed: Double?, - val stability: Double?, - val similarity: Double?, - val style: Double?, - val speakerBoost: Boolean?, - val seed: Long?, - val 
normalize: String?, - val language: String?, - val latencyTier: Int?, - ) - private object TalkModeRuntime { fun resolveSpeed(speed: Double?, rateWpm: Int?): Double? { if (rateWpm != null && rateWpm > 0) { @@ -1673,28 +964,6 @@ class TalkModeManager( return normalized } - fun validatedOutputFormat(value: String?): String? { - val trimmed = value?.trim()?.lowercase() ?: return null - if (trimmed.isEmpty()) return null - if (trimmed.startsWith("mp3_")) return trimmed - return if (parsePcmSampleRate(trimmed) != null) trimmed else null - } - - fun validatedLatencyTier(value: Int?): Int? { - if (value == null) return null - if (value < 0 || value > 4) return null - return value - } - - fun parsePcmSampleRate(value: String?): Int? { - val trimmed = value?.trim()?.lowercase() ?: return null - if (!trimmed.startsWith("pcm_")) return null - val suffix = trimmed.removePrefix("pcm_") - val digits = suffix.takeWhile { it.isDigit() } - val rate = digits.toIntOrNull() ?: return null - return if (rate in setOf(16000, 22050, 24000, 44100)) rate else null - } - fun isMessageTimestampAfter(timestamp: Double, sinceSeconds: Double): Boolean { val sinceMs = sinceSeconds * 1000 return if (timestamp > 10_000_000_000) { From 4386a0ace8ada00f88dd0688b5023e93afe94ea2 Mon Sep 17 00:00:00 2001 From: Ayaan Zaidi Date: Fri, 20 Mar 2026 10:29:06 +0530 Subject: [PATCH 06/11] refactor(android): remove legacy elevenlabs talk stack --- .../app/voice/ElevenLabsStreamingTts.kt | 338 ------------------ .../app/voice/StreamingMediaDataSource.kt | 98 ----- .../app/voice/TalkModeVoiceResolver.kt | 122 ------- .../app/voice/TalkModeVoiceResolverTest.kt | 92 ----- 4 files changed, 650 deletions(-) delete mode 100644 apps/android/app/src/main/java/ai/openclaw/app/voice/ElevenLabsStreamingTts.kt delete mode 100644 apps/android/app/src/main/java/ai/openclaw/app/voice/StreamingMediaDataSource.kt delete mode 100644 apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeVoiceResolver.kt delete mode 100644 
apps/android/app/src/test/java/ai/openclaw/app/voice/TalkModeVoiceResolverTest.kt diff --git a/apps/android/app/src/main/java/ai/openclaw/app/voice/ElevenLabsStreamingTts.kt b/apps/android/app/src/main/java/ai/openclaw/app/voice/ElevenLabsStreamingTts.kt deleted file mode 100644 index ff13cf73911..00000000000 --- a/apps/android/app/src/main/java/ai/openclaw/app/voice/ElevenLabsStreamingTts.kt +++ /dev/null @@ -1,338 +0,0 @@ -package ai.openclaw.app.voice - -import android.media.AudioAttributes -import android.media.AudioFormat -import android.media.AudioManager -import android.media.AudioTrack -import android.util.Base64 -import android.util.Log -import kotlinx.coroutines.* -import kotlinx.coroutines.flow.MutableStateFlow -import kotlinx.coroutines.flow.StateFlow -import okhttp3.* -import org.json.JSONObject -import kotlin.math.max - -/** - * Streams text chunks to ElevenLabs WebSocket API and plays audio in real-time. - * - * Usage: - * 1. Create instance with voice/API config - * 2. Call [start] to open WebSocket + AudioTrack - * 3. Call [sendText] with incremental text chunks as they arrive - * 4. Call [finish] when the full response is ready (sends EOS to ElevenLabs) - * 5. Call [stop] to cancel/cleanup at any time - * - * Audio playback begins as soon as the first audio chunk arrives from ElevenLabs, - * typically within ~100ms of the first text chunk for eleven_flash_v2_5. - * - * Note: eleven_v3 does NOT support WebSocket streaming. Use eleven_flash_v2_5 - * or eleven_flash_v2 for lowest latency. 
- */ -class ElevenLabsStreamingTts( - private val scope: CoroutineScope, - private val voiceId: String, - private val apiKey: String, - private val modelId: String = "eleven_flash_v2_5", - private val outputFormat: String = "pcm_24000", - private val sampleRate: Int = 24000, -) { - companion object { - private const val TAG = "ElevenLabsStreamTTS" - private const val BASE_URL = "wss://api.elevenlabs.io/v1/text-to-speech" - - /** Models that support WebSocket input streaming */ - val STREAMING_MODELS = setOf( - "eleven_flash_v2_5", - "eleven_flash_v2", - "eleven_multilingual_v2", - "eleven_turbo_v2_5", - "eleven_turbo_v2", - "eleven_monolingual_v1", - ) - - fun supportsStreaming(modelId: String): Boolean = modelId in STREAMING_MODELS - } - - private val _isPlaying = MutableStateFlow(false) - val isPlaying: StateFlow = _isPlaying - - private var webSocket: WebSocket? = null - private var audioTrack: AudioTrack? = null - private var trackStarted = false - private var client: OkHttpClient? = null - @Volatile private var stopped = false - @Volatile private var finished = false - @Volatile var hasReceivedAudio = false - private set - private var drainJob: Job? = null - - // Track text already sent so we only send incremental chunks - private var sentTextLength = 0 - @Volatile private var wsReady = false - private val pendingText = mutableListOf() - - /** - * Open the WebSocket connection and prepare AudioTrack. - * Must be called before [sendText]. 
- */ - fun start() { - stopped = false - finished = false - hasReceivedAudio = false - sentTextLength = 0 - trackStarted = false - wsReady = false - sentFullText = "" - synchronized(pendingText) { pendingText.clear() } - - // Prepare AudioTrack - val minBuffer = AudioTrack.getMinBufferSize( - sampleRate, - AudioFormat.CHANNEL_OUT_MONO, - AudioFormat.ENCODING_PCM_16BIT, - ) - val bufferSize = max(minBuffer * 2, 8 * 1024) - val track = AudioTrack( - AudioAttributes.Builder() - .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH) - .setUsage(AudioAttributes.USAGE_MEDIA) - .build(), - AudioFormat.Builder() - .setSampleRate(sampleRate) - .setChannelMask(AudioFormat.CHANNEL_OUT_MONO) - .setEncoding(AudioFormat.ENCODING_PCM_16BIT) - .build(), - bufferSize, - AudioTrack.MODE_STREAM, - AudioManager.AUDIO_SESSION_ID_GENERATE, - ) - if (track.state != AudioTrack.STATE_INITIALIZED) { - track.release() - Log.e(TAG, "AudioTrack init failed") - return - } - audioTrack = track - _isPlaying.value = true - - // Open WebSocket - val url = "$BASE_URL/$voiceId/stream-input?model_id=$modelId&output_format=$outputFormat" - val okClient = OkHttpClient.Builder() - .readTimeout(30, java.util.concurrent.TimeUnit.SECONDS) - .writeTimeout(10, java.util.concurrent.TimeUnit.SECONDS) - .build() - client = okClient - - val request = Request.Builder() - .url(url) - .header("xi-api-key", apiKey) - .build() - - webSocket = okClient.newWebSocket(request, object : WebSocketListener() { - override fun onOpen(webSocket: WebSocket, response: Response) { - Log.d(TAG, "WebSocket connected") - // Send initial config with voice settings - val config = JSONObject().apply { - put("text", " ") - put("voice_settings", JSONObject().apply { - put("stability", 0.5) - put("similarity_boost", 0.8) - put("use_speaker_boost", false) - }) - put("generation_config", JSONObject().apply { - put("chunk_length_schedule", org.json.JSONArray(listOf(120, 160, 250, 290))) - }) - } - webSocket.send(config.toString()) - wsReady = 
true - // Flush any text that was queued before WebSocket was ready - synchronized(pendingText) { - for (queued in pendingText) { - val msg = JSONObject().apply { put("text", queued) } - webSocket.send(msg.toString()) - Log.d(TAG, "flushed queued chunk: ${queued.length} chars") - } - pendingText.clear() - } - // Send deferred EOS if finish() was called before WebSocket was ready - if (finished) { - val eos = JSONObject().apply { put("text", "") } - webSocket.send(eos.toString()) - Log.d(TAG, "sent deferred EOS") - } - } - - override fun onMessage(webSocket: WebSocket, text: String) { - if (stopped) return - try { - val json = JSONObject(text) - val audio = json.optString("audio", "") - if (audio.isNotEmpty()) { - val pcmBytes = Base64.decode(audio, Base64.DEFAULT) - writeToTrack(pcmBytes) - } - } catch (e: Exception) { - Log.e(TAG, "Error parsing WebSocket message: ${e.message}") - } - } - - override fun onFailure(webSocket: WebSocket, t: Throwable, response: Response?) { - Log.e(TAG, "WebSocket error: ${t.message}") - stopped = true - cleanup() - } - - override fun onClosed(webSocket: WebSocket, code: Int, reason: String) { - Log.d(TAG, "WebSocket closed: $code $reason") - // Wait for AudioTrack to finish playing buffered audio, then cleanup - drainJob = scope.launch(Dispatchers.IO) { - drainAudioTrack() - cleanup() - } - } - }) - } - - /** - * Send incremental text. Call with the full accumulated text so far — - * only the new portion (since last send) will be transmitted. - */ - // Track the full text we've sent so we can detect replacement vs append - private var sentFullText = "" - - /** - // If we already sent a superset of this text, it's just a stale/out-of-order - // event from a different thread — not a real divergence. Ignore it. - if (sentFullText.startsWith(fullText)) return true - * Returns true if text was accepted, false if text diverged (caller should restart). 
- */ - @Synchronized - fun sendText(fullText: String): Boolean { - if (stopped) return false - if (finished) return true // Already finishing — not a diverge, don't restart - - // Detect text replacement: if the new text doesn't start with what we already sent, - // the stream has diverged (e.g., tool call interrupted and text was replaced). - if (sentFullText.isNotEmpty() && !fullText.startsWith(sentFullText)) { - // If we already sent a superset of this text, it's just a stale/out-of-order - // event from a different thread — not a real divergence. Ignore it. - if (sentFullText.startsWith(fullText)) return true - Log.d(TAG, "text diverged — sent='${sentFullText.take(60)}' new='${fullText.take(60)}'") - return false - } - - if (fullText.length > sentTextLength) { - val newText = fullText.substring(sentTextLength) - sentTextLength = fullText.length - sentFullText = fullText - - val ws = webSocket - if (ws != null && wsReady) { - val msg = JSONObject().apply { put("text", newText) } - ws.send(msg.toString()) - Log.d(TAG, "sent chunk: ${newText.length} chars") - } else { - // Queue if WebSocket not connected yet (ws null = still connecting, wsReady false = handshake pending) - synchronized(pendingText) { pendingText.add(newText) } - Log.d(TAG, "queued chunk: ${newText.length} chars (ws not ready)") - } - } - return true - } - - /** - * Signal that no more text is coming. Sends EOS to ElevenLabs. - * The WebSocket will close after generating remaining audio. - */ - @Synchronized - fun finish() { - if (stopped || finished) return - finished = true - val ws = webSocket - if (ws != null && wsReady) { - // Send empty text to signal end of stream - val eos = JSONObject().apply { put("text", "") } - ws.send(eos.toString()) - Log.d(TAG, "sent EOS") - } - // else: WebSocket not ready yet; onOpen will send EOS after flushing queued text - } - - /** - * Immediately stop playback and close everything. 
- */ - fun stop() { - stopped = true - finished = true - drainJob?.cancel() - drainJob = null - webSocket?.cancel() - webSocket = null - val track = audioTrack - audioTrack = null - if (track != null) { - try { - track.pause() - track.flush() - track.release() - } catch (_: Throwable) {} - } - _isPlaying.value = false - client?.dispatcher?.executorService?.shutdown() - client = null - } - - private fun writeToTrack(pcmBytes: ByteArray) { - val track = audioTrack ?: return - if (stopped) return - - // Start playback on first audio chunk — avoids underrun - if (!trackStarted) { - track.play() - trackStarted = true - hasReceivedAudio = true - Log.d(TAG, "AudioTrack started on first chunk") - } - - var offset = 0 - while (offset < pcmBytes.size && !stopped) { - val wrote = track.write(pcmBytes, offset, pcmBytes.size - offset) - if (wrote <= 0) { - if (stopped) return - Log.w(TAG, "AudioTrack write returned $wrote") - break - } - offset += wrote - } - } - - private fun drainAudioTrack() { - if (stopped) return - // Wait up to 10s for audio to finish playing - val deadline = System.currentTimeMillis() + 10_000 - while (!stopped && System.currentTimeMillis() < deadline) { - // Check if track is still playing - val track = audioTrack ?: return - if (track.playState != AudioTrack.PLAYSTATE_PLAYING) return - try { - Thread.sleep(100) - } catch (_: InterruptedException) { - return - } - } - } - - private fun cleanup() { - val track = audioTrack - audioTrack = null - if (track != null) { - try { - track.stop() - track.release() - } catch (_: Throwable) {} - } - _isPlaying.value = false - client?.dispatcher?.executorService?.shutdown() - client = null - } -} diff --git a/apps/android/app/src/main/java/ai/openclaw/app/voice/StreamingMediaDataSource.kt b/apps/android/app/src/main/java/ai/openclaw/app/voice/StreamingMediaDataSource.kt deleted file mode 100644 index 90bbd81b8bd..00000000000 --- a/apps/android/app/src/main/java/ai/openclaw/app/voice/StreamingMediaDataSource.kt +++ 
/dev/null @@ -1,98 +0,0 @@ -package ai.openclaw.app.voice - -import android.media.MediaDataSource -import kotlin.math.min - -internal class StreamingMediaDataSource : MediaDataSource() { - private data class Chunk(val start: Long, val data: ByteArray) - - private val lock = Object() - private val chunks = ArrayList() - private var totalSize: Long = 0 - private var closed = false - private var finished = false - private var lastReadIndex = 0 - - fun append(data: ByteArray) { - if (data.isEmpty()) return - synchronized(lock) { - if (closed || finished) return - val chunk = Chunk(totalSize, data) - chunks.add(chunk) - totalSize += data.size.toLong() - lock.notifyAll() - } - } - - fun finish() { - synchronized(lock) { - if (closed) return - finished = true - lock.notifyAll() - } - } - - fun fail() { - synchronized(lock) { - closed = true - lock.notifyAll() - } - } - - override fun readAt(position: Long, buffer: ByteArray, offset: Int, size: Int): Int { - if (position < 0) return -1 - synchronized(lock) { - while (!closed && !finished && position >= totalSize) { - lock.wait() - } - if (closed) return -1 - if (position >= totalSize && finished) return -1 - - val available = (totalSize - position).toInt() - val toRead = min(size, available) - var remaining = toRead - var destOffset = offset - var pos = position - - var index = findChunkIndex(pos) - while (remaining > 0 && index < chunks.size) { - val chunk = chunks[index] - val inChunkOffset = (pos - chunk.start).toInt() - if (inChunkOffset >= chunk.data.size) { - index++ - continue - } - val copyLen = min(remaining, chunk.data.size - inChunkOffset) - System.arraycopy(chunk.data, inChunkOffset, buffer, destOffset, copyLen) - remaining -= copyLen - destOffset += copyLen - pos += copyLen - if (inChunkOffset + copyLen >= chunk.data.size) { - index++ - } - } - - return toRead - remaining - } - } - - override fun getSize(): Long = -1 - - override fun close() { - synchronized(lock) { - closed = true - lock.notifyAll() - } - } - 
- private fun findChunkIndex(position: Long): Int { - var index = lastReadIndex - while (index < chunks.size) { - val chunk = chunks[index] - if (position < chunk.start + chunk.data.size) break - index++ - } - lastReadIndex = index - return index - } -} diff --git a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeVoiceResolver.kt b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeVoiceResolver.kt deleted file mode 100644 index 7ada19e166b..00000000000 --- a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeVoiceResolver.kt +++ /dev/null @@ -1,122 +0,0 @@ -package ai.openclaw.app.voice - -import java.net.HttpURLConnection -import java.net.URL -import kotlinx.coroutines.Dispatchers -import kotlinx.coroutines.withContext -import kotlinx.serialization.json.Json -import kotlinx.serialization.json.JsonArray -import kotlinx.serialization.json.JsonElement -import kotlinx.serialization.json.JsonObject -import kotlinx.serialization.json.JsonPrimitive - -internal data class ElevenLabsVoice(val voiceId: String, val name: String?) - -internal data class TalkModeResolvedVoice( - val voiceId: String?, - val fallbackVoiceId: String?, - val defaultVoiceId: String?, - val currentVoiceId: String?, - val selectedVoiceName: String? = null, -) - -internal object TalkModeVoiceResolver { - fun resolveVoiceAlias(value: String?, voiceAliases: Map): String? 
{ - val trimmed = value?.trim().orEmpty() - if (trimmed.isEmpty()) return null - val normalized = normalizeAliasKey(trimmed) - voiceAliases[normalized]?.let { return it } - if (voiceAliases.values.any { it.equals(trimmed, ignoreCase = true) }) return trimmed - return if (isLikelyVoiceId(trimmed)) trimmed else null - } - - suspend fun resolveVoiceId( - preferred: String?, - fallbackVoiceId: String?, - defaultVoiceId: String?, - currentVoiceId: String?, - voiceOverrideActive: Boolean, - listVoices: suspend () -> List, - ): TalkModeResolvedVoice { - val trimmed = preferred?.trim().orEmpty() - if (trimmed.isNotEmpty()) { - return TalkModeResolvedVoice( - voiceId = trimmed, - fallbackVoiceId = fallbackVoiceId, - defaultVoiceId = defaultVoiceId, - currentVoiceId = currentVoiceId, - ) - } - if (!fallbackVoiceId.isNullOrBlank()) { - return TalkModeResolvedVoice( - voiceId = fallbackVoiceId, - fallbackVoiceId = fallbackVoiceId, - defaultVoiceId = defaultVoiceId, - currentVoiceId = currentVoiceId, - ) - } - - val first = listVoices().firstOrNull() - if (first == null) { - return TalkModeResolvedVoice( - voiceId = null, - fallbackVoiceId = fallbackVoiceId, - defaultVoiceId = defaultVoiceId, - currentVoiceId = currentVoiceId, - ) - } - - return TalkModeResolvedVoice( - voiceId = first.voiceId, - fallbackVoiceId = first.voiceId, - defaultVoiceId = if (defaultVoiceId.isNullOrBlank()) first.voiceId else defaultVoiceId, - currentVoiceId = if (voiceOverrideActive) currentVoiceId else first.voiceId, - selectedVoiceName = first.name, - ) - } - - suspend fun listVoices(apiKey: String, json: Json): List { - return withContext(Dispatchers.IO) { - val url = URL("https://api.elevenlabs.io/v1/voices") - val conn = url.openConnection() as HttpURLConnection - try { - conn.requestMethod = "GET" - conn.connectTimeout = 15_000 - conn.readTimeout = 15_000 - conn.setRequestProperty("xi-api-key", apiKey) - - val code = conn.responseCode - val stream = if (code >= 400) conn.errorStream else 
conn.inputStream - val data = stream?.use { it.readBytes() } ?: byteArrayOf() - if (code >= 400) { - val message = data.toString(Charsets.UTF_8) - throw IllegalStateException("ElevenLabs voices failed: $code $message") - } - - val root = json.parseToJsonElement(data.toString(Charsets.UTF_8)).asObjectOrNull() - val voices = (root?.get("voices") as? JsonArray) ?: JsonArray(emptyList()) - voices.mapNotNull { entry -> - val obj = entry.asObjectOrNull() ?: return@mapNotNull null - val voiceId = obj["voice_id"].asStringOrNull() ?: return@mapNotNull null - val name = obj["name"].asStringOrNull() - ElevenLabsVoice(voiceId, name) - } - } finally { - conn.disconnect() - } - } - } - - private fun isLikelyVoiceId(value: String): Boolean { - if (value.length < 10) return false - return value.all { it.isLetterOrDigit() || it == '-' || it == '_' } - } - - private fun normalizeAliasKey(value: String): String = - value.trim().lowercase() -} - -private fun JsonElement?.asObjectOrNull(): JsonObject? = this as? JsonObject - -private fun JsonElement?.asStringOrNull(): String? = - (this as? 
JsonPrimitive)?.takeIf { it.isString }?.content diff --git a/apps/android/app/src/test/java/ai/openclaw/app/voice/TalkModeVoiceResolverTest.kt b/apps/android/app/src/test/java/ai/openclaw/app/voice/TalkModeVoiceResolverTest.kt deleted file mode 100644 index 5cd46895d42..00000000000 --- a/apps/android/app/src/test/java/ai/openclaw/app/voice/TalkModeVoiceResolverTest.kt +++ /dev/null @@ -1,92 +0,0 @@ -package ai.openclaw.app.voice - -import kotlinx.coroutines.runBlocking -import org.junit.Assert.assertEquals -import org.junit.Assert.assertNull -import org.junit.Test - -class TalkModeVoiceResolverTest { - @Test - fun resolvesVoiceAliasCaseInsensitively() { - val resolved = - TalkModeVoiceResolver.resolveVoiceAlias( - " Clawd ", - mapOf("clawd" to "voice-123"), - ) - - assertEquals("voice-123", resolved) - } - - @Test - fun acceptsDirectVoiceIds() { - val resolved = TalkModeVoiceResolver.resolveVoiceAlias("21m00Tcm4TlvDq8ikWAM", emptyMap()) - - assertEquals("21m00Tcm4TlvDq8ikWAM", resolved) - } - - @Test - fun rejectsUnknownAliases() { - val resolved = TalkModeVoiceResolver.resolveVoiceAlias("nickname", emptyMap()) - - assertNull(resolved) - } - - @Test - fun reusesCachedFallbackVoiceBeforeFetchingCatalog() = - runBlocking { - var fetchCount = 0 - - val resolved = - TalkModeVoiceResolver.resolveVoiceId( - preferred = null, - fallbackVoiceId = "cached-voice", - defaultVoiceId = null, - currentVoiceId = null, - voiceOverrideActive = false, - listVoices = { - fetchCount += 1 - emptyList() - }, - ) - - assertEquals("cached-voice", resolved.voiceId) - assertEquals(0, fetchCount) - } - - @Test - fun seedsDefaultVoiceFromCatalogWhenNeeded() = - runBlocking { - val resolved = - TalkModeVoiceResolver.resolveVoiceId( - preferred = null, - fallbackVoiceId = null, - defaultVoiceId = null, - currentVoiceId = null, - voiceOverrideActive = false, - listVoices = { listOf(ElevenLabsVoice("voice-1", "First")) }, - ) - - assertEquals("voice-1", resolved.voiceId) - assertEquals("voice-1", 
resolved.fallbackVoiceId) - assertEquals("voice-1", resolved.defaultVoiceId) - assertEquals("voice-1", resolved.currentVoiceId) - assertEquals("First", resolved.selectedVoiceName) - } - - @Test - fun preservesCurrentVoiceWhenOverrideIsActive() = - runBlocking { - val resolved = - TalkModeVoiceResolver.resolveVoiceId( - preferred = null, - fallbackVoiceId = null, - defaultVoiceId = null, - currentVoiceId = null, - voiceOverrideActive = true, - listVoices = { listOf(ElevenLabsVoice("voice-1", "First")) }, - ) - - assertEquals("voice-1", resolved.voiceId) - assertNull(resolved.currentVoiceId) - } -} From 4a0341ed035cae117ee560def33a74e87dd036ef Mon Sep 17 00:00:00 2001 From: Ayaan Zaidi Date: Fri, 20 Mar 2026 10:45:32 +0530 Subject: [PATCH 07/11] fix(review): address talk cleanup feedback --- .../ai/openclaw/app/voice/TalkModeManager.kt | 7 +- src/gateway/server-methods/talk.ts | 99 +++++++------------ 2 files changed, 39 insertions(+), 67 deletions(-) diff --git a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt index 4ba2c2ef043..be62498e24e 100644 --- a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt +++ b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt @@ -756,12 +756,9 @@ class TalkModeManager( } val suffix = resolveGatewayAudioSuffix(speech) val tempFile = - withContext(Dispatchers.IO) { - File.createTempFile("tts_", suffix, context.cacheDir).apply { - writeBytes(audioBytes) - } - } + withContext(Dispatchers.IO) { File.createTempFile("tts_", suffix, context.cacheDir) } try { + withContext(Dispatchers.IO) { tempFile.writeBytes(audioBytes) } val player = MediaPlayer() this.player = player val finished = CompletableDeferred() diff --git a/src/gateway/server-methods/talk.ts b/src/gateway/server-methods/talk.ts index 33cb6d7f116..85f78e91b6a 100644 --- a/src/gateway/server-methods/talk.ts +++ 
b/src/gateway/server-methods/talk.ts @@ -112,83 +112,58 @@ function buildTalkTtsConfig( auto: "always", provider, }; + const baseUrl = trimString(providerConfig.baseUrl); + const voiceId = trimString(providerConfig.voiceId); + const modelId = trimString(providerConfig.modelId); + const languageCode = trimString(providerConfig.languageCode); if (provider === "elevenlabs") { + const seed = finiteNumber(providerConfig.seed); + const applyTextNormalization = normalizeTextNormalization( + providerConfig.applyTextNormalization, + ); + const voiceSettings = readTalkVoiceSettings(providerConfig); talkTts.elevenlabs = { ...baseTts.elevenlabs, ...(providerConfig.apiKey === undefined ? {} : { apiKey: providerConfig.apiKey }), - ...(trimString(providerConfig.baseUrl) == null - ? {} - : { baseUrl: trimString(providerConfig.baseUrl) }), - ...(trimString(providerConfig.voiceId) == null - ? {} - : { voiceId: trimString(providerConfig.voiceId) }), - ...(trimString(providerConfig.modelId) == null - ? {} - : { modelId: trimString(providerConfig.modelId) }), - ...(finiteNumber(providerConfig.seed) == null - ? {} - : { seed: finiteNumber(providerConfig.seed) }), - ...(normalizeTextNormalization(providerConfig.applyTextNormalization) == null - ? {} - : { - applyTextNormalization: normalizeTextNormalization( - providerConfig.applyTextNormalization, - ), - }), - ...(trimString(providerConfig.languageCode) == null - ? {} - : { languageCode: trimString(providerConfig.languageCode) }), - ...(readTalkVoiceSettings(providerConfig) == null - ? {} - : { voiceSettings: readTalkVoiceSettings(providerConfig) }), + ...(baseUrl == null ? {} : { baseUrl }), + ...(voiceId == null ? {} : { voiceId }), + ...(modelId == null ? {} : { modelId }), + ...(seed == null ? {} : { seed }), + ...(applyTextNormalization == null ? {} : { applyTextNormalization }), + ...(languageCode == null ? {} : { languageCode }), + ...(voiceSettings == null ? 
{} : { voiceSettings }), }; } else if (provider === "openai") { + const speed = finiteNumber(providerConfig.speed); + const instructions = trimString(providerConfig.instructions); talkTts.openai = { ...baseTts.openai, ...(providerConfig.apiKey === undefined ? {} : { apiKey: providerConfig.apiKey }), - ...(trimString(providerConfig.baseUrl) == null - ? {} - : { baseUrl: trimString(providerConfig.baseUrl) }), - ...(trimString(providerConfig.modelId) == null - ? {} - : { model: trimString(providerConfig.modelId) }), - ...(trimString(providerConfig.voiceId) == null - ? {} - : { voice: trimString(providerConfig.voiceId) }), - ...(finiteNumber(providerConfig.speed) == null - ? {} - : { speed: finiteNumber(providerConfig.speed) }), - ...(trimString(providerConfig.instructions) == null - ? {} - : { instructions: trimString(providerConfig.instructions) }), + ...(baseUrl == null ? {} : { baseUrl }), + ...(modelId == null ? {} : { model: modelId }), + ...(voiceId == null ? {} : { voice: voiceId }), + ...(speed == null ? {} : { speed }), + ...(instructions == null ? {} : { instructions }), }; } else if (provider === "microsoft") { + const outputFormat = trimString(providerConfig.outputFormat); + const pitch = trimString(providerConfig.pitch); + const rate = trimString(providerConfig.rate); + const volume = trimString(providerConfig.volume); + const proxy = trimString(providerConfig.proxy); + const timeoutMs = finiteNumber(providerConfig.timeoutMs); talkTts.microsoft = { ...baseTts.microsoft, enabled: true, - ...(trimString(providerConfig.voiceId) == null - ? {} - : { voice: trimString(providerConfig.voiceId) }), - ...(trimString(providerConfig.languageCode) == null - ? {} - : { lang: trimString(providerConfig.languageCode) }), - ...(trimString(providerConfig.outputFormat) == null - ? {} - : { outputFormat: trimString(providerConfig.outputFormat) }), - ...(trimString(providerConfig.pitch) == null - ? 
{} - : { pitch: trimString(providerConfig.pitch) }), - ...(trimString(providerConfig.rate) == null ? {} : { rate: trimString(providerConfig.rate) }), - ...(trimString(providerConfig.volume) == null - ? {} - : { volume: trimString(providerConfig.volume) }), - ...(trimString(providerConfig.proxy) == null - ? {} - : { proxy: trimString(providerConfig.proxy) }), - ...(finiteNumber(providerConfig.timeoutMs) == null - ? {} - : { timeoutMs: finiteNumber(providerConfig.timeoutMs) }), + ...(voiceId == null ? {} : { voice: voiceId }), + ...(languageCode == null ? {} : { lang: languageCode }), + ...(outputFormat == null ? {} : { outputFormat }), + ...(pitch == null ? {} : { pitch }), + ...(rate == null ? {} : { rate }), + ...(volume == null ? {} : { volume }), + ...(proxy == null ? {} : { proxy }), + ...(timeoutMs == null ? {} : { timeoutMs }), }; } else { return { error: `talk.speak unavailable: unsupported talk provider '${resolved.provider}'` }; From 47e412bd0b2bd81ad02613a8ec7ed41228c82bcb Mon Sep 17 00:00:00 2001 From: Ayaan Zaidi Date: Fri, 20 Mar 2026 10:51:29 +0530 Subject: [PATCH 08/11] fix(review): preserve talk directive overrides --- .../ai/openclaw/app/voice/TalkModeManager.kt | 3 ++ src/gateway/protocol/schema/channels.ts | 1 + src/gateway/server-methods/talk.ts | 15 +++++- src/gateway/server.talk-config.test.ts | 47 +++++++++++++++++++ src/tts/providers/elevenlabs.ts | 4 +- src/tts/providers/microsoft.ts | 2 +- src/tts/tts.ts | 2 + 7 files changed, 70 insertions(+), 4 deletions(-) diff --git a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt index be62498e24e..d4433d72a9c 100644 --- a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt +++ b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt @@ -723,6 +723,9 @@ class TalkModeManager( TalkModeRuntime.validatedLanguage(directive?.language)?.let { put("language", 
JsonPrimitive(it)) } + directive?.outputFormat?.trim()?.takeIf { it.isNotEmpty() }?.let { + put("outputFormat", JsonPrimitive(it)) + } } val res = session.request("talk.speak", params.toString()) val root = json.parseToJsonElement(res).asObjectOrNull() ?: error("talk.speak returned invalid JSON") diff --git a/src/gateway/protocol/schema/channels.ts b/src/gateway/protocol/schema/channels.ts index 923432c7ac8..52f5ad597bc 100644 --- a/src/gateway/protocol/schema/channels.ts +++ b/src/gateway/protocol/schema/channels.ts @@ -21,6 +21,7 @@ export const TalkSpeakParamsSchema = Type.Object( text: NonEmptyString, voiceId: Type.Optional(Type.String()), modelId: Type.Optional(Type.String()), + outputFormat: Type.Optional(Type.String()), speed: Type.Optional(Type.Number()), stability: Type.Optional(Type.Number()), similarity: Type.Optional(Type.Number()), diff --git a/src/gateway/server-methods/talk.ts b/src/gateway/server-methods/talk.ts index 85f78e91b6a..acbede0b33d 100644 --- a/src/gateway/server-methods/talk.ts +++ b/src/gateway/server-methods/talk.ts @@ -69,7 +69,13 @@ function resolveTalkVoiceId( if (!aliases) { return requested; } - return aliases[normalizeAliasKey(requested)] ?? requested; + const normalizedRequested = normalizeAliasKey(requested); + for (const [alias, voiceId] of Object.entries(aliases)) { + if (normalizeAliasKey(alias) === normalizedRequested) { + return voiceId; + } + } + return requested; } function readTalkVoiceSettings( @@ -189,6 +195,7 @@ function buildTalkSpeakOverrides( ): TtsDirectiveOverrides { const voiceId = resolveTalkVoiceId(providerConfig, trimString(params.voiceId)); const modelId = trimString(params.modelId); + const outputFormat = trimString(params.outputFormat); const speed = finiteNumber(params.speed); const seed = finiteNumber(params.seed); const normalize = normalizeTextNormalization(params.normalize); @@ -212,6 +219,7 @@ function buildTalkSpeakOverrides( overrides.elevenlabs = { ...(voiceId == null ? 
{} : { voiceId }), ...(modelId == null ? {} : { modelId }), + ...(outputFormat == null ? {} : { outputFormat }), ...(seed == null ? {} : { seed }), ...(normalize == null ? {} : { applyTextNormalization: normalize }), ...(language == null ? {} : { languageCode: language }), @@ -230,7 +238,10 @@ function buildTalkSpeakOverrides( } if (provider === "microsoft") { - overrides.microsoft = voiceId == null ? undefined : { voice: voiceId }; + overrides.microsoft = { + ...(voiceId == null ? {} : { voice: voiceId }), + ...(outputFormat == null ? {} : { outputFormat }), + }; } return overrides; diff --git a/src/gateway/server.talk-config.test.ts b/src/gateway/server.talk-config.test.ts index eb2925db158..6433445795f 100644 --- a/src/gateway/server.talk-config.test.ts +++ b/src/gateway/server.talk-config.test.ts @@ -301,4 +301,51 @@ describe("gateway talk.config", () => { globalThis.fetch = originalFetch; } }); + + it("resolves talk voice aliases case-insensitively and forwards output format", async () => { + const { writeConfigFile } = await import("../config/config.js"); + await writeConfigFile({ + talk: { + provider: "elevenlabs", + providers: { + elevenlabs: { + apiKey: "elevenlabs-talk-key", // pragma: allowlist secret + voiceId: "voice-default", + voiceAliases: { + Clawd: "EXAVITQu4vr4xnSDxMaL", + }, + }, + }, + }, + }); + + const originalFetch = globalThis.fetch; + let fetchUrl: string | undefined; + const fetchMock = vi.fn(async (input: RequestInfo | URL) => { + fetchUrl = typeof input === "string" ? input : input instanceof URL ? 
input.href : input.url; + return new Response(new Uint8Array([4, 5, 6]), { status: 200 }); + }); + globalThis.fetch = fetchMock as typeof fetch; + + try { + await withServer(async (ws) => { + await connectOperator(ws, ["operator.read", "operator.write"]); + const res = await fetchTalkSpeak(ws, { + text: "Hello from talk mode.", + voiceId: "clawd", + outputFormat: "pcm_44100", + }); + expect(res.ok).toBe(true); + expect(res.payload?.provider).toBe("elevenlabs"); + expect(res.payload?.outputFormat).toBe("pcm_44100"); + expect(res.payload?.audioBase64).toBe(Buffer.from([4, 5, 6]).toString("base64")); + }); + + expect(fetchMock).toHaveBeenCalled(); + expect(fetchUrl).toContain("/v1/text-to-speech/EXAVITQu4vr4xnSDxMaL"); + expect(fetchUrl).toContain("output_format=pcm_44100"); + } finally { + globalThis.fetch = originalFetch; + } + }); }); diff --git a/src/tts/providers/elevenlabs.ts b/src/tts/providers/elevenlabs.ts index c22425926bf..99097fc42f3 100644 --- a/src/tts/providers/elevenlabs.ts +++ b/src/tts/providers/elevenlabs.ts @@ -72,7 +72,9 @@ export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin { if (!apiKey) { throw new Error("ElevenLabs API key missing"); } - const outputFormat = req.target === "voice-note" ? "opus_48000_64" : "mp3_44100_128"; + const outputFormat = + req.overrides?.elevenlabs?.outputFormat ?? + (req.target === "voice-note" ? 
"opus_48000_64" : "mp3_44100_128"); const audioBuffer = await elevenLabsTTS({ text: req.text, apiKey, diff --git a/src/tts/providers/microsoft.ts b/src/tts/providers/microsoft.ts index ba2511e4de6..f6c5aa8c379 100644 --- a/src/tts/providers/microsoft.ts +++ b/src/tts/providers/microsoft.ts @@ -83,7 +83,7 @@ export function buildMicrosoftSpeechProvider(): SpeechProviderPlugin { const tempRoot = resolvePreferredOpenClawTmpDir(); mkdirSync(tempRoot, { recursive: true, mode: 0o700 }); const tempDir = mkdtempSync(path.join(tempRoot, "tts-microsoft-")); - let outputFormat = req.config.edge.outputFormat; + let outputFormat = req.overrides?.microsoft?.outputFormat ?? req.config.edge.outputFormat; const fallbackOutputFormat = outputFormat !== DEFAULT_EDGE_OUTPUT_FORMAT ? DEFAULT_EDGE_OUTPUT_FORMAT : undefined; diff --git a/src/tts/tts.ts b/src/tts/tts.ts index c64dda83909..17a7c2fc981 100644 --- a/src/tts/tts.ts +++ b/src/tts/tts.ts @@ -167,6 +167,7 @@ export type TtsDirectiveOverrides = { elevenlabs?: { voiceId?: string; modelId?: string; + outputFormat?: string; seed?: number; applyTextNormalization?: "auto" | "on" | "off"; languageCode?: string; @@ -174,6 +175,7 @@ export type TtsDirectiveOverrides = { }; microsoft?: { voice?: string; + outputFormat?: string; }; }; From 61965e500f93b039d21b9dbca34b320ed23dc704 Mon Sep 17 00:00:00 2001 From: Ayaan Zaidi Date: Fri, 20 Mar 2026 10:56:18 +0530 Subject: [PATCH 09/11] fix: route Android Talk synthesis through the gateway (#50849) --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 37ff9e33f36..553fab9d3a8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -45,6 +45,7 @@ Docs: https://docs.openclaw.ai - Plugins/context engines: expose `delegateCompactionToRuntime(...)` on the public plugin SDK, refactor the legacy engine to use the shared helper, and clarify `ownsCompaction` delegation semantics for non-owning engines. (#49061) Thanks @jalehman. 
- Plugins/MiniMax: add MiniMax-M2.7 and MiniMax-M2.7-highspeed models and update the default model from M2.5 to M2.7. (#49691) Thanks @liyuan97. - Plugins/Xiaomi: switch the bundled Xiaomi provider to the `/v1` OpenAI-compatible endpoint and add MiMo V2 Pro plus MiMo V2 Omni to the built-in catalog. (#49214) thanks @DJjjjhao. +- Android/Talk: move Talk speech synthesis behind gateway `talk.speak`, keep Talk secrets on the gateway, and switch Android playback to final-response audio instead of device-local ElevenLabs streaming. (#50849) - Plugins/Matrix: add `allowBots` room policy so configured Matrix bot accounts can talk to each other, with optional mention-only gating. Thanks @gumadeiras. - Plugins/Matrix: add per-account `allowPrivateNetwork` opt-in for private/internal homeservers, while keeping public cleartext homeservers blocked. Thanks @gumadeiras. - Web tools/Tavily: add Tavily as a bundled web-search provider with dedicated `tavily_search` and `tavily_extract` tools, using canonical plugin-owned config under `plugins.entries.tavily.config.webSearch.*`. (#49200) thanks @lakshyaag-tavily. 
From 2afd65741cdaa4808f43b11a0947a8f1fe6fe257 Mon Sep 17 00:00:00 2001 From: Ayaan Zaidi Date: Fri, 20 Mar 2026 11:07:13 +0530 Subject: [PATCH 10/11] fix: preserve talk provider and speaking state --- .../ai/openclaw/app/voice/TalkModeManager.kt | 2 +- src/gateway/server-methods/talk.ts | 2 - src/gateway/server.talk-config.test.ts | 52 +++++++++++++++++++ 3 files changed, 53 insertions(+), 3 deletions(-) diff --git a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt index d4433d72a9c..2a82588b46b 100644 --- a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt +++ b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt @@ -748,7 +748,7 @@ class TalkModeManager( private suspend fun playGatewaySpeech(speech: GatewayTalkSpeech, playbackToken: Long) { ensurePlaybackActive(playbackToken) - stopSpeaking(resetInterrupt = false) + cleanupPlayer() ensurePlaybackActive(playbackToken) val audioBytes = diff --git a/src/gateway/server-methods/talk.ts b/src/gateway/server-methods/talk.ts index acbede0b33d..3930dc4c4ca 100644 --- a/src/gateway/server-methods/talk.ts +++ b/src/gateway/server-methods/talk.ts @@ -171,8 +171,6 @@ function buildTalkTtsConfig( ...(proxy == null ? {} : { proxy }), ...(timeoutMs == null ? 
{} : { timeoutMs }), }; - } else { - return { error: `talk.speak unavailable: unsupported talk provider '${resolved.provider}'` }; } return { diff --git a/src/gateway/server.talk-config.test.ts b/src/gateway/server.talk-config.test.ts index 6433445795f..1dccbfab5c6 100644 --- a/src/gateway/server.talk-config.test.ts +++ b/src/gateway/server.talk-config.test.ts @@ -6,6 +6,8 @@ import { publicKeyRawBase64UrlFromPem, signDevicePayload, } from "../infra/device-identity.js"; +import { createEmptyPluginRegistry } from "../plugins/registry-empty.js"; +import { getActivePluginRegistry, setActivePluginRegistry } from "../plugins/runtime.js"; import { withEnvAsync } from "../test-utils/env.js"; import { buildDeviceAuthPayload } from "./device-auth.js"; import { validateTalkConfigResult } from "./protocol/index.js"; @@ -348,4 +350,54 @@ describe("gateway talk.config", () => { globalThis.fetch = originalFetch; } }); + + it("allows extension speech providers through talk.speak", async () => { + const { writeConfigFile } = await import("../config/config.js"); + await writeConfigFile({ + talk: { + provider: "acme", + providers: { + acme: { + voiceId: "plugin-voice", + }, + }, + }, + }); + + const previousRegistry = getActivePluginRegistry() ?? 
createEmptyPluginRegistry(); + setActivePluginRegistry({ + ...createEmptyPluginRegistry(), + speechProviders: [ + { + pluginId: "acme-plugin", + source: "test", + provider: { + id: "acme", + label: "Acme Speech", + isConfigured: () => true, + synthesize: async () => ({ + audioBuffer: Buffer.from([7, 8, 9]), + outputFormat: "mp3", + fileExtension: ".mp3", + voiceCompatible: false, + }), + }, + }, + ], + }); + + try { + await withServer(async (ws) => { + await connectOperator(ws, ["operator.read", "operator.write"]); + const res = await fetchTalkSpeak(ws, { + text: "Hello from plugin talk mode.", + }); + expect(res.ok).toBe(true); + expect(res.payload?.provider).toBe("acme"); + expect(res.payload?.audioBase64).toBe(Buffer.from([7, 8, 9]).toString("base64")); + }); + } finally { + setActivePluginRegistry(previousRegistry); + } + }); }); From a73e517ae3b8fc1f6c1ab48c2a98274eb36accb9 Mon Sep 17 00:00:00 2001 From: Ayaan Zaidi Date: Fri, 20 Mar 2026 11:12:53 +0530 Subject: [PATCH 11/11] build(protocol): regenerate swift talk models --- .../OpenClawProtocol/GatewayModels.swift | 92 +++++++++++++++++++ .../OpenClawProtocol/GatewayModels.swift | 92 +++++++++++++++++++ 2 files changed, 184 insertions(+) diff --git a/apps/macos/Sources/OpenClawProtocol/GatewayModels.swift b/apps/macos/Sources/OpenClawProtocol/GatewayModels.swift index 6f97c9bf9f1..0b1d7b13e01 100644 --- a/apps/macos/Sources/OpenClawProtocol/GatewayModels.swift +++ b/apps/macos/Sources/OpenClawProtocol/GatewayModels.swift @@ -2012,6 +2012,98 @@ public struct TalkConfigResult: Codable, Sendable { } } +public struct TalkSpeakParams: Codable, Sendable { + public let text: String + public let voiceid: String? + public let modelid: String? + public let outputformat: String? + public let speed: Double? + public let stability: Double? + public let similarity: Double? + public let style: Double? + public let speakerboost: Bool? + public let seed: Int? + public let normalize: String? + public let language: String? 
+ + public init( + text: String, + voiceid: String?, + modelid: String?, + outputformat: String?, + speed: Double?, + stability: Double?, + similarity: Double?, + style: Double?, + speakerboost: Bool?, + seed: Int?, + normalize: String?, + language: String?) + { + self.text = text + self.voiceid = voiceid + self.modelid = modelid + self.outputformat = outputformat + self.speed = speed + self.stability = stability + self.similarity = similarity + self.style = style + self.speakerboost = speakerboost + self.seed = seed + self.normalize = normalize + self.language = language + } + + private enum CodingKeys: String, CodingKey { + case text + case voiceid = "voiceId" + case modelid = "modelId" + case outputformat = "outputFormat" + case speed + case stability + case similarity + case style + case speakerboost = "speakerBoost" + case seed + case normalize + case language + } +} + +public struct TalkSpeakResult: Codable, Sendable { + public let audiobase64: String + public let provider: String + public let outputformat: String? + public let voicecompatible: Bool? + public let mimetype: String? + public let fileextension: String? + + public init( + audiobase64: String, + provider: String, + outputformat: String?, + voicecompatible: Bool?, + mimetype: String?, + fileextension: String?) + { + self.audiobase64 = audiobase64 + self.provider = provider + self.outputformat = outputformat + self.voicecompatible = voicecompatible + self.mimetype = mimetype + self.fileextension = fileextension + } + + private enum CodingKeys: String, CodingKey { + case audiobase64 = "audioBase64" + case provider + case outputformat = "outputFormat" + case voicecompatible = "voiceCompatible" + case mimetype = "mimeType" + case fileextension = "fileExtension" + } +} + public struct ChannelsStatusParams: Codable, Sendable { public let probe: Bool? public let timeoutms: Int? 
diff --git a/apps/shared/OpenClawKit/Sources/OpenClawProtocol/GatewayModels.swift b/apps/shared/OpenClawKit/Sources/OpenClawProtocol/GatewayModels.swift index 6f97c9bf9f1..0b1d7b13e01 100644 --- a/apps/shared/OpenClawKit/Sources/OpenClawProtocol/GatewayModels.swift +++ b/apps/shared/OpenClawKit/Sources/OpenClawProtocol/GatewayModels.swift @@ -2012,6 +2012,98 @@ public struct TalkConfigResult: Codable, Sendable { } } +public struct TalkSpeakParams: Codable, Sendable { + public let text: String + public let voiceid: String? + public let modelid: String? + public let outputformat: String? + public let speed: Double? + public let stability: Double? + public let similarity: Double? + public let style: Double? + public let speakerboost: Bool? + public let seed: Int? + public let normalize: String? + public let language: String? + + public init( + text: String, + voiceid: String?, + modelid: String?, + outputformat: String?, + speed: Double?, + stability: Double?, + similarity: Double?, + style: Double?, + speakerboost: Bool?, + seed: Int?, + normalize: String?, + language: String?) + { + self.text = text + self.voiceid = voiceid + self.modelid = modelid + self.outputformat = outputformat + self.speed = speed + self.stability = stability + self.similarity = similarity + self.style = style + self.speakerboost = speakerboost + self.seed = seed + self.normalize = normalize + self.language = language + } + + private enum CodingKeys: String, CodingKey { + case text + case voiceid = "voiceId" + case modelid = "modelId" + case outputformat = "outputFormat" + case speed + case stability + case similarity + case style + case speakerboost = "speakerBoost" + case seed + case normalize + case language + } +} + +public struct TalkSpeakResult: Codable, Sendable { + public let audiobase64: String + public let provider: String + public let outputformat: String? + public let voicecompatible: Bool? + public let mimetype: String? + public let fileextension: String? 
+ + public init( + audiobase64: String, + provider: String, + outputformat: String?, + voicecompatible: Bool?, + mimetype: String?, + fileextension: String?) + { + self.audiobase64 = audiobase64 + self.provider = provider + self.outputformat = outputformat + self.voicecompatible = voicecompatible + self.mimetype = mimetype + self.fileextension = fileextension + } + + private enum CodingKeys: String, CodingKey { + case audiobase64 = "audioBase64" + case provider + case outputformat = "outputFormat" + case voicecompatible = "voiceCompatible" + case mimetype = "mimeType" + case fileextension = "fileExtension" + } +} + public struct ChannelsStatusParams: Codable, Sendable { public let probe: Bool? public let timeoutms: Int?