diff --git a/src/agents/tools/web-fetch.ts b/src/agents/tools/web-fetch.ts
index fdb5ade5172..931d8c778f0 100644
--- a/src/agents/tools/web-fetch.ts
+++ b/src/agents/tools/web-fetch.ts
@@ -2,7 +2,11 @@ import { Type } from "@sinclair/typebox";
 import type { OpenClawConfig } from "../../config/config.js";
 import type { AnyAgentTool } from "./common.js";
 import { fetchWithSsrFGuard } from "../../infra/net/fetch-guard.js";
-import { SsrFBlockedError } from "../../infra/net/ssrf.js";
+import {
+  matchesHostnameAllowlist,
+  normalizeHostnameAllowlist,
+  SsrFBlockedError,
+} from "../../infra/net/ssrf.js";
 import { logDebug } from "../../logger.js";
 import { wrapExternalContent, wrapWebContent } from "../../security/external-content.js";
 import { normalizeSecretInput } from "../../utils/normalize-secret-input.js";
@@ -68,6 +72,47 @@ type WebFetchConfig = NonNullable<OpenClawConfig["tools"]>["web"] extends infer
     : undefined
   : undefined;
 
+type WebConfig = NonNullable<OpenClawConfig["tools"]>["web"];
+
+/**
+ * Reads the shared `tools.web.urlAllowlist` setting for the web_fetch tool.
+ *
+ * @param web - The `tools.web` config section, if present.
+ * @returns The configured patterns, or `undefined` when the section or key is
+ *   missing, the value is not an array, or the array is empty.
+ */
+export function resolveFetchUrlAllowlist(web?: WebConfig): string[] | undefined {
+  if (!web || typeof web !== "object") {
+    return undefined;
+  }
+  if (!("urlAllowlist" in web)) {
+    return undefined;
+  }
+  const allowlist = web.urlAllowlist;
+  if (!Array.isArray(allowlist)) {
+    return undefined;
+  }
+  return allowlist.length > 0 ? allowlist : undefined;
+}
+
+/**
+ * Checks a URL's hostname against a hostname allowlist.
+ *
+ * @param url - Absolute URL to check.
+ * @param allowlist - Hostname patterns such as `example.com` or `*.github.com`.
+ * @returns `true` when the hostname matches (an empty allowlist matches every
+ *   parseable URL); `false` when it does not match or the URL cannot be parsed.
+ */
+export function isUrlAllowedByAllowlist(url: string, allowlist: string[]): boolean {
+  try {
+    const hostname = new URL(url).hostname;
+    const normalizedAllowlist = normalizeHostnameAllowlist(allowlist);
+    return matchesHostnameAllowlist(hostname, normalizedAllowlist);
+  } catch {
+    return false;
+  }
+}
+
 type FirecrawlFetchConfig =
   | {
       enabled?: boolean;
@@ -732,6 +777,7 @@ export function createWebFetchTool(options?: {
     (fetch && "userAgent" in fetch && typeof fetch.userAgent === "string" && fetch.userAgent) ||
     DEFAULT_FETCH_USER_AGENT;
   const maxResponseBytes = resolveFetchMaxResponseBytes(fetch);
+  const urlAllowlist = resolveFetchUrlAllowlist(options?.config?.tools?.web);
   return {
     label: "Web Fetch",
     name: "web_fetch",
@@ -741,6 +787,25 @@ export function createWebFetchTool(options?: {
     execute: async (_toolCallId, args) => {
       const params = args as Record<string, unknown>;
       const url = readStringParam(params, "url", { required: true });
+
+      // Check URL against allowlist if configured
+      if (urlAllowlist && urlAllowlist.length > 0) {
+        if (!isUrlAllowedByAllowlist(url, urlAllowlist)) {
+          let hostname: string;
+          try {
+            hostname = new URL(url).hostname;
+          } catch {
+            hostname = url;
+          }
+          return jsonResult({
+            error: "url_not_allowed",
+            message: `URL not in allowlist. Allowed domains: ${urlAllowlist.join(", ")}`,
+            blockedUrl: url,
+            blockedHostname: hostname,
+          });
+        }
+      }
+
       const extractMode = readStringParam(params, "extractMode") === "text" ? "text" : "markdown";
       const maxChars = readNumberParam(params, "maxChars", { integer: true });
       const maxCharsCap = resolveFetchMaxCharsCap(fetch);
diff --git a/src/agents/tools/web-search.ts b/src/agents/tools/web-search.ts
index be174b951d3..b03abf6bc59 100644
--- a/src/agents/tools/web-search.ts
+++ b/src/agents/tools/web-search.ts
@@ -2,6 +2,7 @@ import { Type } from "@sinclair/typebox";
 import type { OpenClawConfig } from "../../config/config.js";
 import type { AnyAgentTool } from "./common.js";
 import { formatCliCommand } from "../../cli/command-format.js";
+import { matchesHostnameAllowlist, normalizeHostnameAllowlist } from "../../infra/net/ssrf.js";
 import { wrapWebContent } from "../../security/external-content.js";
 import { normalizeSecretInput } from "../../utils/normalize-secret-input.js";
 import { jsonResult, readNumberParam, readStringParam } from "./common.js";
@@ -75,6 +76,55 @@ type WebSearchConfig = NonNullable<OpenClawConfig["tools"]>["web"] extends infer
     : undefined
   : undefined;
 
+type WebConfig = NonNullable<OpenClawConfig["tools"]>["web"];
+
+/**
+ * Reads the shared `tools.web.urlAllowlist` setting for the web_search tool.
+ *
+ * @param web - The `tools.web` config section, if present.
+ * @returns The configured patterns, or `undefined` when the section or key is
+ *   missing, the value is not an array, or the array is empty.
+ */
+export function resolveUrlAllowlist(web?: WebConfig): string[] | undefined {
+  if (!web || typeof web !== "object") {
+    return undefined;
+  }
+  if (!("urlAllowlist" in web)) {
+    return undefined;
+  }
+  const allowlist = web.urlAllowlist;
+  if (!Array.isArray(allowlist)) {
+    return undefined;
+  }
+  return allowlist.length > 0 ? allowlist : undefined;
+}
+
+/**
+ * Keeps only the search results whose URL hostname matches the allowlist.
+ *
+ * Results without a URL are kept because there is no link to restrict. Results
+ * whose URL cannot be parsed are dropped: a hostname that cannot be determined
+ * cannot be shown to match, which is also how web_fetch treats such URLs.
+ *
+ * @param results - Search results to filter; the input array is not mutated.
+ * @param allowlist - Hostname patterns; an empty list disables filtering.
+ * @returns A filtered array, or `results` itself when the allowlist is empty.
+ */
+export function filterResultsByAllowlist(
+  results: Array<{ url?: string; siteName?: string }>,
+  allowlist: string[],
+): Array<{ url?: string; siteName?: string }> {
+  if (allowlist.length === 0) {
+    return results;
+  }
+  const normalizedAllowlist = normalizeHostnameAllowlist(allowlist);
+  return results.filter((result) => {
+    if (!result.url) {
+      return true; // Keep entries without URL
+    }
+    try {
+      const hostname = new URL(result.url).hostname;
+      return matchesHostnameAllowlist(hostname, normalizedAllowlist);
+    } catch {
+      return false; // Drop entries whose URL cannot be parsed (fail closed)
+    }
+  });
+}
+
 type BraveSearchResult = {
   title?: string;
   url?: string;
@@ -566,6 +622,7 @@ async function runWebSearch(params: {
   perplexityModel?: string;
   grokModel?: string;
   grokInlineCitations?: boolean;
+  urlAllowlist?: string[];
 }): Promise<Record<string, unknown>> {
   const cacheKey = normalizeCacheKey(
     params.provider === "brave"
@@ -688,10 +745,15 @@ async function runWebSearch(params: {
     };
   });
 
+  // Filter results by urlAllowlist if configured
+  const filteredResults = params.urlAllowlist
+    ? filterResultsByAllowlist(mapped, params.urlAllowlist)
+    : mapped;
+
   const payload = {
     query: params.query,
     provider: params.provider,
-    count: mapped.length,
+    count: filteredResults.length,
     tookMs: Date.now() - start,
     externalContent: {
       untrusted: true,
@@ -699,7 +761,7 @@ async function runWebSearch(params: {
       provider: params.provider,
       wrapped: true,
     },
-    results: mapped,
+    results: filteredResults,
   };
   writeCache(SEARCH_CACHE, cacheKey, payload, params.cacheTtlMs);
   return payload;
@@ -717,6 +779,7 @@ export function createWebSearchTool(options?: {
   const provider = resolveSearchProvider(search);
   const perplexityConfig = resolvePerplexityConfig(search);
   const grokConfig = resolveGrokConfig(search);
+  const urlAllowlist = resolveUrlAllowlist(options?.config?.tools?.web);
 
   const description =
     provider === "perplexity"
@@ -786,6 +849,7 @@ export function createWebSearchTool(options?: {
         perplexityModel: resolvePerplexityModel(perplexityConfig),
         grokModel: resolveGrokModel(grokConfig),
         grokInlineCitations: resolveGrokInlineCitations(grokConfig),
+        urlAllowlist,
       });
       return jsonResult(result);
     },
@@ -803,4 +867,6 @@ export const __testing = {
   resolveGrokModel,
   resolveGrokInlineCitations,
   extractGrokContent,
+  resolveUrlAllowlist,
+  filterResultsByAllowlist,
 } as const;
diff --git a/src/agents/tools/web-tools.url-allowlist.test.ts b/src/agents/tools/web-tools.url-allowlist.test.ts
new file mode 100644
index 00000000000..c3dbb24b6ce
--- /dev/null
+++ b/src/agents/tools/web-tools.url-allowlist.test.ts
@@ -0,0 +1,199 @@
+import { describe, expect, it } from "vitest";
+import type { OpenClawConfig } from "../../config/config.js";
+import { isUrlAllowedByAllowlist, resolveFetchUrlAllowlist } from "./web-fetch.js";
+import { filterResultsByAllowlist, resolveUrlAllowlist } from "./web-search.js";
+
+describe("web-search urlAllowlist", () => {
+  describe("resolveUrlAllowlist", () => {
+    it("returns undefined when web config is undefined", () => {
+      const result = resolveUrlAllowlist(undefined);
+      expect(result).toBeUndefined();
+    });
+
+    it("returns undefined when urlAllowlist is not set", () => {
+      const result = resolveUrlAllowlist({ search: { enabled: true } });
+      expect(result).toBeUndefined();
+    });
+
+    it("returns undefined when urlAllowlist is empty array", () => {
+      const result = resolveUrlAllowlist({ urlAllowlist: [], search: { enabled: true } });
+      expect(result).toBeUndefined();
+    });
+
+    it("returns the allowlist when configured", () => {
+      const result = resolveUrlAllowlist({
+        urlAllowlist: ["example.com", "*.github.com"],
+        search: { enabled: true },
+      });
+      expect(result).toEqual(["example.com", "*.github.com"]);
+    });
+  });
+
+  describe("filterResultsByAllowlist", () => {
+    const results = [
+      { url: "https://example.com/page", siteName: "example.com" },
+      { url: "https://api.github.com/user/repo", siteName: "api.github.com" },
+      { url: "https://docs.openclaw.ai/guide", siteName: "docs.openclaw.ai" },
+      { url: "https://blocked.org/page", siteName: "blocked.org" },
+      { url: undefined, siteName: "unknown" }, // entry without URL
+    ];
+
+    it("returns all results when allowlist is empty", () => {
+      const result = filterResultsByAllowlist(results, []);
+      expect(result).toHaveLength(5);
+    });
+
+    it("filters results by exact domain match", () => {
+      const result = filterResultsByAllowlist(results, ["example.com"]);
+      expect(result).toHaveLength(2); // example.com + entry without URL
+      expect(result.map((r) => r.url)).toContain("https://example.com/page");
+      expect(result.map((r) => r.url)).not.toContain("https://api.github.com/user/repo");
+    });
+
+    it("filters results by wildcard pattern", () => {
+      const result = filterResultsByAllowlist(results, ["*.github.com"]);
+      expect(result).toHaveLength(2); // api.github.com + entry without URL
+      expect(result.map((r) => r.url)).toContain("https://api.github.com/user/repo");
+      expect(result.map((r) => r.url)).not.toContain("https://example.com/page");
+    });
+
+    it("filters results with multiple patterns", () => {
+      const result = filterResultsByAllowlist(results, ["example.com", "*.github.com"]);
+      expect(result).toHaveLength(3); // example.com + api.github.com + entry without URL
+      expect(result.map((r) => r.url)).toContain("https://example.com/page");
+      expect(result.map((r) => r.url)).toContain("https://api.github.com/user/repo");
+      expect(result.map((r) => r.url)).not.toContain("https://blocked.org/page");
+    });
+
+    it("keeps only allowlisted URLs and entries without URLs", () => {
+      const result = filterResultsByAllowlist(results, ["blocked.org"]);
+      // With allowlist ["blocked.org"], we ONLY keep blocked.org URLs and entries without URLs
+      expect(result).toHaveLength(2); // blocked.org + entry without URL
+      expect(result.map((r) => r.url)).toContain("https://blocked.org/page");
+    });
+
+    it("drops entries whose URL cannot be parsed", () => {
+      const result = filterResultsByAllowlist([{ url: "not-a-url" }], ["example.com"]);
+      expect(result).toHaveLength(0);
+    });
+  });
+});
+
+describe("web-fetch urlAllowlist", () => {
+  describe("resolveFetchUrlAllowlist", () => {
+    it("returns undefined when web config is undefined", () => {
+      const result = resolveFetchUrlAllowlist(undefined);
+      expect(result).toBeUndefined();
+    });
+
+    it("returns undefined when urlAllowlist is not set", () => {
+      const result = resolveFetchUrlAllowlist({ fetch: { enabled: true } });
+      expect(result).toBeUndefined();
+    });
+
+    it("returns undefined when urlAllowlist is empty array", () => {
+      const result = resolveFetchUrlAllowlist({ urlAllowlist: [], fetch: { enabled: true } });
+      expect(result).toBeUndefined();
+    });
+
+    it("returns the allowlist when configured", () => {
+      const result = resolveFetchUrlAllowlist({
+        urlAllowlist: ["example.com", "*.github.com"],
+        fetch: { enabled: true },
+      });
+      expect(result).toEqual(["example.com", "*.github.com"]);
+    });
+  });
+
+  describe("isUrlAllowedByAllowlist", () => {
+    it("allows any URL when allowlist is empty", () => {
+      const result = isUrlAllowedByAllowlist("https://example.com/page", []);
+      expect(result).toBe(true);
+    });
+
+    it("blocks URLs not in allowlist", () => {
+      const result = isUrlAllowedByAllowlist("https://blocked.com/page", ["example.com"]);
+      expect(result).toBe(false);
+    });
+
+    it("allows URLs matching exact domain", () => {
+      const result = isUrlAllowedByAllowlist("https://example.com/page", ["example.com"]);
+      expect(result).toBe(true);
+    });
+
+    it("allows URLs matching wildcard pattern", () => {
+      const result = isUrlAllowedByAllowlist("https://api.github.com/users", ["*.github.com"]);
+      expect(result).toBe(true);
+    });
+
+    it("blocks URLs not matching wildcard pattern", () => {
+      const result = isUrlAllowedByAllowlist("https://github.com", ["*.github.com"]);
+      // Exact match "github.com" should not match "*.github.com" pattern
+      // because *.github.com requires at least one subdomain
+      expect(result).toBe(false);
+    });
+
+    it("allows subdomain with wildcard pattern", () => {
+      const result = isUrlAllowedByAllowlist("https://docs.openclaw.ai/guide", ["*.openclaw.ai"]);
+      expect(result).toBe(true);
+    });
+
+    it("handles URLs without protocol", () => {
+      const result = isUrlAllowedByAllowlist("not-a-url", ["example.com"]);
+      expect(result).toBe(false);
+    });
+  });
+
+  describe("web_fetch error response", () => {
+    // This test verifies the error format returned when URL is blocked
+    it("returns correct error format for blocked URL", () => {
+      // Simulate the error response format
+      const urlAllowlist = ["example.com"];
+      const url = "https://blocked.com/page";
+
+      // Assert the precondition so the checks below can never be silently skipped.
+      expect(isUrlAllowedByAllowlist(url, urlAllowlist)).toBe(false);
+
+      const hostname = new URL(url).hostname;
+      const errorResponse = {
+        error: "url_not_allowed",
+        message: `URL not in allowlist. Allowed domains: ${urlAllowlist.join(", ")}`,
+        blockedUrl: url,
+        blockedHostname: hostname,
+      };
+
+      expect(errorResponse.error).toBe("url_not_allowed");
+      expect(errorResponse.message).toContain("example.com");
+      expect(errorResponse.blockedUrl).toBe("https://blocked.com/page");
+      expect(errorResponse.blockedHostname).toBe("blocked.com");
+    });
+  });
+});
+
+describe("integration with config", () => {
+  it("reads urlAllowlist from tools.web config", () => {
+    const config: OpenClawConfig = {
+      tools: {
+        web: {
+          urlAllowlist: ["example.com", "*.github.com"],
+          search: { enabled: true },
+          fetch: { enabled: true },
+        },
+      },
+    };
+
+    const searchAllowlist = resolveUrlAllowlist(config.tools?.web);
+    const fetchAllowlist = resolveFetchUrlAllowlist(config.tools?.web);
+
+    expect(searchAllowlist).toEqual(["example.com", "*.github.com"]);
+    expect(fetchAllowlist).toEqual(["example.com", "*.github.com"]);
+  });
+
+  it("works with undefined config", () => {
+    const searchAllowlist = resolveUrlAllowlist(undefined);
+    const fetchAllowlist = resolveFetchUrlAllowlist(undefined);
+
+    expect(searchAllowlist).toBeUndefined();
+    expect(fetchAllowlist).toBeUndefined();
+  });
+});
diff --git a/src/config/schema.help.ts b/src/config/schema.help.ts
index 11c92956dfc..b8f16123db2 100644
--- a/src/config/schema.help.ts
+++ b/src/config/schema.help.ts
@@ -93,16 +93,18 @@ export const FIELD_HELP: Record<string, string> = {
   "tools.web.search.enabled": "Enable the web_search tool (requires a provider API key).",
   "tools.web.search.provider": 'Search provider ("brave" or "perplexity").',
   "tools.web.search.apiKey": "Brave Search API key (fallback: BRAVE_API_KEY env var).",
   "tools.web.search.maxResults": "Default number of results to return (1-10).",
   "tools.web.search.timeoutSeconds": "Timeout in seconds for web_search requests.",
   "tools.web.search.cacheTtlMinutes": "Cache TTL in minutes for web_search results.",
   "tools.web.search.perplexity.apiKey":
     "Perplexity or OpenRouter API key (fallback: PERPLEXITY_API_KEY or OPENROUTER_API_KEY env var).",
   "tools.web.search.perplexity.baseUrl":
     "Perplexity base URL override (default: https://openrouter.ai/api/v1 or https://api.perplexity.ai).",
   "tools.web.search.perplexity.model":
     'Perplexity model override (default: "perplexity/sonar-pro").',
+  "tools.web.urlAllowlist":
+    "Optional URL/domain allowlist shared by web_search and web_fetch. Accepts domain patterns like 'example.com', '*.github.com', 'docs.openclaw.ai'. When configured, web_fetch only fetches matching URLs and Brave web_search results are filtered to matching domains.",
   "tools.web.fetch.enabled": "Enable the web_fetch tool (lightweight HTTP fetch).",
   "tools.web.fetch.maxChars": "Max characters returned by web_fetch (truncated).",
   "tools.web.fetch.maxCharsCap":
     "Hard cap for web_fetch maxChars (applies to config and tool calls).",
diff --git a/src/config/types.tools.ts b/src/config/types.tools.ts
index 4f02166d7aa..db8a2352ed9 100644
--- a/src/config/types.tools.ts
+++ b/src/config/types.tools.ts
@@ -355,6 +355,8 @@ export type ToolsConfig = {
   /** Optional tool policy overrides keyed by provider id or "provider/model". */
   byProvider?: Record<string, ToolPolicyConfig>;
   web?: {
+    /** Optional URL/domain allowlist for web tools. When configured, only URLs matching these patterns are allowed. */
+    urlAllowlist?: string[];
     search?: {
       /** Enable web search tool (default: true when API key is present). */
       enabled?: boolean;
diff --git a/src/config/zod-schema.agent-runtime.ts b/src/config/zod-schema.agent-runtime.ts
index c049e63537a..91672ae6b9f 100644
--- a/src/config/zod-schema.agent-runtime.ts
+++ b/src/config/zod-schema.agent-runtime.ts
@@ -267,6 +267,7 @@ export const ToolsWebFetchSchema = z
 
 export const ToolsWebSchema = z
   .object({
+    urlAllowlist: z.array(z.string()).optional(),
     search: ToolsWebSearchSchema,
     fetch: ToolsWebFetchSchema,
   })
diff --git a/src/infra/net/ssrf.ts b/src/infra/net/ssrf.ts
index fce4204f4ff..3a7b1946195 100644
--- a/src/infra/net/ssrf.ts
+++ b/src/infra/net/ssrf.ts
@@ -33,7 +33,7 @@ function normalizeHostnameSet(values?: string[]): Set<string> {
   return new Set(values.map((value) => normalizeHostname(value)).filter(Boolean));
 }
 
-function normalizeHostnameAllowlist(values?: string[]): string[] {
+export function normalizeHostnameAllowlist(values?: string[]): string[] {
   if (!values || values.length === 0) {
     return [];
   }
@@ -57,7 +57,7 @@ function isHostnameAllowedByPattern(hostname: string, pattern: string): boolean
   return hostname === pattern;
 }
 
-function matchesHostnameAllowlist(hostname: string, allowlist: string[]): boolean {
+export function matchesHostnameAllowlist(hostname: string, allowlist: string[]): boolean {
   if (allowlist.length === 0) {
     return true;
   }