From 54bf5d0f4123b949e07a8266b9aed6fe745f9909 Mon Sep 17 00:00:00 2001 From: Yaxuan42 Date: Fri, 13 Feb 2026 22:46:20 +0800 Subject: [PATCH] feat(web-fetch): support Cloudflare Markdown for Agents (#15376) Merged via /review-pr -> /prepare-pr -> /merge-pr. Prepared head SHA: d0528dc429840b16bfcef4d921f3229653d38143 Co-authored-by: Yaxuan42 <184813557+Yaxuan42@users.noreply.github.com> Co-authored-by: steipete <58493+steipete@users.noreply.github.com> Reviewed-by: @steipete --- CHANGELOG.md | 1 + .../tools/web-fetch.cf-markdown.test.ts | 186 ++++++++++++++++++ src/agents/tools/web-fetch.ts | 28 ++- 3 files changed, 213 insertions(+), 2 deletions(-) create mode 100644 src/agents/tools/web-fetch.cf-markdown.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 4e9bd5fd9af..b6cf77d85ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,7 @@ Docs: https://docs.openclaw.ai - Sessions/Agents: pass `agentId` when resolving existing transcript paths in reply runs so non-default agents and heartbeat/chat handlers no longer fail with `Session file path must be within sessions directory`. (#15141) Thanks @Goldenmonstew. - Sessions/Agents: pass `agentId` through status and usage transcript-resolution paths (auto-reply, gateway usage APIs, and session cost/log loaders) so non-default agents can resolve absolute session files without path-validation failures. (#15103) Thanks @jalehman. - Signal/Install: auto-install `signal-cli` via Homebrew on non-x64 Linux architectures, avoiding x86_64 native binary `Exec format error` failures on arm64/arm hosts. (#15443) Thanks @jogvan-k. +- Web tools/web_fetch: prefer `text/markdown` responses for Cloudflare Markdown for Agents, add `cf-markdown` extraction for markdown bodies, and redact fetched URLs in `x-markdown-tokens` debug logs to avoid leaking raw paths/query params. (#15376) Thanks @Yaxuan42. ## 2026.2.12 diff --git a/src/agents/tools/web-fetch.cf-markdown.test.ts b/src/agents/tools/web-fetch.cf-markdown.test.ts new file mode 100644 index 00000000000..d73300681fc --- /dev/null +++ b/src/agents/tools/web-fetch.cf-markdown.test.ts @@ -0,0 +1,186 @@ +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import * as ssrf from "../../infra/net/ssrf.js"; +import * as logger from "../../logger.js"; + +const lookupMock = vi.fn(); +const resolvePinnedHostname = ssrf.resolvePinnedHostname; + +function makeHeaders(map: Record): { get: (key: string) => string | null } { + return { + get: (key) => map[key.toLowerCase()] ?? null, + }; +} + +function markdownResponse(body: string, extraHeaders: Record = {}): Response { + return { + ok: true, + status: 200, + headers: makeHeaders({ "content-type": "text/markdown; charset=utf-8", ...extraHeaders }), + text: async () => body, + } as Response; +} + +function htmlResponse(body: string): Response { + return { + ok: true, + status: 200, + headers: makeHeaders({ "content-type": "text/html; charset=utf-8" }), + text: async () => body, + } as Response; +} + +describe("web_fetch Cloudflare Markdown for Agents", () => { + const priorFetch = global.fetch; + + beforeEach(() => { + lookupMock.mockResolvedValue([{ address: "93.184.216.34", family: 4 }]); + vi.spyOn(ssrf, "resolvePinnedHostname").mockImplementation((hostname) => + resolvePinnedHostname(hostname, lookupMock), + ); + }); + + afterEach(() => { + // @ts-expect-error restore + global.fetch = priorFetch; + lookupMock.mockReset(); + vi.restoreAllMocks(); + }); + + it("sends Accept header preferring text/markdown", async () => { + const fetchSpy = vi.fn().mockResolvedValue(markdownResponse("# Test Page\n\nHello world.")); + // @ts-expect-error mock fetch + global.fetch = fetchSpy; + + const { createWebFetchTool } = await import("./web-tools.js"); + const tool = createWebFetchTool({ + config: { + tools: { web: { fetch: { cacheTtlMinutes: 0, firecrawl: { enabled: false } } } }, + }, + }); + + await tool?.execute?.("call", { url: "https://example.com/page" }); + + expect(fetchSpy).toHaveBeenCalled(); + const [, init] = fetchSpy.mock.calls[0]; + expect(init.headers.Accept).toBe("text/markdown, text/html;q=0.9, */*;q=0.1"); + }); + + it("uses cf-markdown extractor for text/markdown responses", async () => { + const md = "# CF Markdown\n\nThis is server-rendered markdown."; + const fetchSpy = vi.fn().mockResolvedValue(markdownResponse(md)); + // @ts-expect-error mock fetch + global.fetch = fetchSpy; + + const { createWebFetchTool } = await import("./web-tools.js"); + const tool = createWebFetchTool({ + config: { + tools: { web: { fetch: { cacheTtlMinutes: 0, firecrawl: { enabled: false } } } }, + }, + }); + + const result = await tool?.execute?.("call", { url: "https://example.com/cf" }); + expect(result?.details).toMatchObject({ + status: 200, + extractor: "cf-markdown", + contentType: "text/markdown", + }); + // The body should contain the original markdown (wrapped with security markers) + expect(result?.details?.text).toContain("CF Markdown"); + expect(result?.details?.text).toContain("server-rendered markdown"); + }); + + it("falls back to readability for text/html responses", async () => { + const html = + "

HTML Page

Content here.

"; + const fetchSpy = vi.fn().mockResolvedValue(htmlResponse(html)); + // @ts-expect-error mock fetch + global.fetch = fetchSpy; + + const { createWebFetchTool } = await import("./web-tools.js"); + const tool = createWebFetchTool({ + config: { + tools: { web: { fetch: { cacheTtlMinutes: 0, firecrawl: { enabled: false } } } }, + }, + }); + + const result = await tool?.execute?.("call", { url: "https://example.com/html" }); + expect(result?.details?.extractor).not.toBe("cf-markdown"); + expect(result?.details?.contentType).toBe("text/html"); + }); + + it("logs x-markdown-tokens when header is present", async () => { + const logSpy = vi.spyOn(logger, "logDebug").mockImplementation(() => {}); + const fetchSpy = vi + .fn() + .mockResolvedValue(markdownResponse("# Tokens Test", { "x-markdown-tokens": "1500" })); + // @ts-expect-error mock fetch + global.fetch = fetchSpy; + + const { createWebFetchTool } = await import("./web-tools.js"); + const tool = createWebFetchTool({ + config: { + tools: { web: { fetch: { cacheTtlMinutes: 0, firecrawl: { enabled: false } } } }, + }, + }); + + await tool?.execute?.("call", { url: "https://example.com/tokens/private?token=secret" }); + + expect(logSpy).toHaveBeenCalledWith( + expect.stringContaining("x-markdown-tokens: 1500 (https://example.com/...)"), + ); + const tokenLogs = logSpy.mock.calls + .map(([message]) => String(message)) + .filter((message) => message.includes("x-markdown-tokens")); + expect(tokenLogs).toHaveLength(1); + expect(tokenLogs[0]).not.toContain("token=secret"); + expect(tokenLogs[0]).not.toContain("/tokens/private"); + }); + + it("converts markdown to text when extractMode is text", async () => { + const md = "# Heading\n\n**Bold text** and [a link](https://example.com)."; + const fetchSpy = vi.fn().mockResolvedValue(markdownResponse(md)); + // @ts-expect-error mock fetch + global.fetch = fetchSpy; + + const { createWebFetchTool } = await import("./web-tools.js"); + const tool = createWebFetchTool({ + config: { + tools: { web: { fetch: { cacheTtlMinutes: 0, firecrawl: { enabled: false } } } }, + }, + }); + + const result = await tool?.execute?.("call", { + url: "https://example.com/text-mode", + extractMode: "text", + }); + expect(result?.details).toMatchObject({ + extractor: "cf-markdown", + extractMode: "text", + }); + // Text mode strips header markers (#) and link syntax + expect(result?.details?.text).not.toContain("# Heading"); + expect(result?.details?.text).toContain("Heading"); + expect(result?.details?.text).not.toContain("[a link](https://example.com)"); + }); + + it("does not log x-markdown-tokens when header is absent", async () => { + const logSpy = vi.spyOn(logger, "logDebug").mockImplementation(() => {}); + const fetchSpy = vi.fn().mockResolvedValue(markdownResponse("# No tokens")); + // @ts-expect-error mock fetch + global.fetch = fetchSpy; + + const { createWebFetchTool } = await import("./web-tools.js"); + const tool = createWebFetchTool({ + config: { + tools: { web: { fetch: { cacheTtlMinutes: 0, firecrawl: { enabled: false } } } }, + }, + }); + + await tool?.execute?.("call", { url: "https://example.com/no-tokens" }); + + const tokenLogs = logSpy.mock.calls.filter( + (args) => typeof args[0] === "string" && args[0].includes("x-markdown-tokens"), + ); + expect(tokenLogs).toHaveLength(0); + }); +}); diff --git a/src/agents/tools/web-fetch.ts b/src/agents/tools/web-fetch.ts index 3b24e409e16..97bb5406863 100644 --- a/src/agents/tools/web-fetch.ts +++ b/src/agents/tools/web-fetch.ts @@ -3,6 +3,7 @@ import type { OpenClawConfig } from "../../config/config.js"; import type { AnyAgentTool } from "./common.js"; import { fetchWithSsrFGuard } from "../../infra/net/fetch-guard.js"; import { SsrFBlockedError } from "../../infra/net/ssrf.js"; +import { logDebug } from "../../logger.js"; import { wrapExternalContent, wrapWebContent } from "../../security/external-content.js"; import { normalizeSecretInput } from "../../utils/normalize-secret-input.js"; import { stringEnum } from "../schema/typebox.js"; @@ -212,6 +213,15 @@ function formatWebFetchErrorDetail(params: { return truncated.text; } +function redactUrlForDebugLog(rawUrl: string): string { + try { + const parsed = new URL(rawUrl); + return parsed.pathname && parsed.pathname !== "/" ? `${parsed.origin}/...` : parsed.origin; + } catch { + return "[invalid-url]"; + } +} + const WEB_FETCH_WRAPPER_WITH_WARNING_OVERHEAD = wrapWebContent("", "web_fetch").length; const WEB_FETCH_WRAPPER_NO_WARNING_OVERHEAD = wrapExternalContent("", { source: "web_fetch", @@ -409,7 +419,7 @@ async function runWebFetch(params: { timeoutMs: params.timeoutSeconds * 1000, init: { headers: { - Accept: "*/*", + Accept: "text/markdown, text/html;q=0.9, */*;q=0.1", "User-Agent": params.userAgent, "Accept-Language": "en-US,en;q=0.9", }, @@ -418,6 +428,14 @@ async function runWebFetch(params: { res = result.response; finalUrl = result.finalUrl; release = result.release; + + // Cloudflare Markdown for Agents — log token budget hint when present + const markdownTokens = res.headers.get("x-markdown-tokens"); + if (markdownTokens) { + logDebug( + `[web-fetch] x-markdown-tokens: ${markdownTokens} (${redactUrlForDebugLog(finalUrl)})`, + ); + } } catch (error) { if (error instanceof SsrFBlockedError) { throw error; @@ -522,7 +540,13 @@ async function runWebFetch(params: { let title: string | undefined; let extractor = "raw"; let text = body; - if (contentType.includes("text/html")) { + if (contentType.includes("text/markdown")) { + // Cloudflare Markdown for Agents: server returned pre-rendered markdown + extractor = "cf-markdown"; + if (params.extractMode === "text") { + text = markdownToText(body); + } + } else if (contentType.includes("text/html")) { if (params.readabilityEnabled) { const readable = await extractReadableContent({ html: body,