// Source listing: TypeScript, 497 lines, 16 KiB.
import { Type } from "@sinclair/typebox";
|
|
|
|
import type { ClawdbotConfig } from "../../config/config.js";
|
|
import { stringEnum } from "../schema/typebox.js";
|
|
import type { AnyAgentTool } from "./common.js";
|
|
import { jsonResult, readNumberParam, readStringParam } from "./common.js";
|
|
import {
|
|
CacheEntry,
|
|
DEFAULT_CACHE_TTL_MINUTES,
|
|
DEFAULT_TIMEOUT_SECONDS,
|
|
normalizeCacheKey,
|
|
readCache,
|
|
readResponseText,
|
|
resolveCacheTtlMs,
|
|
resolveTimeoutSeconds,
|
|
withTimeout,
|
|
writeCache,
|
|
} from "./web-shared.js";
|
|
import {
|
|
extractReadableContent,
|
|
markdownToText,
|
|
truncateText,
|
|
type ExtractMode,
|
|
} from "./web-fetch-utils.js";
|
|
|
|
export { extractReadableContent } from "./web-fetch-utils.js";
|
|
|
|
const EXTRACT_MODES = ["markdown", "text"] as const;
|
|
|
|
const DEFAULT_FETCH_MAX_CHARS = 50_000;
|
|
const DEFAULT_FIRECRAWL_BASE_URL = "https://api.firecrawl.dev";
|
|
const DEFAULT_FIRECRAWL_MAX_AGE_MS = 172_800_000;
|
|
const DEFAULT_FETCH_USER_AGENT =
|
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36";
|
|
|
|
const FETCH_CACHE = new Map<string, CacheEntry<Record<string, unknown>>>();
|
|
|
|
const WebFetchSchema = Type.Object({
|
|
url: Type.String({ description: "HTTP or HTTPS URL to fetch." }),
|
|
extractMode: Type.Optional(
|
|
stringEnum(EXTRACT_MODES, {
|
|
description: 'Extraction mode ("markdown" or "text").',
|
|
default: "markdown",
|
|
}),
|
|
),
|
|
maxChars: Type.Optional(
|
|
Type.Number({
|
|
description: "Maximum characters to return (truncates when exceeded).",
|
|
minimum: 100,
|
|
}),
|
|
),
|
|
});
|
|
|
|
type WebFetchConfig = NonNullable<ClawdbotConfig["tools"]>["web"] extends infer Web
|
|
? Web extends { fetch?: infer Fetch }
|
|
? Fetch
|
|
: undefined
|
|
: undefined;
|
|
|
|
type FirecrawlFetchConfig =
|
|
| {
|
|
enabled?: boolean;
|
|
apiKey?: string;
|
|
baseUrl?: string;
|
|
onlyMainContent?: boolean;
|
|
maxAgeMs?: number;
|
|
timeoutSeconds?: number;
|
|
}
|
|
| undefined;
|
|
|
|
function resolveFetchConfig(cfg?: ClawdbotConfig): WebFetchConfig {
|
|
const fetch = cfg?.tools?.web?.fetch;
|
|
if (!fetch || typeof fetch !== "object") return undefined;
|
|
return fetch as WebFetchConfig;
|
|
}
|
|
|
|
function resolveFetchEnabled(params: { fetch?: WebFetchConfig; sandboxed?: boolean }): boolean {
|
|
if (typeof params.fetch?.enabled === "boolean") return params.fetch.enabled;
|
|
return true;
|
|
}
|
|
|
|
function resolveFetchReadabilityEnabled(fetch?: WebFetchConfig): boolean {
|
|
if (typeof fetch?.readability === "boolean") return fetch.readability;
|
|
return true;
|
|
}
|
|
|
|
function resolveFirecrawlConfig(fetch?: WebFetchConfig): FirecrawlFetchConfig {
|
|
if (!fetch || typeof fetch !== "object") return undefined;
|
|
const firecrawl = "firecrawl" in fetch ? fetch.firecrawl : undefined;
|
|
if (!firecrawl || typeof firecrawl !== "object") return undefined;
|
|
return firecrawl as FirecrawlFetchConfig;
|
|
}
|
|
|
|
function resolveFirecrawlApiKey(firecrawl?: FirecrawlFetchConfig): string | undefined {
|
|
const fromConfig =
|
|
firecrawl && "apiKey" in firecrawl && typeof firecrawl.apiKey === "string"
|
|
? firecrawl.apiKey.trim()
|
|
: "";
|
|
const fromEnv = (process.env.FIRECRAWL_API_KEY ?? "").trim();
|
|
return fromConfig || fromEnv || undefined;
|
|
}
|
|
|
|
function resolveFirecrawlEnabled(params: {
|
|
firecrawl?: FirecrawlFetchConfig;
|
|
apiKey?: string;
|
|
}): boolean {
|
|
if (typeof params.firecrawl?.enabled === "boolean") return params.firecrawl.enabled;
|
|
return Boolean(params.apiKey);
|
|
}
|
|
|
|
function resolveFirecrawlBaseUrl(firecrawl?: FirecrawlFetchConfig): string {
|
|
const raw =
|
|
firecrawl && "baseUrl" in firecrawl && typeof firecrawl.baseUrl === "string"
|
|
? firecrawl.baseUrl.trim()
|
|
: "";
|
|
return raw || DEFAULT_FIRECRAWL_BASE_URL;
|
|
}
|
|
|
|
function resolveFirecrawlOnlyMainContent(firecrawl?: FirecrawlFetchConfig): boolean {
|
|
if (typeof firecrawl?.onlyMainContent === "boolean") return firecrawl.onlyMainContent;
|
|
return true;
|
|
}
|
|
|
|
function resolveFirecrawlMaxAgeMs(firecrawl?: FirecrawlFetchConfig): number | undefined {
|
|
const raw =
|
|
firecrawl && "maxAgeMs" in firecrawl && typeof firecrawl.maxAgeMs === "number"
|
|
? firecrawl.maxAgeMs
|
|
: undefined;
|
|
if (typeof raw !== "number" || !Number.isFinite(raw)) return undefined;
|
|
const parsed = Math.max(0, Math.floor(raw));
|
|
return parsed > 0 ? parsed : undefined;
|
|
}
|
|
|
|
function resolveFirecrawlMaxAgeMsOrDefault(firecrawl?: FirecrawlFetchConfig): number {
|
|
const resolved = resolveFirecrawlMaxAgeMs(firecrawl);
|
|
if (typeof resolved === "number") return resolved;
|
|
return DEFAULT_FIRECRAWL_MAX_AGE_MS;
|
|
}
|
|
|
|
function resolveMaxChars(value: unknown, fallback: number): number {
|
|
const parsed = typeof value === "number" && Number.isFinite(value) ? value : fallback;
|
|
return Math.max(100, Math.floor(parsed));
|
|
}
|
|
|
|
export async function fetchFirecrawlContent(params: {
|
|
url: string;
|
|
extractMode: ExtractMode;
|
|
apiKey: string;
|
|
baseUrl: string;
|
|
onlyMainContent: boolean;
|
|
maxAgeMs: number;
|
|
proxy: "auto" | "basic" | "stealth";
|
|
storeInCache: boolean;
|
|
timeoutSeconds: number;
|
|
}): Promise<{
|
|
text: string;
|
|
title?: string;
|
|
finalUrl?: string;
|
|
status?: number;
|
|
warning?: string;
|
|
}> {
|
|
const endpoint = resolveFirecrawlEndpoint(params.baseUrl);
|
|
const body: Record<string, unknown> = {
|
|
url: params.url,
|
|
formats: ["markdown"],
|
|
onlyMainContent: params.onlyMainContent,
|
|
timeout: params.timeoutSeconds * 1000,
|
|
maxAge: params.maxAgeMs,
|
|
proxy: params.proxy,
|
|
storeInCache: params.storeInCache,
|
|
};
|
|
|
|
const res = await fetch(endpoint, {
|
|
method: "POST",
|
|
headers: {
|
|
Authorization: `Bearer ${params.apiKey}`,
|
|
"Content-Type": "application/json",
|
|
},
|
|
body: JSON.stringify(body),
|
|
signal: withTimeout(undefined, params.timeoutSeconds * 1000),
|
|
});
|
|
|
|
const payload = (await res.json()) as {
|
|
success?: boolean;
|
|
data?: {
|
|
markdown?: string;
|
|
content?: string;
|
|
metadata?: {
|
|
title?: string;
|
|
sourceURL?: string;
|
|
statusCode?: number;
|
|
};
|
|
};
|
|
warning?: string;
|
|
error?: string;
|
|
};
|
|
|
|
if (!res.ok || payload?.success === false) {
|
|
const detail = payload?.error || res.statusText;
|
|
throw new Error(`Firecrawl fetch failed (${res.status}): ${detail}`.trim());
|
|
}
|
|
|
|
const data = payload?.data ?? {};
|
|
const rawText =
|
|
typeof data.markdown === "string"
|
|
? data.markdown
|
|
: typeof data.content === "string"
|
|
? data.content
|
|
: "";
|
|
const text = params.extractMode === "text" ? markdownToText(rawText) : rawText;
|
|
return {
|
|
text,
|
|
title: data.metadata?.title,
|
|
finalUrl: data.metadata?.sourceURL,
|
|
status: data.metadata?.statusCode,
|
|
warning: payload?.warning,
|
|
};
|
|
}
|
|
|
|
async function runWebFetch(params: {
|
|
url: string;
|
|
extractMode: ExtractMode;
|
|
maxChars: number;
|
|
timeoutSeconds: number;
|
|
cacheTtlMs: number;
|
|
userAgent: string;
|
|
readabilityEnabled: boolean;
|
|
firecrawlEnabled: boolean;
|
|
firecrawlApiKey?: string;
|
|
firecrawlBaseUrl: string;
|
|
firecrawlOnlyMainContent: boolean;
|
|
firecrawlMaxAgeMs: number;
|
|
firecrawlProxy: "auto" | "basic" | "stealth";
|
|
firecrawlStoreInCache: boolean;
|
|
firecrawlTimeoutSeconds: number;
|
|
}): Promise<Record<string, unknown>> {
|
|
const cacheKey = normalizeCacheKey(
|
|
`fetch:${params.url}:${params.extractMode}:${params.maxChars}`,
|
|
);
|
|
const cached = readCache(FETCH_CACHE, cacheKey);
|
|
if (cached) return { ...cached.value, cached: true };
|
|
|
|
let parsedUrl: URL;
|
|
try {
|
|
parsedUrl = new URL(params.url);
|
|
} catch {
|
|
throw new Error("Invalid URL: must be http or https");
|
|
}
|
|
if (!["http:", "https:"].includes(parsedUrl.protocol)) {
|
|
throw new Error("Invalid URL: must be http or https");
|
|
}
|
|
|
|
const start = Date.now();
|
|
let res: Response;
|
|
try {
|
|
res = await fetch(parsedUrl.toString(), {
|
|
method: "GET",
|
|
headers: {
|
|
Accept: "*/*",
|
|
"User-Agent": params.userAgent,
|
|
"Accept-Language": "en-US,en;q=0.9",
|
|
},
|
|
signal: withTimeout(undefined, params.timeoutSeconds * 1000),
|
|
});
|
|
} catch (error) {
|
|
if (params.firecrawlEnabled && params.firecrawlApiKey) {
|
|
const firecrawl = await fetchFirecrawlContent({
|
|
url: params.url,
|
|
extractMode: params.extractMode,
|
|
apiKey: params.firecrawlApiKey,
|
|
baseUrl: params.firecrawlBaseUrl,
|
|
onlyMainContent: params.firecrawlOnlyMainContent,
|
|
maxAgeMs: params.firecrawlMaxAgeMs,
|
|
proxy: params.firecrawlProxy,
|
|
storeInCache: params.firecrawlStoreInCache,
|
|
timeoutSeconds: params.firecrawlTimeoutSeconds,
|
|
});
|
|
const truncated = truncateText(firecrawl.text, params.maxChars);
|
|
const payload = {
|
|
url: params.url,
|
|
finalUrl: firecrawl.finalUrl || params.url,
|
|
status: firecrawl.status ?? 200,
|
|
contentType: "text/markdown",
|
|
title: firecrawl.title,
|
|
extractMode: params.extractMode,
|
|
extractor: "firecrawl",
|
|
truncated: truncated.truncated,
|
|
length: truncated.text.length,
|
|
fetchedAt: new Date().toISOString(),
|
|
tookMs: Date.now() - start,
|
|
text: truncated.text,
|
|
warning: firecrawl.warning,
|
|
};
|
|
writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs);
|
|
return payload;
|
|
}
|
|
throw error;
|
|
}
|
|
|
|
if (!res.ok) {
|
|
if (params.firecrawlEnabled && params.firecrawlApiKey) {
|
|
const firecrawl = await fetchFirecrawlContent({
|
|
url: params.url,
|
|
extractMode: params.extractMode,
|
|
apiKey: params.firecrawlApiKey,
|
|
baseUrl: params.firecrawlBaseUrl,
|
|
onlyMainContent: params.firecrawlOnlyMainContent,
|
|
maxAgeMs: params.firecrawlMaxAgeMs,
|
|
proxy: params.firecrawlProxy,
|
|
storeInCache: params.firecrawlStoreInCache,
|
|
timeoutSeconds: params.firecrawlTimeoutSeconds,
|
|
});
|
|
const truncated = truncateText(firecrawl.text, params.maxChars);
|
|
const payload = {
|
|
url: params.url,
|
|
finalUrl: firecrawl.finalUrl || params.url,
|
|
status: firecrawl.status ?? res.status,
|
|
contentType: "text/markdown",
|
|
title: firecrawl.title,
|
|
extractMode: params.extractMode,
|
|
extractor: "firecrawl",
|
|
truncated: truncated.truncated,
|
|
length: truncated.text.length,
|
|
fetchedAt: new Date().toISOString(),
|
|
tookMs: Date.now() - start,
|
|
text: truncated.text,
|
|
warning: firecrawl.warning,
|
|
};
|
|
writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs);
|
|
return payload;
|
|
}
|
|
const detail = await readResponseText(res);
|
|
throw new Error(`Web fetch failed (${res.status}): ${detail || res.statusText}`);
|
|
}
|
|
|
|
const contentType = res.headers.get("content-type") ?? "application/octet-stream";
|
|
const body = await readResponseText(res);
|
|
|
|
let title: string | undefined;
|
|
let extractor = "raw";
|
|
let text = body;
|
|
if (contentType.includes("text/html")) {
|
|
if (params.readabilityEnabled) {
|
|
const readable = await extractReadableContent({
|
|
html: body,
|
|
url: res.url || params.url,
|
|
extractMode: params.extractMode,
|
|
});
|
|
if (readable?.text) {
|
|
text = readable.text;
|
|
title = readable.title;
|
|
extractor = "readability";
|
|
} else {
|
|
const firecrawl = await tryFirecrawlFallback(params);
|
|
if (firecrawl) {
|
|
text = firecrawl.text;
|
|
title = firecrawl.title;
|
|
extractor = "firecrawl";
|
|
} else {
|
|
throw new Error(
|
|
"Web fetch extraction failed: Readability and Firecrawl returned no content.",
|
|
);
|
|
}
|
|
}
|
|
} else {
|
|
throw new Error(
|
|
"Web fetch extraction failed: Readability disabled and Firecrawl unavailable.",
|
|
);
|
|
}
|
|
} else if (contentType.includes("application/json")) {
|
|
try {
|
|
text = JSON.stringify(JSON.parse(body), null, 2);
|
|
extractor = "json";
|
|
} catch {
|
|
text = body;
|
|
extractor = "raw";
|
|
}
|
|
}
|
|
|
|
const truncated = truncateText(text, params.maxChars);
|
|
const payload = {
|
|
url: params.url,
|
|
finalUrl: res.url || params.url,
|
|
status: res.status,
|
|
contentType,
|
|
title,
|
|
extractMode: params.extractMode,
|
|
extractor,
|
|
truncated: truncated.truncated,
|
|
length: truncated.text.length,
|
|
fetchedAt: new Date().toISOString(),
|
|
tookMs: Date.now() - start,
|
|
text: truncated.text,
|
|
};
|
|
writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs);
|
|
return payload;
|
|
}
|
|
|
|
async function tryFirecrawlFallback(params: {
|
|
url: string;
|
|
extractMode: ExtractMode;
|
|
firecrawlEnabled: boolean;
|
|
firecrawlApiKey?: string;
|
|
firecrawlBaseUrl: string;
|
|
firecrawlOnlyMainContent: boolean;
|
|
firecrawlMaxAgeMs: number;
|
|
firecrawlProxy: "auto" | "basic" | "stealth";
|
|
firecrawlStoreInCache: boolean;
|
|
firecrawlTimeoutSeconds: number;
|
|
}): Promise<{ text: string; title?: string } | null> {
|
|
if (!params.firecrawlEnabled || !params.firecrawlApiKey) return null;
|
|
try {
|
|
const firecrawl = await fetchFirecrawlContent({
|
|
url: params.url,
|
|
extractMode: params.extractMode,
|
|
apiKey: params.firecrawlApiKey,
|
|
baseUrl: params.firecrawlBaseUrl,
|
|
onlyMainContent: params.firecrawlOnlyMainContent,
|
|
maxAgeMs: params.firecrawlMaxAgeMs,
|
|
proxy: params.firecrawlProxy,
|
|
storeInCache: params.firecrawlStoreInCache,
|
|
timeoutSeconds: params.firecrawlTimeoutSeconds,
|
|
});
|
|
return { text: firecrawl.text, title: firecrawl.title };
|
|
} catch {
|
|
return null;
|
|
}
|
|
}
|
|
|
|
function resolveFirecrawlEndpoint(baseUrl: string): string {
|
|
const trimmed = baseUrl.trim();
|
|
if (!trimmed) return `${DEFAULT_FIRECRAWL_BASE_URL}/v2/scrape`;
|
|
try {
|
|
const url = new URL(trimmed);
|
|
if (url.pathname && url.pathname !== "/") {
|
|
return url.toString();
|
|
}
|
|
url.pathname = "/v2/scrape";
|
|
return url.toString();
|
|
} catch {
|
|
return `${DEFAULT_FIRECRAWL_BASE_URL}/v2/scrape`;
|
|
}
|
|
}
|
|
|
|
export function createWebFetchTool(options?: {
|
|
config?: ClawdbotConfig;
|
|
sandboxed?: boolean;
|
|
}): AnyAgentTool | null {
|
|
const fetch = resolveFetchConfig(options?.config);
|
|
if (!resolveFetchEnabled({ fetch, sandboxed: options?.sandboxed })) return null;
|
|
const readabilityEnabled = resolveFetchReadabilityEnabled(fetch);
|
|
const firecrawl = resolveFirecrawlConfig(fetch);
|
|
const firecrawlApiKey = resolveFirecrawlApiKey(firecrawl);
|
|
const firecrawlEnabled = resolveFirecrawlEnabled({ firecrawl, apiKey: firecrawlApiKey });
|
|
const firecrawlBaseUrl = resolveFirecrawlBaseUrl(firecrawl);
|
|
const firecrawlOnlyMainContent = resolveFirecrawlOnlyMainContent(firecrawl);
|
|
const firecrawlMaxAgeMs = resolveFirecrawlMaxAgeMsOrDefault(firecrawl);
|
|
const firecrawlTimeoutSeconds = resolveTimeoutSeconds(
|
|
firecrawl?.timeoutSeconds ?? fetch?.timeoutSeconds,
|
|
DEFAULT_TIMEOUT_SECONDS,
|
|
);
|
|
const userAgent =
|
|
(fetch && "userAgent" in fetch && typeof fetch.userAgent === "string" && fetch.userAgent) ||
|
|
DEFAULT_FETCH_USER_AGENT;
|
|
return {
|
|
label: "Web Fetch",
|
|
name: "web_fetch",
|
|
description:
|
|
"Fetch and extract readable content from a URL (HTML → markdown/text). Use for lightweight page access without browser automation.",
|
|
parameters: WebFetchSchema,
|
|
execute: async (_toolCallId, args) => {
|
|
const params = args as Record<string, unknown>;
|
|
const url = readStringParam(params, "url", { required: true });
|
|
const extractMode = readStringParam(params, "extractMode") === "text" ? "text" : "markdown";
|
|
const maxChars = readNumberParam(params, "maxChars", { integer: true });
|
|
const result = await runWebFetch({
|
|
url,
|
|
extractMode,
|
|
maxChars: resolveMaxChars(maxChars ?? fetch?.maxChars, DEFAULT_FETCH_MAX_CHARS),
|
|
timeoutSeconds: resolveTimeoutSeconds(fetch?.timeoutSeconds, DEFAULT_TIMEOUT_SECONDS),
|
|
cacheTtlMs: resolveCacheTtlMs(fetch?.cacheTtlMinutes, DEFAULT_CACHE_TTL_MINUTES),
|
|
userAgent,
|
|
readabilityEnabled,
|
|
firecrawlEnabled,
|
|
firecrawlApiKey,
|
|
firecrawlBaseUrl,
|
|
firecrawlOnlyMainContent,
|
|
firecrawlMaxAgeMs,
|
|
firecrawlProxy: "auto",
|
|
firecrawlStoreInCache: true,
|
|
firecrawlTimeoutSeconds,
|
|
});
|
|
return jsonResult(result);
|
|
},
|
|
};
|
|
}
|