openclaw/src/agents/tools/pdf-native-providers.ts
Tyler Yust d0ac1b0195
feat: add PDF analysis tool with native provider support (#31319)
* feat: add PDF analysis tool with native provider support

New `pdf` tool for analyzing PDF documents with model-powered analysis.

Architecture:
- Native PDF path: sends raw PDF bytes directly to providers that support
  inline document input (Anthropic via DocumentBlockParam, Google Gemini
  via inlineData with application/pdf MIME type)
- Extraction fallback: for providers without native PDF support, extracts
  text via pdfjs-dist and rasterizes pages to images via @napi-rs/canvas,
  then sends through the standard vision/text completion path

Key features:
- Single PDF (`pdf` param) or multiple PDFs (`pdfs` array, up to 10)
- Page range selection (`pages` param, e.g. "1-5", "1,3,7-9")
- Model override (`model` param) and file size limits (`maxBytesMb`)
- Auto-detects provider capability and falls back gracefully
- Same security patterns as image tool (SSRF guards, sandbox support,
  local path roots, workspace-only policy)

Config (agents.defaults):
- pdfModel: primary/fallbacks (defaults to imageModel, then session model)
- pdfMaxBytesMb: max PDF file size (default: 10)
- pdfMaxPages: max pages to process (default: 20)

Model catalog:
- Extended ModelInputType to include "document" alongside "text"/"image"
- Added modelSupportsDocument() capability check

Files:
- src/agents/tools/pdf-tool.ts - main tool factory
- src/agents/tools/pdf-tool.helpers.ts - helpers (page range, config, etc.)
- src/agents/tools/pdf-native-providers.ts - direct API calls for Anthropic/Google
- src/agents/tools/pdf-tool.test.ts - 43 tests covering all paths
- Modified: model-catalog.ts, openclaw-tools.ts, config schema/types/labels/help

* fix: prepare pdf tool for merge (#31319) (thanks @tyler6204)
2026-03-01 22:39:12 -08:00

180 lines
4.9 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Direct SDK/HTTP calls for providers that support native PDF document input.
* This bypasses pi-ai's content type system which does not have a "document" type.
*/
import { isRecord } from "../../utils.js";
import { normalizeSecretInput } from "../../utils/normalize-secret-input.js";
/**
 * One PDF document handed to a native-provider call.
 */
type PdfInput = {
  /** PDF bytes as a base64 string; forwarded verbatim as the provider's inline document data. */
  base64: string;
  /** Optional file name. Not read by the provider calls visible in this module. */
  filename?: string;
};
// ---------------------------------------------------------------------------
// Anthropic native PDF via Messages API
// ---------------------------------------------------------------------------
/** Inline base64 PDF payload carried by a `document` content block. */
type AnthropicPdfSource = {
  type: "base64";
  media_type: "application/pdf";
  data: string;
};

/** Request content block that attaches one PDF document. */
type AnthropicDocBlock = {
  type: "document";
  source: AnthropicPdfSource;
};

/** Request content block that carries plain text. */
type AnthropicTextBlock = {
  type: "text";
  text: string;
};

/** Any block that may be placed in the user message's `content` array. */
type AnthropicContentBlock = AnthropicTextBlock | AnthropicDocBlock;

/** Loose view of the response `content` array: a `type` tag plus optional `text`. */
type AnthropicResponseContent = { type: string; text?: string }[];
/**
 * Sends one or more PDFs plus a text prompt to the Anthropic Messages API
 * (`POST {baseUrl}/v1/messages`) and returns the text of the reply.
 *
 * The user message contains one base64 `document` block per PDF, in input
 * order, followed by a single `text` block holding the prompt.
 *
 * @param params.apiKey - API key; passed through `normalizeSecretInput` before use.
 * @param params.modelId - Sent as the request's `model`.
 * @param params.prompt - Text placed after the document blocks.
 * @param params.pdfs - PDFs to attach; each `base64` value is sent unchanged.
 * @param params.maxTokens - Sent as `max_tokens`; defaults to 4096.
 * @param params.baseUrl - API origin; defaults to `https://api.anthropic.com`.
 *   Trailing slashes are stripped.
 * @returns The `text` of every response block tagged `"text"`, concatenated and trimmed.
 * @throws Error if the normalized key is empty, the HTTP status is not OK, the
 *   body does not parse to a value accepted by `isRecord`, `content` is not an
 *   array, or the combined text is blank.
 */
export async function anthropicAnalyzePdf(params: {
  apiKey: string;
  modelId: string;
  prompt: string;
  pdfs: PdfInput[];
  maxTokens?: number;
  baseUrl?: string;
}): Promise<string> {
  const apiKey = normalizeSecretInput(params.apiKey);
  if (!apiKey) {
    throw new Error("Anthropic PDF: apiKey required");
  }

  // Documents first, prompt last.
  const content: AnthropicContentBlock[] = [
    ...params.pdfs.map(
      (pdf): AnthropicDocBlock => ({
        type: "document",
        source: {
          type: "base64",
          media_type: "application/pdf",
          data: pdf.base64,
        },
      }),
    ),
    { type: "text", text: params.prompt },
  ];

  const baseUrl = (params.baseUrl ?? "https://api.anthropic.com").replace(/\/+$/, "");
  const res = await fetch(`${baseUrl}/v1/messages`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "x-api-key": apiKey,
      "anthropic-version": "2023-06-01",
      // NOTE(review): presumably opts in to PDF document input — confirm it is still required.
      "anthropic-beta": "pdfs-2024-09-25",
    },
    body: JSON.stringify({
      model: params.modelId,
      max_tokens: params.maxTokens ?? 4096,
      messages: [{ role: "user", content }],
    }),
  });

  if (!res.ok) {
    // Best effort: include at most 400 characters of the error body for diagnosis.
    const body = await res.text().catch(() => "");
    throw new Error(
      `Anthropic PDF request failed (${res.status} ${res.statusText})${body ? `: ${body.slice(0, 400)}` : ""}`,
    );
  }

  const json = (await res.json().catch(() => null)) as unknown;
  if (!isRecord(json)) {
    throw new Error("Anthropic PDF response was not JSON.");
  }
  const responseContent = json.content as AnthropicResponseContent | undefined;
  if (!Array.isArray(responseContent)) {
    throw new Error("Anthropic PDF response missing content array.");
  }

  // The array comes from the network, so its elements are not trusted to match
  // the declared type: skip anything that is not an object (e.g. null) instead
  // of crashing on a property access.
  const text = responseContent
    .filter(
      (block): block is { type: string; text: string } =>
        isRecord(block) && block.type === "text" && typeof block.text === "string",
    )
    .map((block) => block.text)
    .join("")
    .trim();
  if (!text) {
    throw new Error("Anthropic PDF returned no text.");
  }
  return text;
}
// ---------------------------------------------------------------------------
// Google Gemini native PDF via generateContent API
// ---------------------------------------------------------------------------
/** Request part that embeds a file inline as base64 together with its MIME type. */
type GeminiInlineDataPart = {
  inline_data: {
    mime_type: string;
    data: string;
  };
};

/** Request part that carries plain text. */
type GeminiTextPart = {
  text: string;
};

/** Any part placed in the `parts` array of the request's user content. */
type GeminiPart = GeminiInlineDataPart | GeminiTextPart;

/** Loose view of one response candidate: optional content with optional text parts. */
type GeminiCandidate = {
  content?: {
    parts?: { text?: string }[];
  };
};
/**
 * Sends one or more PDFs plus a text prompt to the Gemini `generateContent`
 * endpoint (`POST {baseUrl}/v1beta/models/{modelId}:generateContent`) and
 * returns the text of the first candidate.
 *
 * The user content contains one `inline_data` part per PDF (MIME type
 * `application/pdf`), in input order, followed by a single text part holding
 * the prompt.
 *
 * @param params.apiKey - API key; passed through `normalizeSecretInput` before use.
 * @param params.modelId - Model identifier; URL-encoded into the request path.
 * @param params.prompt - Text placed after the PDF parts.
 * @param params.pdfs - PDFs to attach; each `base64` value is sent unchanged.
 * @param params.baseUrl - API origin; defaults to
 *   `https://generativelanguage.googleapis.com`. Trailing slashes are stripped.
 * @returns The `text` of every string-valued part of the first candidate,
 *   concatenated and trimmed.
 * @throws Error if the normalized key is empty, the HTTP status is not OK, the
 *   body does not parse to a value accepted by `isRecord`, `candidates` is
 *   missing or empty, or the combined text is blank.
 */
export async function geminiAnalyzePdf(params: {
  apiKey: string;
  modelId: string;
  prompt: string;
  pdfs: PdfInput[];
  baseUrl?: string;
}): Promise<string> {
  const apiKey = normalizeSecretInput(params.apiKey);
  if (!apiKey) {
    throw new Error("Gemini PDF: apiKey required");
  }

  // PDFs first, prompt last.
  const parts: GeminiPart[] = [
    ...params.pdfs.map(
      (pdf): GeminiPart => ({
        inline_data: {
          mime_type: "application/pdf",
          data: pdf.base64,
        },
      }),
    ),
    { text: params.prompt },
  ];

  const baseUrl = (params.baseUrl ?? "https://generativelanguage.googleapis.com").replace(
    /\/+$/,
    "",
  );
  const url = `${baseUrl}/v1beta/models/${encodeURIComponent(params.modelId)}:generateContent`;
  const res = await fetch(url, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      // Send the key as a header rather than a `?key=` query parameter so the
      // secret does not end up in URLs recorded by proxies or access logs.
      "x-goog-api-key": apiKey,
    },
    body: JSON.stringify({
      contents: [{ role: "user", parts }],
    }),
  });

  if (!res.ok) {
    // Best effort: include at most 400 characters of the error body for diagnosis.
    const body = await res.text().catch(() => "");
    throw new Error(
      `Gemini PDF request failed (${res.status} ${res.statusText})${body ? `: ${body.slice(0, 400)}` : ""}`,
    );
  }

  const json = (await res.json().catch(() => null)) as unknown;
  if (!isRecord(json)) {
    throw new Error("Gemini PDF response was not JSON.");
  }
  const candidates = json.candidates as GeminiCandidate[] | undefined;
  if (!Array.isArray(candidates) || candidates.length === 0) {
    throw new Error("Gemini PDF returned no candidates.");
  }

  // The payload comes from the network, so validate each level before reading
  // it: a null candidate, a non-array `parts`, or a null part is treated as
  // "no text" instead of crashing on a property access.
  const firstCandidate = candidates[0];
  const rawParts =
    isRecord(firstCandidate) && isRecord(firstCandidate.content)
      ? firstCandidate.content.parts
      : undefined;
  const textParts = Array.isArray(rawParts)
    ? rawParts.filter(
        (part): part is { text: string } => isRecord(part) && typeof part.text === "string",
      )
    : [];
  const text = textParts
    .map((part) => part.text)
    .join("")
    .trim();
  if (!text) {
    throw new Error("Gemini PDF returned no text.");
  }
  return text;
}