openclaw/src/agents/session-tool-result-guard.test.ts

361 lines
10 KiB
TypeScript
Raw Normal View History

2026-01-12 17:48:08 +00:00
import type { AgentMessage } from "@mariozechner/pi-agent-core";
2026-01-12 17:28:39 +00:00
import { SessionManager } from "@mariozechner/pi-coding-agent";
2026-01-12 17:48:08 +00:00
import { describe, expect, it } from "vitest";
2026-01-12 17:28:39 +00:00
import { installSessionToolResultGuard } from "./session-tool-result-guard.js";
/** Message shape accepted by `SessionManager.appendMessage` (its first parameter). */
type AppendMessage = Parameters<SessionManager["appendMessage"]>[0];

/**
 * Casts a hand-built fixture to the `appendMessage` parameter type.
 *
 * This is an unchecked assertion: the tests below rely on it to feed the guard
 * deliberately incomplete or malformed messages.
 */
const asAppendMessage = (message: unknown) => message as AppendMessage;

/** Shared fixture: an assistant message carrying a single `read` tool call with id `call_1`. */
const toolCallMessage = asAppendMessage({
  role: "assistant",
  content: [{ type: "toolCall", id: "call_1", name: "read", arguments: {} }],
});
/**
 * Appends the shared `call_1` tool-call fixture to `sm`, followed by a
 * non-error `read` toolResult for that call whose content is a single text
 * block holding `text`.
 *
 * @param sm - Session manager to append to.
 * @param text - Text payload placed in the tool result.
 */
function appendToolResultText(sm: SessionManager, text: string) {
  sm.appendMessage(toolCallMessage);

  const resultMessage = asAppendMessage({
    role: "toolResult",
    toolCallId: "call_1",
    toolName: "read",
    content: [{ type: "text", text }],
    isError: false,
    timestamp: Date.now(),
  });
  sm.appendMessage(resultMessage);
}
/**
 * Collects, in entry order, the `message` payload of every session entry
 * whose `type` is `"message"`; all other entries are skipped.
 *
 * @param sm - Session manager whose entries are inspected.
 * @returns The messages found, possibly empty.
 */
function getPersistedMessages(sm: SessionManager): AgentMessage[] {
  const persisted: AgentMessage[] = [];
  for (const entry of sm.getEntries()) {
    if (entry.type === "message") {
      persisted.push((entry as { message: AgentMessage }).message);
    }
  }
  return persisted;
}
/**
 * Asserts that the roles of the persisted messages equal `expectedRoles`
 * (same order, same length) and hands the messages back for further checks.
 *
 * @param sm - Session manager whose persisted messages are checked.
 * @param expectedRoles - Expected role sequence.
 * @returns The persisted messages that were checked.
 */
function expectPersistedRoles(sm: SessionManager, expectedRoles: AgentMessage["role"][]) {
  const persisted = getPersistedMessages(sm);
  const actualRoles = persisted.map(({ role }) => role);
  expect(actualRoles).toEqual(expectedRoles);
  return persisted;
}
/**
 * Returns the text of the first `text` content block of the first
 * `toolResult` message in `messages`.
 *
 * Fails the current test with an assertion error when there is no toolResult
 * message or when that message has no text block, rather than crashing with a
 * property access on `undefined`.
 *
 * @param messages - Messages to search, typically the persisted session messages.
 * @returns The text of the located block.
 */
function getToolResultText(messages: AgentMessage[]): string {
  // The cast keeps `undefined` in the type so a missing message is handled by the assertion below.
  const toolResult = messages.find((m) => m.role === "toolResult") as
    | { content: Array<{ type: string; text: string }> }
    | undefined;
  expect(toolResult).toBeDefined();

  const textBlock = toolResult?.content.find((block) => block.type === "text");
  expect(textBlock).toBeDefined();

  // Safe after the assertion above; `expect` does not narrow the type for the compiler.
  return (textBlock as { text: string }).text;
}
/**
 * Behavioural tests for `installSessionToolResultGuard`, run against an
 * in-memory `SessionManager`. Each test installs the guard, appends messages
 * through `sm.appendMessage`, and inspects what ended up in the session entries.
 */
describe("installSessionToolResultGuard", () => {
  // A tool call with no result, followed by a plain assistant message, must end up
  // with an error toolResult for `call_1` persisted between the two.
  it("inserts synthetic toolResult before non-tool message when pending", () => {
    const sm = SessionManager.inMemory();
    installSessionToolResultGuard(sm);

    sm.appendMessage(toolCallMessage);
    sm.appendMessage(
      asAppendMessage({
        role: "assistant",
        content: [{ type: "text", text: "error" }],
        stopReason: "error",
      }),
    );

    const messages = expectPersistedRoles(sm, ["assistant", "toolResult", "assistant"]);
    const synthetic = messages[1] as {
      toolCallId?: string;
      isError?: boolean;
      content?: Array<{ type?: string; text?: string }>;
    };
    expect(synthetic.toolCallId).toBe("call_1");
    expect(synthetic.isError).toBe(true);
    expect(synthetic.content?.[0]?.text).toContain("missing tool result");
  });

  // An explicit flush persists a toolResult for the outstanding call.
  it("flushes pending tool calls when asked explicitly", () => {
    const sm = SessionManager.inMemory();
    const guard = installSessionToolResultGuard(sm);

    sm.appendMessage(toolCallMessage);
    guard.flushPendingToolResults();

    expectPersistedRoles(sm, ["assistant", "toolResult"]);
  });

  // A real result for the pending call must not be duplicated by a synthetic one.
  it("does not add synthetic toolResult when a matching one exists", () => {
    const sm = SessionManager.inMemory();
    installSessionToolResultGuard(sm);

    sm.appendMessage(toolCallMessage);
    sm.appendMessage(
      asAppendMessage({
        role: "toolResult",
        toolCallId: "call_1",
        content: [{ type: "text", text: "ok" }],
        isError: false,
      }),
    );

    expectPersistedRoles(sm, ["assistant", "toolResult"]);
  });

  // Two calls (one `toolCall`, one `toolUse` block), only `call_a` answered (via `toolUseId`):
  // the next assistant message must be preceded by a toolResult for `call_b`.
  it("preserves ordering with multiple tool calls and partial results", () => {
    const sm = SessionManager.inMemory();
    const guard = installSessionToolResultGuard(sm);

    sm.appendMessage(
      asAppendMessage({
        role: "assistant",
        content: [
          { type: "toolCall", id: "call_a", name: "one", arguments: {} },
          { type: "toolUse", id: "call_b", name: "two", arguments: {} },
        ],
      }),
    );
    sm.appendMessage(
      asAppendMessage({
        role: "toolResult",
        toolUseId: "call_a",
        content: [{ type: "text", text: "a" }],
        isError: false,
      }),
    );
    sm.appendMessage(
      asAppendMessage({
        role: "assistant",
        content: [{ type: "text", text: "after tools" }],
      }),
    );

    const messages = expectPersistedRoles(sm, [
      "assistant", // tool calls
      "toolResult", // call_a real
      "toolResult", // synthetic for call_b
      "assistant", // text
    ]);
    expect((messages[2] as { toolCallId?: string }).toolCallId).toBe("call_b");
    expect(guard.getPendingIds()).toEqual([]);
  });

  // After a non-tool assistant message follows an unanswered call, nothing stays pending.
  it("flushes pending on guard when no toolResult arrived", () => {
    const sm = SessionManager.inMemory();
    const guard = installSessionToolResultGuard(sm);

    sm.appendMessage(toolCallMessage);
    sm.appendMessage(
      asAppendMessage({
        role: "assistant",
        content: [{ type: "text", text: "hard error" }],
        stopReason: "error",
      }),
    );

    expect(guard.getPendingIds()).toEqual([]);
  });

  // A result that identifies its call through `toolUseId` counts as the matching result.
  it("handles toolUseId on toolResult", () => {
    const sm = SessionManager.inMemory();
    installSessionToolResultGuard(sm);

    sm.appendMessage(
      asAppendMessage({
        role: "assistant",
        content: [{ type: "toolUse", id: "use_1", name: "f", arguments: {} }],
      }),
    );
    sm.appendMessage(
      asAppendMessage({
        role: "toolResult",
        toolUseId: "use_1",
        content: [{ type: "text", text: "ok" }],
      }),
    );

    expectPersistedRoles(sm, ["assistant", "toolResult"]);
  });

  // The tool call below has no `arguments` field; nothing may be persisted.
  it("drops malformed tool calls missing input before persistence", () => {
    const sm = SessionManager.inMemory();
    installSessionToolResultGuard(sm);

    sm.appendMessage(
      asAppendMessage({
        role: "assistant",
        content: [{ type: "toolCall", id: "call_1", name: "read" }],
      }),
    );

    const messages = getPersistedMessages(sm);
    expect(messages).toHaveLength(0);
  });

  // The tool name below contains leaked control tokens and JSON fragments; nothing may be persisted.
  it("drops malformed tool calls with invalid name tokens before persistence", () => {
    const sm = SessionManager.inMemory();
    installSessionToolResultGuard(sm);

    sm.appendMessage(
      asAppendMessage({
        role: "assistant",
        content: [
          {
            type: "toolCall",
            id: "call_bad_name",
            name: 'toolu_01mvznfebfuu <|tool_call_argument_begin|> {"command"',
            arguments: {},
          },
        ],
      }),
    );

    expect(getPersistedMessages(sm)).toHaveLength(0);
  });

  // With `allowedToolNames: ["read"]`, a call to `write` must not be persisted.
  it("drops tool calls not present in allowedToolNames", () => {
    const sm = SessionManager.inMemory();
    installSessionToolResultGuard(sm, {
      allowedToolNames: ["read"],
    });

    sm.appendMessage(
      asAppendMessage({
        role: "assistant",
        content: [{ type: "toolCall", id: "call_1", name: "write", arguments: {} }],
      }),
    );

    expect(getPersistedMessages(sm)).toHaveLength(0);
  });

  // A valid call followed by a malformed (argument-less) call: the second message is
  // not persisted, but a toolResult for the first call is.
  it("flushes pending tool results when a sanitized assistant message is dropped", () => {
    const sm = SessionManager.inMemory();
    installSessionToolResultGuard(sm);

    sm.appendMessage(
      asAppendMessage({
        role: "assistant",
        content: [{ type: "toolCall", id: "call_1", name: "read", arguments: {} }],
      }),
    );
    sm.appendMessage(
      asAppendMessage({
        role: "assistant",
        content: [{ type: "toolCall", id: "call_2", name: "read" }],
      }),
    );

    expectPersistedRoles(sm, ["assistant", "toolResult"]);
  });

  // A 500,000-character result must be persisted shorter and mention the truncation.
  it("caps oversized tool result text during persistence", () => {
    const sm = SessionManager.inMemory();
    installSessionToolResultGuard(sm);

    appendToolResultText(sm, "x".repeat(500_000));

    const text = getToolResultText(getPersistedMessages(sm));
    expect(text.length).toBeLessThan(500_000);
    expect(text).toContain("truncated");
  });

  // A short result must be persisted verbatim.
  it("does not truncate tool results under the limit", () => {
    const sm = SessionManager.inMemory();
    installSessionToolResultGuard(sm);
    const originalText = "small tool result";

    appendToolResultText(sm, originalText);

    const text = getToolResultText(getPersistedMessages(sm));
    expect(text).toBe(originalText);
  });

  // A hook returning `{ block: true }` prevents the message from being persisted.
  it("blocks persistence when before_message_write returns block=true", () => {
    const sm = SessionManager.inMemory();
    installSessionToolResultGuard(sm, {
      beforeMessageWriteHook: () => ({ block: true }),
    });

    sm.appendMessage(
      asAppendMessage({
        role: "user",
        content: "hidden",
        timestamp: Date.now(),
      }),
    );

    expect(getPersistedMessages(sm)).toHaveLength(0);
  });

  // A hook returning a replacement `message` has that replacement persisted instead.
  it("applies before_message_write message mutations before persistence", () => {
    const sm = SessionManager.inMemory();
    installSessionToolResultGuard(sm, {
      beforeMessageWriteHook: ({ message }) => {
        // Only rewrite tool results; leave every other message untouched.
        if ((message as { role?: string }).role !== "toolResult") {
          return undefined;
        }
        return {
          message: {
            ...(message as unknown as Record<string, unknown>),
            content: [{ type: "text", text: "rewritten by hook" }],
          } as unknown as AgentMessage,
        };
      },
    });

    appendToolResultText(sm, "original");

    const text = getToolResultText(getPersistedMessages(sm));
    expect(text).toBe("rewritten by hook");
  });

  // The hook also sees toolResults produced by an explicit flush: blocking them
  // leaves only the assistant message persisted.
  it("applies before_message_write to synthetic tool-result flushes", () => {
    const sm = SessionManager.inMemory();
    const guard = installSessionToolResultGuard(sm, {
      beforeMessageWriteHook: ({ message }) => {
        if ((message as { role?: string }).role !== "toolResult") {
          return undefined;
        }
        return { block: true };
      },
    });

    sm.appendMessage(toolCallMessage);
    guard.flushPendingToolResults();

    expectPersistedRoles(sm, ["assistant"]);
  });

  // `transformMessageForPersistence` output is what gets stored: the user message
  // gains the `provenance` field added by the transform.
  it("applies message persistence transform to user messages", () => {
    const sm = SessionManager.inMemory();
    installSessionToolResultGuard(sm, {
      transformMessageForPersistence: (message) =>
        (message as { role?: string }).role === "user"
          ? ({
              ...(message as unknown as Record<string, unknown>),
              provenance: { kind: "inter_session", sourceTool: "sessions_send" },
            } as unknown as AgentMessage)
          : message,
    });

    sm.appendMessage(
      asAppendMessage({
        role: "user",
        content: "forwarded",
        timestamp: Date.now(),
      }),
    );

    const persisted = sm.getEntries().find((e) => e.type === "message") as
      | { message?: Record<string, unknown> }
      | undefined;
    expect(persisted?.message?.role).toBe("user");
    expect(persisted?.message?.provenance).toEqual({
      kind: "inter_session",
      sourceTool: "sessions_send",
    });
  });
});