843 lines
25 KiB
TypeScript
Raw Normal View History

2026-01-14 01:08:15 +00:00
import type { AssistantMessage } from "@mariozechner/pi-ai";
2026-01-30 03:15:10 +01:00
import type { OpenClawConfig } from "../../config/config.js";
import { createSubsystemLogger } from "../../logging/subsystem.js";
2026-02-18 01:29:02 +00:00
import { formatSandboxToolPolicyBlockedMessage } from "../sandbox.js";
import { stableStringify } from "../stable-stringify.js";
import type { FailoverReason } from "./types.js";
2026-01-14 01:08:15 +00:00
// Subsystem-scoped logger used by the error-formatting helpers in this file.
const log = createSubsystemLogger("errors");
/**
 * Builds the user-facing message shown when a provider rejects a request for
 * billing reasons (no credits / insufficient balance).
 *
 * @param provider Optional provider name; whitespace-only values are ignored.
 * @param model Optional model name; only used when a provider is also given.
 * @returns A provider-specific message when a provider name is available,
 *          otherwise a generic fallback message.
 */
export function formatBillingErrorMessage(provider?: string, model?: string): string {
  const trimmedProvider = provider?.trim();
  const trimmedModel = model?.trim();
  // Label is "provider (model)" when both are present, "provider" when only
  // the provider is known, and absent otherwise (model alone is not useful).
  let label: string | undefined;
  if (trimmedProvider) {
    label = trimmedModel ? `${trimmedProvider} (${trimmedModel})` : trimmedProvider;
  }
  if (!label) {
    return "⚠️ API provider returned a billing error — your API key has run out of credits or has an insufficient balance. Check your provider's billing dashboard and top up or switch to a different API key.";
  }
  return `⚠️ ${label} returned a billing error — your API key has run out of credits or has an insufficient balance. Check your ${trimmedProvider} billing dashboard and top up or switch to a different API key.`;
}
// Generic billing message (no provider context), precomputed for reuse.
export const BILLING_ERROR_USER_MESSAGE = formatBillingErrorMessage();

const RATE_LIMIT_ERROR_USER_MESSAGE = "⚠️ API rate limit reached. Please try again later.";
const OVERLOADED_ERROR_USER_MESSAGE =
  "The AI service is temporarily overloaded. Please try again in a moment.";

/**
 * Maps a raw provider error onto friendly rate-limit or overload copy.
 *
 * @returns The matching user-facing message, or undefined when the text is
 *          neither a rate-limit nor an overload error.
 */
function formatRateLimitOrOverloadedErrorCopy(raw: string): string | undefined {
  if (isRateLimitErrorMessage(raw)) {
    return RATE_LIMIT_ERROR_USER_MESSAGE;
  }
  return isOverloadedErrorMessage(raw) ? OVERLOADED_ERROR_USER_MESSAGE : undefined;
}
2026-01-14 01:08:15 +00:00
export function isContextOverflowError(errorMessage?: string): boolean {
if (!errorMessage) {
return false;
}
2026-01-14 01:08:15 +00:00
const lower = errorMessage.toLowerCase();
fix: auto-compact on context overflow promptError before returning error (#1627) * fix: detect Anthropic 'Request size exceeds model context window' as context overflow Anthropic now returns 'Request size exceeds model context window' instead of the previously detected 'prompt is too long' format. This new error message was not recognized by isContextOverflowError(), causing auto-compaction to NOT trigger. Users would see the raw error twice without any recovery attempt. Changes: - Add 'exceeds model context window' and 'request size exceeds' to isContextOverflowError() detection patterns - Add tests that fail without the fix, verifying both the raw error string and the JSON-wrapped format from Anthropic's API - Add test for formatAssistantErrorText to ensure the friendly 'Context overflow' message is shown instead of the raw error Note: The upstream pi-ai package (@mariozechner/pi-ai) also needs a fix in its OVERFLOW_PATTERNS regex: /exceeds the context window/i should be changed to /exceeds.*context window/i to match both 'the' and 'model' variants for triggering auto-compaction retry. * fix(tests): remove unused imports and helper from test files Remove WorkspaceBootstrapFile references and _makeFile helper that were incorrectly copied from another test file. These caused type errors and were unrelated to the context overflow detection tests. * fix: trigger auto-compaction on context overflow promptError When the LLM rejects a request with a context overflow error that surfaces as a promptError (thrown exception rather than streamed error), the existing auto-compaction in pi-coding-agent never triggers. This happens because the error bypasses the agent's message_end → agent_end → _checkCompaction path. 
This fix adds a fallback compaction attempt directly in the run loop: - Detects context overflow in promptError (excluding compaction_failure) - Calls compactEmbeddedPiSessionDirect (bypassing lane queues since already in-lane) - Retries the prompt after successful compaction - Limits to one compaction attempt per run to prevent infinite loops Fixes: context overflow errors shown to user without auto-compaction attempt * style: format compact.ts and run.ts with oxfmt * fix: tighten context overflow match (#1627) (thanks @rodrigouroz) --------- Co-authored-by: Claude <claude@anthropic.com> Co-authored-by: Peter Steinberger <steipete@gmail.com>
2026-01-24 19:09:24 -03:00
const hasRequestSizeExceeds = lower.includes("request size exceeds");
const hasContextWindow =
lower.includes("context window") ||
lower.includes("context length") ||
lower.includes("maximum context length");
2026-01-14 01:08:15 +00:00
return (
lower.includes("request_too_large") ||
lower.includes("request exceeds the maximum size") ||
lower.includes("context length exceeded") ||
lower.includes("maximum context length") ||
lower.includes("prompt is too long") ||
fix: auto-compact on context overflow promptError before returning error (#1627) * fix: detect Anthropic 'Request size exceeds model context window' as context overflow Anthropic now returns 'Request size exceeds model context window' instead of the previously detected 'prompt is too long' format. This new error message was not recognized by isContextOverflowError(), causing auto-compaction to NOT trigger. Users would see the raw error twice without any recovery attempt. Changes: - Add 'exceeds model context window' and 'request size exceeds' to isContextOverflowError() detection patterns - Add tests that fail without the fix, verifying both the raw error string and the JSON-wrapped format from Anthropic's API - Add test for formatAssistantErrorText to ensure the friendly 'Context overflow' message is shown instead of the raw error Note: The upstream pi-ai package (@mariozechner/pi-ai) also needs a fix in its OVERFLOW_PATTERNS regex: /exceeds the context window/i should be changed to /exceeds.*context window/i to match both 'the' and 'model' variants for triggering auto-compaction retry. * fix(tests): remove unused imports and helper from test files Remove WorkspaceBootstrapFile references and _makeFile helper that were incorrectly copied from another test file. These caused type errors and were unrelated to the context overflow detection tests. * fix: trigger auto-compaction on context overflow promptError When the LLM rejects a request with a context overflow error that surfaces as a promptError (thrown exception rather than streamed error), the existing auto-compaction in pi-coding-agent never triggers. This happens because the error bypasses the agent's message_end → agent_end → _checkCompaction path. 
This fix adds a fallback compaction attempt directly in the run loop: - Detects context overflow in promptError (excluding compaction_failure) - Calls compactEmbeddedPiSessionDirect (bypassing lane queues since already in-lane) - Retries the prompt after successful compaction - Limits to one compaction attempt per run to prevent infinite loops Fixes: context overflow errors shown to user without auto-compaction attempt * style: format compact.ts and run.ts with oxfmt * fix: tighten context overflow match (#1627) (thanks @rodrigouroz) --------- Co-authored-by: Claude <claude@anthropic.com> Co-authored-by: Peter Steinberger <steipete@gmail.com>
2026-01-24 19:09:24 -03:00
lower.includes("exceeds model context window") ||
lower.includes("model token limit") ||
fix: auto-compact on context overflow promptError before returning error (#1627) * fix: detect Anthropic 'Request size exceeds model context window' as context overflow Anthropic now returns 'Request size exceeds model context window' instead of the previously detected 'prompt is too long' format. This new error message was not recognized by isContextOverflowError(), causing auto-compaction to NOT trigger. Users would see the raw error twice without any recovery attempt. Changes: - Add 'exceeds model context window' and 'request size exceeds' to isContextOverflowError() detection patterns - Add tests that fail without the fix, verifying both the raw error string and the JSON-wrapped format from Anthropic's API - Add test for formatAssistantErrorText to ensure the friendly 'Context overflow' message is shown instead of the raw error Note: The upstream pi-ai package (@mariozechner/pi-ai) also needs a fix in its OVERFLOW_PATTERNS regex: /exceeds the context window/i should be changed to /exceeds.*context window/i to match both 'the' and 'model' variants for triggering auto-compaction retry. * fix(tests): remove unused imports and helper from test files Remove WorkspaceBootstrapFile references and _makeFile helper that were incorrectly copied from another test file. These caused type errors and were unrelated to the context overflow detection tests. * fix: trigger auto-compaction on context overflow promptError When the LLM rejects a request with a context overflow error that surfaces as a promptError (thrown exception rather than streamed error), the existing auto-compaction in pi-coding-agent never triggers. This happens because the error bypasses the agent's message_end → agent_end → _checkCompaction path. 
This fix adds a fallback compaction attempt directly in the run loop: - Detects context overflow in promptError (excluding compaction_failure) - Calls compactEmbeddedPiSessionDirect (bypassing lane queues since already in-lane) - Retries the prompt after successful compaction - Limits to one compaction attempt per run to prevent infinite loops Fixes: context overflow errors shown to user without auto-compaction attempt * style: format compact.ts and run.ts with oxfmt * fix: tighten context overflow match (#1627) (thanks @rodrigouroz) --------- Co-authored-by: Claude <claude@anthropic.com> Co-authored-by: Peter Steinberger <steipete@gmail.com>
2026-01-24 19:09:24 -03:00
(hasRequestSizeExceeds && hasContextWindow) ||
lower.includes("context overflow:") ||
2026-01-14 01:08:15 +00:00
(lower.includes("413") && lower.includes("too large"))
);
}
// "context window ... too small / minimum is": a model-configuration problem,
// not a prompt overflow — used to veto the overflow heuristics below.
const CONTEXT_WINDOW_TOO_SMALL_RE = /context window.*(too small|minimum is)/i;
// Broad heuristic for wording that suggests the prompt/request overflowed the
// model's context (intentionally loose; rate-limit text is excluded separately).
const CONTEXT_OVERFLOW_HINT_RE =
  /context.*overflow|context window.*(too (?:large|long)|exceed|over|limit|max(?:imum)?|requested|sent|tokens)|prompt.*(too (?:large|long)|exceed|over|limit|max(?:imum)?)|(?:request|input).*(?:context|window|length|token).*(too (?:large|long)|exceed|over|limit|max(?:imum)?)/i;
// Wording that suggests a rate-limit / quota / throttling error.
const RATE_LIMIT_HINT_RE =
  /rate limit|too many requests|requests per (?:minute|hour|day)|quota|throttl|429\b/i;
/**
 * Heuristic (broader than isContextOverflowError) for deciding whether an
 * error most likely means the prompt overflowed the context window.
 *
 * @param errorMessage Raw error text; may be undefined.
 * @returns true for likely overflow; false for undersized-window config
 *          errors, rate-limit errors, and everything else.
 */
export function isLikelyContextOverflowError(errorMessage?: string): boolean {
  if (!errorMessage) {
    return false;
  }
  // An undersized/misconfigured context window is not an overflow.
  if (CONTEXT_WINDOW_TOO_SMALL_RE.test(errorMessage)) {
    return false;
  }
  // Rate-limit errors can satisfy the broad overflow heuristic (e.g.
  // "request reached organization TPD rate limit" matches request.*limit),
  // so rule them out before any overflow check.
  if (isRateLimitErrorMessage(errorMessage)) {
    return false;
  }
  // Exact known overflow shapes win immediately.
  if (isContextOverflowError(errorMessage)) {
    return true;
  }
  // Otherwise fall back to the loose hint regex, still vetoing rate-limit-ish
  // wording the dedicated predicate above may have missed.
  if (RATE_LIMIT_HINT_RE.test(errorMessage)) {
    return false;
  }
  return CONTEXT_OVERFLOW_HINT_RE.test(errorMessage);
}
2026-01-14 01:08:15 +00:00
export function isCompactionFailureError(errorMessage?: string): boolean {
if (!errorMessage) {
return false;
}
2026-01-14 01:08:15 +00:00
const lower = errorMessage.toLowerCase();
const hasCompactionTerm =
2026-01-14 01:08:15 +00:00
lower.includes("summarization failed") ||
lower.includes("auto-compaction") ||
lower.includes("compaction failed") ||
lower.includes("compaction");
if (!hasCompactionTerm) {
return false;
}
// Treat any likely overflow shape as a compaction failure when compaction terms are present.
// Providers often vary wording (e.g. "context window exceeded") across APIs.
if (isLikelyContextOverflowError(errorMessage)) {
return true;
}
// Keep explicit fallback for bare "context overflow" strings.
return lower.includes("context overflow");
2026-01-14 01:08:15 +00:00
}
// Leading "Error:" / "API error -" style prefixes that may wrap a JSON error
// payload; stripped before attempting JSON.parse.
const ERROR_PAYLOAD_PREFIX_RE =
  /^(?:error|api\s*error|apierror|openai\s*error|anthropic\s*error|gateway\s*error)[:\s-]+/i;
// <final> / </final> wrapper tags, stripped from assistant text.
const FINAL_TAG_RE = /<\s*\/?\s*final\s*>/gi;
// Generic "Error:" / "Request failed -" prefixes on plain-text error strings.
const ERROR_PREFIX_RE =
  /^(?:error|api\s*error|openai\s*error|anthropic\s*error|gateway\s*error|request failed|failed|exception)[:\s-]+/i;
// Messages that begin with a known raw context-overflow phrase, used to decide
// whether to rewrite the text with friendlier copy.
const CONTEXT_OVERFLOW_ERROR_HEAD_RE =
  /^(?:context overflow:|request_too_large\b|request size exceeds\b|request exceeds the maximum size\b|context length exceeded\b|maximum context length\b|prompt is too long\b|exceeds model context window\b)/i;
Agents: add nested subagent orchestration controls and reduce subagent token waste (#14447) * Agents: add subagent orchestration controls * Agents: add subagent orchestration controls (WIP uncommitted changes) * feat(subagents): add depth-based spawn gating for sub-sub-agents * feat(subagents): tool policy, registry, and announce chain for nested agents * feat(subagents): system prompt, docs, changelog for nested sub-agents * fix(subagents): prevent model fallback override, show model during active runs, and block context overflow fallback Bug 1: When a session has an explicit model override (e.g., gpt/openai-codex), the fallback candidate logic in resolveFallbackCandidates silently appended the global primary model (opus) as a backstop. On reinjection/steer with a transient error, the session could fall back to opus which has a smaller context window and crash. Fix: when storedModelOverride is set, pass fallbacksOverride ?? [] instead of undefined, preventing the implicit primary backstop. Bug 2: Active subagents showed 'model n/a' in /subagents list because resolveModelDisplay only read entry.model/modelProvider (populated after run completes). Fix: fall back to modelOverride/providerOverride fields which are populated at spawn time via sessions.patch. Bug 3: Context overflow errors (prompt too long, context_length_exceeded) could theoretically escape runEmbeddedPiAgent and be treated as failover candidates in runWithModelFallback, causing a switch to a model with a smaller context window. Fix: in runWithModelFallback, detect context overflow errors via isLikelyContextOverflowError and rethrow them immediately instead of trying the next model candidate. 
* fix(subagents): track spawn depth in session store and fix announce routing for nested agents * Fix compaction status tracking and dedupe overflow compaction triggers * fix(subagents): enforce depth block via session store and implement cascade kill * fix: inject group chat context into system prompt * fix(subagents): always write model to session store at spawn time * Preserve spawnDepth when agent handler rewrites session entry * fix(subagents): suppress announce on steer-restart * fix(subagents): fallback spawned session model to runtime default * fix(subagents): enforce spawn depth when caller key resolves by sessionId * feat(subagents): implement active-first ordering for numeric targets and enhance task display - Added a test to verify that subagents with numeric targets follow an active-first list ordering. - Updated `resolveSubagentTarget` to sort subagent runs based on active status and recent activity. - Enhanced task display in command responses to prevent truncation of long task descriptions. - Introduced new utility functions for compacting task text and managing subagent run states. * fix(subagents): show model for active runs via run record fallback When the spawned model matches the agent's default model, the session store's override fields are intentionally cleared (isDefault: true). The model/modelProvider fields are only populated after the run completes. This left active subagents showing 'model n/a'. Fix: store the resolved model on SubagentRunRecord at registration time, and use it as a fallback in both display paths (subagents tool and /subagents command) when the session store entry has no model info. 
Changes: - SubagentRunRecord: add optional model field - registerSubagentRun: accept and persist model param - sessions-spawn-tool: pass resolvedModel to registerSubagentRun - subagents-tool: pass run record model as fallback to resolveModelDisplay - commands-subagents: pass run record model as fallback to resolveModelDisplay * feat(chat): implement session key resolution and reset on sidebar navigation - Added functions to resolve the main session key and reset chat state when switching sessions from the sidebar. - Updated the `renderTab` function to handle session key changes when navigating to the chat tab. - Introduced a test to verify that the session resets to "main" when opening chat from the sidebar navigation. * fix: subagent timeout=0 passthrough and fallback prompt duplication Bug 1: runTimeoutSeconds=0 now means 'no timeout' instead of applying 600s default - sessions-spawn-tool: default to undefined (not 0) when neither timeout param is provided; use != null check so explicit 0 passes through to gateway - agent.ts: accept 0 as valid timeout (resolveAgentTimeoutMs already handles 0 → MAX_SAFE_TIMEOUT_MS) Bug 2: model fallback no longer re-injects the original prompt as a duplicate - agent.ts: track fallback attempt index; on retries use a short continuation message instead of the full original prompt since the session file already contains it from the first attempt - Also skip re-sending images on fallback retries (already in session) * feat(subagents): truncate long task descriptions in subagents command output - Introduced a new utility function to format task previews, limiting their length to improve readability. - Updated the command handler to use the new formatting function, ensuring task descriptions are truncated appropriately. - Adjusted related tests to verify that long task descriptions are now truncated in the output. 
* refactor(subagents): update subagent registry path resolution and improve command output formatting - Replaced direct import of STATE_DIR with a utility function to resolve the state directory dynamically. - Enhanced the formatting of command output for active and recent subagents, adding separators for better readability. - Updated related tests to reflect changes in command output structure. * fix(subagent): default sessions_spawn to no timeout when runTimeoutSeconds omitted The previous fix (75a791106) correctly handled the case where runTimeoutSeconds was explicitly set to 0 ("no timeout"). However, when models omit the parameter entirely (which is common since the schema marks it as optional), runTimeoutSeconds resolved to undefined. undefined flowed through the chain as: sessions_spawn → timeout: undefined (since undefined != null is false) → gateway agent handler → agentCommand opts.timeout: undefined → resolveAgentTimeoutMs({ overrideSeconds: undefined }) → DEFAULT_AGENT_TIMEOUT_SECONDS (600s = 10 minutes) This caused subagents to be killed at exactly 10 minutes even though the user's intent (via TOOLS.md) was for subagents to run without a timeout. Fix: default runTimeoutSeconds to 0 (no timeout) when neither runTimeoutSeconds nor timeoutSeconds is provided by the caller. Subagent spawns are long-running by design and should not inherit the 600s agent-command default timeout. * fix(subagent): accept timeout=0 in agent-via-gateway path (second 600s default) * fix: thread timeout override through getReplyFromConfig dispatch path getReplyFromConfig called resolveAgentTimeoutMs({ cfg }) with no override, always falling back to the config default (600s). Add timeoutOverrideSeconds to GetReplyOptions and pass it through as overrideSeconds so callers of the dispatch chain can specify a custom timeout (0 = no timeout). This complements the existing timeout threading in agentCommand and the cron isolated-agent runner, which already pass overrideSeconds correctly. 
* feat(model-fallback): normalize OpenAI Codex model references and enhance fallback handling - Added normalization for OpenAI Codex model references, specifically converting "gpt-5.3-codex" to "openai-codex" before execution. - Updated the `resolveFallbackCandidates` function to utilize the new normalization logic. - Enhanced tests to verify the correct behavior of model normalization and fallback mechanisms. - Introduced a new test case to ensure that the normalization process works as expected for various input formats. * feat(tests): add unit tests for steer failure behavior in openclaw-tools - Introduced a new test file to validate the behavior of subagents when steer replacement dispatch fails. - Implemented tests to ensure that the announce behavior is restored correctly and that the suppression reason is cleared as expected. - Enhanced the subagent registry with a new function to clear steer restart suppression. - Updated related components to support the new test scenarios. * fix(subagents): replace stop command with kill in slash commands and documentation - Updated the `/subagents` command to replace `stop` with `kill` for consistency in controlling sub-agent runs. - Modified related documentation to reflect the change in command usage. - Removed legacy timeoutSeconds references from the sessions-spawn-tool schema and tests to streamline timeout handling. - Enhanced tests to ensure correct behavior of the updated commands and their interactions. * feat(tests): add unit tests for readLatestAssistantReply function - Introduced a new test file for the `readLatestAssistantReply` function to validate its behavior with various message scenarios. - Implemented tests to ensure the function correctly retrieves the latest assistant message and handles cases where the latest message has no text. - Mocked the gateway call to simulate different message histories for comprehensive testing. 
* feat(tests): enhance subagent kill-all cascade tests and announce formatting - Added a new test to verify that the `kill-all` command cascades through ended parents to active descendants in subagents. - Updated the subagent announce formatting tests to reflect changes in message structure, including the replacement of "Findings:" with "Result:" and the addition of new expectations for message content. - Improved the handling of long findings and stats in the announce formatting logic to ensure concise output. - Refactored related functions to enhance clarity and maintainability in the subagent registry and tools. * refactor(subagent): update announce formatting and remove unused constants - Modified the subagent announce formatting to replace "Findings:" with "Result:" and adjusted related expectations in tests. - Removed constants for maximum announce findings characters and summary words, simplifying the announcement logic. - Updated the handling of findings to retain full content instead of truncating, ensuring more informative outputs. - Cleaned up unused imports in the commands-subagents file to enhance code clarity. * feat(tests): enhance billing error handling in user-facing text - Added tests to ensure that normal text mentioning billing plans is not rewritten, preserving user context. - Updated the `isBillingErrorMessage` and `sanitizeUserFacingText` functions to improve handling of billing-related messages. - Introduced new test cases for various scenarios involving billing messages to ensure accurate processing and output. - Enhanced the subagent announce flow to correctly manage active descendant runs, preventing premature announcements. * feat(subagent): enhance workflow guidance and auto-announcement clarity - Added a new guideline in the subagent system prompt to emphasize trust in push-based completion, discouraging busy polling for status updates. 
- Updated documentation to clarify that sub-agents will automatically announce their results, improving user understanding of the workflow. - Enhanced tests to verify the new guidance on avoiding polling loops and to ensure the accuracy of the updated prompts. * fix(cron): avoid announcing interim subagent spawn acks * chore: clean post-rebase imports * fix(cron): fall back to child replies when parent stays interim * fix(subagents): make active-run guidance advisory * fix(subagents): update announce flow to handle active descendants and enhance test coverage - Modified the announce flow to defer announcements when active descendant runs are present, ensuring accurate status reporting. - Updated tests to verify the new behavior, including scenarios where no fallback requester is available and ensuring proper handling of finished subagents. - Enhanced the announce formatting to include an `expectFinal` flag for better clarity in the announcement process. * fix(subagents): enhance announce flow and formatting for user updates - Updated the announce flow to provide clearer instructions for user updates based on active subagent runs and requester context. - Refactored the announcement logic to improve clarity and ensure internal context remains private. - Enhanced tests to verify the new message expectations and formatting, including updated prompts for user-facing updates. - Introduced a new function to build reply instructions based on session context, improving the overall announcement process. * fix: resolve prep blockers and changelog placement (#14447) (thanks @tyler6204) * fix: restore cron delivery-plan import after rebase (#14447) (thanks @tyler6204) * fix: resolve test failures from rebase conflicts (#14447) (thanks @tyler6204) * fix: apply formatting after rebase (#14447) (thanks @tyler6204)
2026-02-14 22:03:45 -08:00
// Messages that begin with billing / insufficient-credit wording (optionally
// behind an "error:" prefix) or an HTTP 402 marker.
const BILLING_ERROR_HEAD_RE =
  /^(?:error[:\s-]+)?billing(?:\s+error)?(?:[:\s-]+|$)|^(?:error[:\s-]+)?(?:credit balance|insufficient credits?|payment required|http\s*402\b)/i;
// "NNN <message>" — status code followed by a required non-empty message.
const HTTP_STATUS_PREFIX_RE = /^(?:http\s*)?(\d{3})\s+(.+)$/i;
// "NNN" with an optional trailing message (message group may be absent).
const HTTP_STATUS_CODE_PREFIX_RE = /^(?:http\s*)?(\d{3})(?:\s+([\s\S]+))?$/i;
// Bodies that begin with an HTML document — typical of proxy/CDN error pages.
const HTML_ERROR_PREFIX_RE = /^\s*(?:<!doctype\s+html\b|<html\b)/i;
// Cloudflare-specific 5xx codes whose bodies are HTML error pages.
const CLOUDFLARE_HTML_ERROR_CODES = new Set([521, 522, 523, 524, 525, 526, 530]);
// 5xx codes treated as transient (reasonable to retry / fail over).
const TRANSIENT_HTTP_ERROR_CODES = new Set([500, 502, 503, 504, 521, 522, 523, 524, 529]);
// Substrings that make an "NNN <text>" line look like an actual error message
// rather than ordinary content that happens to start with three digits.
const HTTP_ERROR_HINTS = [
  "error",
  "bad request",
  "not found",
  "unauthorized",
  "forbidden",
  "internal server",
  "service unavailable",
  "gateway",
  "rate limit",
  "overloaded",
  "timeout",
  "timed out",
  "invalid",
  "too many requests",
  "permission",
];
/**
 * Extracts a leading 3-digit HTTP status (optionally prefixed with "http")
 * from an error string.
 *
 * @returns The numeric code plus the trimmed remainder of the message, or
 *          null when the string does not start with a status code.
 */
function extractLeadingHttpStatus(raw: string): { code: number; rest: string } | null {
  const match = HTTP_STATUS_CODE_PREFIX_RE.exec(raw);
  if (!match) {
    return null;
  }
  const code = Number(match[1]);
  // The trailing message group is optional; normalize absent → "".
  return Number.isFinite(code) ? { code, rest: (match[2] ?? "").trim() } : null;
}
/**
 * Detects 5xx responses whose body is a Cloudflare or generic HTML error
 * page (as opposed to a structured API error).
 */
export function isCloudflareOrHtmlErrorPage(raw: string): boolean {
  const trimmed = raw.trim();
  if (!trimmed) {
    return false;
  }
  const status = extractLeadingHttpStatus(trimmed);
  // Only 5xx responses qualify.
  if (!status || status.code < 500) {
    return false;
  }
  // Cloudflare edge/origin error codes always ship HTML bodies.
  if (CLOUDFLARE_HTML_ERROR_CODES.has(status.code)) {
    return true;
  }
  if (status.code >= 600) {
    return false;
  }
  // Otherwise require a full HTML document in the remainder of the message.
  return HTML_ERROR_PREFIX_RE.test(status.rest) && /<\/html>/i.test(status.rest);
}
/**
 * True when the message starts with an HTTP status code that is considered
 * transient (server-side 5xx worth retrying or failing over).
 */
export function isTransientHttpError(raw: string): boolean {
  // extractLeadingHttpStatus returns null for empty/non-status strings.
  const status = extractLeadingHttpStatus(raw.trim());
  return status !== null && TRANSIENT_HTTP_ERROR_CODES.has(status.code);
}
/**
 * Removes any <final>/</final> wrapper tags from assistant text.
 * Empty input is returned unchanged.
 */
function stripFinalTagsFromText(text: string): string {
  return text ? text.replace(FINAL_TAG_RE, "") : text;
}
/**
 * Collapses consecutive paragraph blocks (separated by blank lines) that are
 * duplicates after whitespace normalization — e.g. an error repeated twice.
 *
 * @returns The text with consecutive duplicate blocks removed (blocks joined
 *          by a single blank line), or the original text when nothing was
 *          collapsed or the input is blank / a single block.
 */
function collapseConsecutiveDuplicateBlocks(text: string): string {
  const trimmed = text.trim();
  if (!trimmed) {
    return text;
  }
  const paragraphs = trimmed.split(/\n{2,}/);
  if (paragraphs.length < 2) {
    return text;
  }
  // Compare blocks on a whitespace-collapsed canonical form.
  const canonical = (value: string) => value.trim().replace(/\s+/g, " ");
  const kept: string[] = [];
  let previousKey: string | null = null;
  for (const paragraph of paragraphs) {
    const key = canonical(paragraph);
    // Truthiness check (not null-check) mirrors the original: blocks that
    // normalize to "" never suppress a following block.
    if (previousKey && key === previousKey) {
      continue;
    }
    kept.push(paragraph.trim());
    previousKey = key;
  }
  // If nothing was dropped, preserve the input byte-for-byte.
  return kept.length === paragraphs.length ? text : kept.join("\n\n");
}
/**
 * Heuristic for "NNN <message>" strings that look like HTTP error responses:
 * status >= 400 with error-ish wording, or a recognized HTML error page.
 */
function isLikelyHttpErrorText(raw: string): boolean {
  if (isCloudflareOrHtmlErrorPage(raw)) {
    return true;
  }
  const match = HTTP_STATUS_PREFIX_RE.exec(raw);
  if (!match) {
    return false;
  }
  const statusCode = Number(match[1]);
  if (!Number.isFinite(statusCode) || statusCode < 400) {
    return false;
  }
  const body = match[2].toLowerCase();
  for (const hint of HTTP_ERROR_HINTS) {
    if (body.includes(hint)) {
      return true;
    }
  }
  return false;
}
/**
 * Decides whether a context-overflow message should be rewritten with the
 * friendly copy: only when the text reads like a raw error (JSON payload,
 * HTTP error line, or an error/overflow-prefixed string), so legitimate
 * prose mentioning overflow is left alone.
 */
function shouldRewriteContextOverflowText(raw: string): boolean {
  if (!isContextOverflowError(raw)) {
    return false;
  }
  const looksLikeRawError =
    isRawApiErrorPayload(raw) ||
    isLikelyHttpErrorText(raw) ||
    ERROR_PREFIX_RE.test(raw) ||
    CONTEXT_OVERFLOW_ERROR_HEAD_RE.test(raw);
  return looksLikeRawError;
}
type ErrorPayload = Record<string, unknown>;

/**
 * Type guard for objects that look like provider error payloads:
 * `{ type: "error" }`, a request id field, or a nested `error` object
 * carrying a string message/type/code.
 */
function isErrorPayloadObject(payload: unknown): payload is ErrorPayload {
  if (!payload || typeof payload !== "object" || Array.isArray(payload)) {
    return false;
  }
  const record = payload as ErrorPayload;
  if (record.type === "error") {
    return true;
  }
  if (typeof record.request_id === "string" || typeof record.requestId === "string") {
    return true;
  }
  const nested = record.error;
  if (!nested || typeof nested !== "object" || Array.isArray(nested)) {
    return false;
  }
  const err = nested as ErrorPayload;
  return (
    typeof err.message === "string" ||
    typeof err.type === "string" ||
    typeof err.code === "string"
  );
}
/**
 * Attempts to parse a raw error string as a JSON error payload, optionally
 * stripping a leading "Error:"-style prefix first.
 *
 * @returns The parsed payload object, or null when no candidate is a JSON
 *          object matching the error-payload shape.
 */
function parseApiErrorPayload(raw: string): ErrorPayload | null {
  const trimmed = raw ? raw.trim() : "";
  if (!trimmed) {
    return null;
  }
  // Try the text as-is, then with any recognized error prefix removed.
  const candidates = [trimmed];
  if (ERROR_PAYLOAD_PREFIX_RE.test(trimmed)) {
    candidates.push(trimmed.replace(ERROR_PAYLOAD_PREFIX_RE, "").trim());
  }
  const tryParse = (candidate: string): ErrorPayload | null => {
    // Quick structural check before paying for JSON.parse.
    if (!candidate.startsWith("{") || !candidate.endsWith("}")) {
      return null;
    }
    try {
      const parsed = JSON.parse(candidate) as unknown;
      return isErrorPayloadObject(parsed) ? parsed : null;
    } catch {
      // Malformed JSON — fall through to the next candidate.
      return null;
    }
  };
  for (const candidate of candidates) {
    const payload = tryParse(candidate);
    if (payload) {
      return payload;
    }
  }
  return null;
}
/**
 * Produces a stable, order-independent fingerprint of a raw API error payload
 * (for deduping repeated errors), or null when the text is not a payload.
 */
export function getApiErrorPayloadFingerprint(raw?: string): string | null {
  if (!raw) {
    return null;
  }
  const payload = parseApiErrorPayload(raw);
  return payload ? stableStringify(payload) : null;
}
// True when the raw text is (or wraps) a structured provider error JSON
// payload — i.e. a fingerprint can be computed from it.
export function isRawApiErrorPayload(raw?: string): boolean {
  return getApiErrorPayloadFingerprint(raw) !== null;
}
2026-01-15 08:16:44 +00:00
export type ApiErrorInfo = {
httpCode?: string;
type?: string;
message?: string;
requestId?: string;
};
export function parseApiErrorInfo(raw?: string): ApiErrorInfo | null {
if (!raw) {
return null;
}
2026-01-15 08:16:44 +00:00
const trimmed = raw.trim();
if (!trimmed) {
return null;
}
2026-01-15 08:16:44 +00:00
let httpCode: string | undefined;
let candidate = trimmed;
const httpPrefixMatch = candidate.match(/^(\d{3})\s+(.+)$/s);
if (httpPrefixMatch) {
httpCode = httpPrefixMatch[1];
candidate = httpPrefixMatch[2].trim();
}
const payload = parseApiErrorPayload(candidate);
if (!payload) {
return null;
}
2026-01-15 08:16:44 +00:00
const requestId =
typeof payload.request_id === "string"
? payload.request_id
: typeof payload.requestId === "string"
? payload.requestId
: undefined;
const topType = typeof payload.type === "string" ? payload.type : undefined;
const topMessage = typeof payload.message === "string" ? payload.message : undefined;
let errType: string | undefined;
let errMessage: string | undefined;
if (payload.error && typeof payload.error === "object" && !Array.isArray(payload.error)) {
const err = payload.error as Record<string, unknown>;
if (typeof err.type === "string") {
errType = err.type;
}
if (typeof err.code === "string" && !errType) {
errType = err.code;
}
if (typeof err.message === "string") {
errMessage = err.message;
}
2026-01-15 08:16:44 +00:00
}
return {
httpCode,
type: errType ?? topType,
message: errMessage ?? topMessage,
requestId,
};
}
export function formatRawAssistantErrorForUi(raw?: string): string {
const trimmed = (raw ?? "").trim();
if (!trimmed) {
return "LLM request failed with an unknown error.";
}
2026-01-15 08:16:44 +00:00
const leadingStatus = extractLeadingHttpStatus(trimmed);
if (leadingStatus && isCloudflareOrHtmlErrorPage(trimmed)) {
return `The AI service is temporarily unavailable (HTTP ${leadingStatus.code}). Please try again in a moment.`;
}
2026-01-22 22:24:25 +00:00
const httpMatch = trimmed.match(HTTP_STATUS_PREFIX_RE);
if (httpMatch) {
const rest = httpMatch[2].trim();
if (!rest.startsWith("{")) {
return `HTTP ${httpMatch[1]}: ${rest}`;
}
}
2026-01-15 08:16:44 +00:00
const info = parseApiErrorInfo(trimmed);
if (info?.message) {
const prefix = info.httpCode ? `HTTP ${info.httpCode}` : "LLM error";
const type = info.type ? ` ${info.type}` : "";
const requestId = info.requestId ? ` (request_id: ${info.requestId})` : "";
return `${prefix}${type}: ${info.message}${requestId}`;
}
return trimmed.length > 600 ? `${trimmed.slice(0, 600)}` : trimmed;
}
2026-01-14 01:08:15 +00:00
/**
 * Map an assistant message's error onto user-facing copy.
 *
 * Returns undefined when the message carries no error (stopReason is not
 * "error" and no errorMessage is present). Otherwise the raw error text is
 * matched, in priority order, against known failure classes — sandbox
 * tool-policy blocks, context overflow, role ordering, corrupted tool-call
 * history, invalid_request payloads, rate limit/overload, timeout, billing,
 * and raw HTTP/JSON payloads — and rewritten into actionable guidance.
 * Unrecognized errors are returned truncated to 600 characters.
 *
 * @param msg  Assistant message whose stopReason/errorMessage is inspected.
 * @param opts Optional context (config, session key, provider, model) used to
 *             tailor sandbox-policy and billing copy.
 */
export function formatAssistantErrorText(
  msg: AssistantMessage,
  opts?: { cfg?: OpenClawConfig; sessionKey?: string; provider?: string; model?: string },
): string | undefined {
  // Also format errors if errorMessage is present, even if stopReason isn't "error"
  const raw = (msg.errorMessage ?? "").trim();
  if (msg.stopReason !== "error" && !raw) {
    return undefined;
  }
  if (!raw) {
    return "LLM request failed with an unknown error.";
  }
  // "Unknown tool" errors may actually be sandbox tool-policy blocks; try to
  // rewrite them with a policy-specific explanation first.
  const unknownTool =
    raw.match(/unknown tool[:\s]+["']?([a-z0-9_-]+)["']?/i) ??
    raw.match(/tool\s+["']?([a-z0-9_-]+)["']?\s+(?:not found|is not available)/i);
  if (unknownTool?.[1]) {
    const rewritten = formatSandboxToolPolicyBlockedMessage({
      cfg: opts?.cfg,
      sessionKey: opts?.sessionKey,
      toolName: unknownTool[1],
    });
    if (rewritten) {
      return rewritten;
    }
  }
  if (isContextOverflowError(raw)) {
    return (
      "Context overflow: prompt too large for the model. " +
      "Try /reset (or /new) to start a fresh session, or use a larger-context model."
    );
  }
  // Catch role ordering errors - including JSON-wrapped and "400" prefix variants
  if (
    /incorrect role information|roles must alternate|400.*role|"message".*role.*information/i.test(
      raw,
    )
  ) {
    return (
      "Message ordering conflict - please try again. " +
      "If this persists, use /new to start a fresh session."
    );
  }
  if (isMissingToolCallInputError(raw)) {
    return (
      "Session history looks corrupted (tool call input missing). " +
      "Use /new to start a fresh session. " +
      "If this keeps happening, reset the session or delete the corrupted session transcript."
    );
  }
  // Surface the provider's own message for invalid_request_error payloads.
  const invalidRequest = raw.match(/"type":"invalid_request_error".*?"message":"([^"]+)"/);
  if (invalidRequest?.[1]) {
    return `LLM request rejected: ${invalidRequest[1]}`;
  }
  const transientCopy = formatRateLimitOrOverloadedErrorCopy(raw);
  if (transientCopy) {
    return transientCopy;
  }
  if (isTimeoutErrorMessage(raw)) {
    return "LLM request timed out.";
  }
  if (isBillingErrorMessage(raw)) {
    // Prefer explicit provider/model from opts; fall back to the message's model.
    return formatBillingErrorMessage(opts?.provider, opts?.model ?? msg.model);
  }
  if (isLikelyHttpErrorText(raw) || isRawApiErrorPayload(raw)) {
    return formatRawAssistantErrorForUi(raw);
  }
  // Never return raw unhandled errors - log for debugging but return safe message
  if (raw.length > 600) {
    log.warn(`Long error truncated: ${raw.slice(0, 200)}`);
  }
  return raw.length > 600 ? `${raw.slice(0, 600)}` : raw;
}
/**
 * Clean assistant-produced text before showing it to the user.
 *
 * Always strips final-tag markup (via stripFinalTagsFromText), leading blank
 * lines, and consecutive duplicate blocks. When opts.errorContext is true,
 * known provider-error payloads (role ordering, context overflow, billing,
 * raw HTTP/JSON payloads, and error-prefixed rate-limit/overload/timeout
 * text) are additionally rewritten into friendly copy.
 *
 * @param text Raw assistant text.
 * @param opts errorContext: set when the caller knows `text` is an error
 *             payload; enables the error-pattern rewrites.
 */
export function sanitizeUserFacingText(text: string, opts?: { errorContext?: boolean }): string {
  if (!text) {
    return text;
  }
  const errorContext = opts?.errorContext ?? false;
  const stripped = stripFinalTagsFromText(text);
  const trimmed = stripped.trim();
  if (!trimmed) {
    return "";
  }
  // Only apply error-pattern rewrites when the caller knows this text is an error payload.
  // Otherwise we risk swallowing legitimate assistant text that merely *mentions* these errors.
  if (errorContext) {
    if (/incorrect role information|roles must alternate/i.test(trimmed)) {
      return (
        "Message ordering conflict - please try again. " +
        "If this persists, use /new to start a fresh session."
      );
    }
    if (shouldRewriteContextOverflowText(trimmed)) {
      return (
        "Context overflow: prompt too large for the model. " +
        "Try /reset (or /new) to start a fresh session, or use a larger-context model."
      );
    }
    if (isBillingErrorMessage(trimmed)) {
      return BILLING_ERROR_USER_MESSAGE;
    }
    if (isRawApiErrorPayload(trimmed) || isLikelyHttpErrorText(trimmed)) {
      return formatRawAssistantErrorForUi(trimmed);
    }
    // Error-prefixed text: prefer transient/timeout copy, else the generic formatter.
    if (ERROR_PREFIX_RE.test(trimmed)) {
      const prefixedCopy = formatRateLimitOrOverloadedErrorCopy(trimmed);
      if (prefixedCopy) {
        return prefixedCopy;
      }
      if (isTimeoutErrorMessage(trimmed)) {
        return "LLM request timed out.";
      }
      return formatRawAssistantErrorForUi(trimmed);
    }
  }
  // Strip leading blank lines (including whitespace-only lines) without clobbering indentation on
  // the first content line (e.g. markdown/code blocks).
  const withoutLeadingEmptyLines = stripped.replace(/^(?:[ \t]*\r?\n)+/, "");
  return collapseConsecutiveDuplicateBlocks(withoutLeadingEmptyLines);
}
export function isRateLimitAssistantError(msg: AssistantMessage | undefined): boolean {
if (!msg || msg.stopReason !== "error") {
return false;
}
2026-01-14 01:08:15 +00:00
return isRateLimitErrorMessage(msg.errorMessage ?? "");
}
// A matcher entry: a RegExp tested against, or a substring searched in, the
// lowercased error text (see matchesErrorPatterns below).
type ErrorPattern = RegExp | string;
/**
 * Known provider-error fingerprints grouped by failure class. Matching runs
 * against the lowercased raw text, so string entries must be lowercase.
 */
const ERROR_PATTERNS = {
  // HTTP 429s, quota exhaustion, and usage caps.
  rateLimit: [
    /rate[_ ]limit|too many requests|429/,
    "exceeded your current quota",
    "resource has been exhausted",
    "quota exceeded",
    "resource_exhausted",
    "usage limit",
  ],
  // Capacity errors, including SDK-rewritten 503s that lack a numeric prefix
  // (e.g. "service unavailable" / "high demand" without a leading "503").
  overloaded: [
    /overloaded_error|"type"\s*:\s*"overloaded_error"/i,
    "overloaded",
    "service unavailable",
    "high demand",
  ],
  // Transport timeouts, deadline expiry, and aborted/empty streams.
  timeout: [
    "timeout",
    "timed out",
    "deadline exceeded",
    "context deadline exceeded",
    /without sending (?:any )?chunks?/i,
    /\bstop reason:\s*abort\b/i,
    /\breason:\s*abort\b/i,
    /\bunhandled stop reason:\s*abort\b/i,
  ],
  // HTTP 402 variants plus credit/balance exhaustion phrasings.
  billing: [
    /["']?(?:status|code)["']?\s*[:=]\s*402\b|\bhttp\s*402\b|\berror(?:\s+code)?\s*[:=]?\s*402\b|\b(?:got|returned|received)\s+(?:a\s+)?402\b|^\s*402\s+payment/i,
    "payment required",
    "insufficient credits",
    "credit balance",
    "plans & billing",
    "insufficient balance",
  ],
  // Invalid/expired credentials and 401/403 responses.
  auth: [
    /invalid[_ ]?api[_ ]?key/,
    "incorrect api key",
    "invalid token",
    "authentication",
    "re-authenticate",
    "oauth token refresh failed",
    "unauthorized",
    "forbidden",
    "access denied",
    "expired",
    "token has expired",
    /\b401\b/,
    /\b403\b/,
    "no credentials found",
    "no api key found",
  ],
  // Malformed request payloads (e.g. tool_use id pattern mismatches).
  format: [
    "string should match pattern",
    "tool_use.id",
    "tool_use_id",
    "messages.1.content.1.tool_use.id",
    "invalid request format",
    /tool call id was.*must be/i,
  ],
} as const;
// Validation errors for tool calls whose input/arguments payload is missing.
const TOOL_CALL_INPUT_MISSING_RE =
  /tool_(?:use|call)\.(?:input|arguments).*?(?:field required|required)/i;
// Path form of the same error: "messages.N.content.M.tool_use.input".
const TOOL_CALL_INPUT_PATH_RE =
  /messages\.\d+\.content\.\d+\.tool_(?:use|call)\.(?:input|arguments)/i;
// Image-validation errors: dimension limit (captures the pixel limit) ...
const IMAGE_DIMENSION_ERROR_RE =
  /image dimensions exceed max allowed size for many-image requests:\s*(\d+)\s*pixels/i;
// ... the offending message/content indices ...
const IMAGE_DIMENSION_PATH_RE = /messages\.(\d+)\.content\.(\d+)\.image/i;
// ... and the file-size limit in megabytes (captures the MB value).
const IMAGE_SIZE_ERROR_RE = /image exceeds\s*(\d+(?:\.\d+)?)\s*mb/i;
function matchesErrorPatterns(raw: string, patterns: readonly ErrorPattern[]): boolean {
if (!raw) {
return false;
}
2026-01-14 01:08:15 +00:00
const value = raw.toLowerCase();
return patterns.some((pattern) =>
pattern instanceof RegExp ? pattern.test(value) : value.includes(pattern),
);
}
/** True when the error text matches a known rate-limit / quota fingerprint. */
export function isRateLimitErrorMessage(raw: string): boolean {
  const { rateLimit } = ERROR_PATTERNS;
  return matchesErrorPatterns(raw, rateLimit);
}
/** True when the error text matches a known timeout / aborted-stream fingerprint. */
export function isTimeoutErrorMessage(raw: string): boolean {
  const { timeout } = ERROR_PATTERNS;
  return matchesErrorPatterns(raw, timeout);
}
export function isBillingErrorMessage(raw: string): boolean {
const value = raw.toLowerCase();
if (!value) {
return false;
}
Agents: add nested subagent orchestration controls and reduce subagent token waste (#14447) * Agents: add subagent orchestration controls * Agents: add subagent orchestration controls (WIP uncommitted changes) * feat(subagents): add depth-based spawn gating for sub-sub-agents * feat(subagents): tool policy, registry, and announce chain for nested agents * feat(subagents): system prompt, docs, changelog for nested sub-agents * fix(subagents): prevent model fallback override, show model during active runs, and block context overflow fallback Bug 1: When a session has an explicit model override (e.g., gpt/openai-codex), the fallback candidate logic in resolveFallbackCandidates silently appended the global primary model (opus) as a backstop. On reinjection/steer with a transient error, the session could fall back to opus which has a smaller context window and crash. Fix: when storedModelOverride is set, pass fallbacksOverride ?? [] instead of undefined, preventing the implicit primary backstop. Bug 2: Active subagents showed 'model n/a' in /subagents list because resolveModelDisplay only read entry.model/modelProvider (populated after run completes). Fix: fall back to modelOverride/providerOverride fields which are populated at spawn time via sessions.patch. Bug 3: Context overflow errors (prompt too long, context_length_exceeded) could theoretically escape runEmbeddedPiAgent and be treated as failover candidates in runWithModelFallback, causing a switch to a model with a smaller context window. Fix: in runWithModelFallback, detect context overflow errors via isLikelyContextOverflowError and rethrow them immediately instead of trying the next model candidate. 
* fix(subagents): track spawn depth in session store and fix announce routing for nested agents * Fix compaction status tracking and dedupe overflow compaction triggers * fix(subagents): enforce depth block via session store and implement cascade kill * fix: inject group chat context into system prompt * fix(subagents): always write model to session store at spawn time * Preserve spawnDepth when agent handler rewrites session entry * fix(subagents): suppress announce on steer-restart * fix(subagents): fallback spawned session model to runtime default * fix(subagents): enforce spawn depth when caller key resolves by sessionId * feat(subagents): implement active-first ordering for numeric targets and enhance task display - Added a test to verify that subagents with numeric targets follow an active-first list ordering. - Updated `resolveSubagentTarget` to sort subagent runs based on active status and recent activity. - Enhanced task display in command responses to prevent truncation of long task descriptions. - Introduced new utility functions for compacting task text and managing subagent run states. * fix(subagents): show model for active runs via run record fallback When the spawned model matches the agent's default model, the session store's override fields are intentionally cleared (isDefault: true). The model/modelProvider fields are only populated after the run completes. This left active subagents showing 'model n/a'. Fix: store the resolved model on SubagentRunRecord at registration time, and use it as a fallback in both display paths (subagents tool and /subagents command) when the session store entry has no model info. 
Changes: - SubagentRunRecord: add optional model field - registerSubagentRun: accept and persist model param - sessions-spawn-tool: pass resolvedModel to registerSubagentRun - subagents-tool: pass run record model as fallback to resolveModelDisplay - commands-subagents: pass run record model as fallback to resolveModelDisplay * feat(chat): implement session key resolution and reset on sidebar navigation - Added functions to resolve the main session key and reset chat state when switching sessions from the sidebar. - Updated the `renderTab` function to handle session key changes when navigating to the chat tab. - Introduced a test to verify that the session resets to "main" when opening chat from the sidebar navigation. * fix: subagent timeout=0 passthrough and fallback prompt duplication Bug 1: runTimeoutSeconds=0 now means 'no timeout' instead of applying 600s default - sessions-spawn-tool: default to undefined (not 0) when neither timeout param is provided; use != null check so explicit 0 passes through to gateway - agent.ts: accept 0 as valid timeout (resolveAgentTimeoutMs already handles 0 → MAX_SAFE_TIMEOUT_MS) Bug 2: model fallback no longer re-injects the original prompt as a duplicate - agent.ts: track fallback attempt index; on retries use a short continuation message instead of the full original prompt since the session file already contains it from the first attempt - Also skip re-sending images on fallback retries (already in session) * feat(subagents): truncate long task descriptions in subagents command output - Introduced a new utility function to format task previews, limiting their length to improve readability. - Updated the command handler to use the new formatting function, ensuring task descriptions are truncated appropriately. - Adjusted related tests to verify that long task descriptions are now truncated in the output. 
* refactor(subagents): update subagent registry path resolution and improve command output formatting - Replaced direct import of STATE_DIR with a utility function to resolve the state directory dynamically. - Enhanced the formatting of command output for active and recent subagents, adding separators for better readability. - Updated related tests to reflect changes in command output structure. * fix(subagent): default sessions_spawn to no timeout when runTimeoutSeconds omitted The previous fix (75a791106) correctly handled the case where runTimeoutSeconds was explicitly set to 0 ("no timeout"). However, when models omit the parameter entirely (which is common since the schema marks it as optional), runTimeoutSeconds resolved to undefined. undefined flowed through the chain as: sessions_spawn → timeout: undefined (since undefined != null is false) → gateway agent handler → agentCommand opts.timeout: undefined → resolveAgentTimeoutMs({ overrideSeconds: undefined }) → DEFAULT_AGENT_TIMEOUT_SECONDS (600s = 10 minutes) This caused subagents to be killed at exactly 10 minutes even though the user's intent (via TOOLS.md) was for subagents to run without a timeout. Fix: default runTimeoutSeconds to 0 (no timeout) when neither runTimeoutSeconds nor timeoutSeconds is provided by the caller. Subagent spawns are long-running by design and should not inherit the 600s agent-command default timeout. * fix(subagent): accept timeout=0 in agent-via-gateway path (second 600s default) * fix: thread timeout override through getReplyFromConfig dispatch path getReplyFromConfig called resolveAgentTimeoutMs({ cfg }) with no override, always falling back to the config default (600s). Add timeoutOverrideSeconds to GetReplyOptions and pass it through as overrideSeconds so callers of the dispatch chain can specify a custom timeout (0 = no timeout). This complements the existing timeout threading in agentCommand and the cron isolated-agent runner, which already pass overrideSeconds correctly. 
* feat(model-fallback): normalize OpenAI Codex model references and enhance fallback handling - Added normalization for OpenAI Codex model references, specifically converting "gpt-5.3-codex" to "openai-codex" before execution. - Updated the `resolveFallbackCandidates` function to utilize the new normalization logic. - Enhanced tests to verify the correct behavior of model normalization and fallback mechanisms. - Introduced a new test case to ensure that the normalization process works as expected for various input formats. * feat(tests): add unit tests for steer failure behavior in openclaw-tools - Introduced a new test file to validate the behavior of subagents when steer replacement dispatch fails. - Implemented tests to ensure that the announce behavior is restored correctly and that the suppression reason is cleared as expected. - Enhanced the subagent registry with a new function to clear steer restart suppression. - Updated related components to support the new test scenarios. * fix(subagents): replace stop command with kill in slash commands and documentation - Updated the `/subagents` command to replace `stop` with `kill` for consistency in controlling sub-agent runs. - Modified related documentation to reflect the change in command usage. - Removed legacy timeoutSeconds references from the sessions-spawn-tool schema and tests to streamline timeout handling. - Enhanced tests to ensure correct behavior of the updated commands and their interactions. * feat(tests): add unit tests for readLatestAssistantReply function - Introduced a new test file for the `readLatestAssistantReply` function to validate its behavior with various message scenarios. - Implemented tests to ensure the function correctly retrieves the latest assistant message and handles cases where the latest message has no text. - Mocked the gateway call to simulate different message histories for comprehensive testing. 
* feat(tests): enhance subagent kill-all cascade tests and announce formatting - Added a new test to verify that the `kill-all` command cascades through ended parents to active descendants in subagents. - Updated the subagent announce formatting tests to reflect changes in message structure, including the replacement of "Findings:" with "Result:" and the addition of new expectations for message content. - Improved the handling of long findings and stats in the announce formatting logic to ensure concise output. - Refactored related functions to enhance clarity and maintainability in the subagent registry and tools. * refactor(subagent): update announce formatting and remove unused constants - Modified the subagent announce formatting to replace "Findings:" with "Result:" and adjusted related expectations in tests. - Removed constants for maximum announce findings characters and summary words, simplifying the announcement logic. - Updated the handling of findings to retain full content instead of truncating, ensuring more informative outputs. - Cleaned up unused imports in the commands-subagents file to enhance code clarity. * feat(tests): enhance billing error handling in user-facing text - Added tests to ensure that normal text mentioning billing plans is not rewritten, preserving user context. - Updated the `isBillingErrorMessage` and `sanitizeUserFacingText` functions to improve handling of billing-related messages. - Introduced new test cases for various scenarios involving billing messages to ensure accurate processing and output. - Enhanced the subagent announce flow to correctly manage active descendant runs, preventing premature announcements. * feat(subagent): enhance workflow guidance and auto-announcement clarity - Added a new guideline in the subagent system prompt to emphasize trust in push-based completion, discouraging busy polling for status updates. 
- Updated documentation to clarify that sub-agents will automatically announce their results, improving user understanding of the workflow. - Enhanced tests to verify the new guidance on avoiding polling loops and to ensure the accuracy of the updated prompts. * fix(cron): avoid announcing interim subagent spawn acks * chore: clean post-rebase imports * fix(cron): fall back to child replies when parent stays interim * fix(subagents): make active-run guidance advisory * fix(subagents): update announce flow to handle active descendants and enhance test coverage - Modified the announce flow to defer announcements when active descendant runs are present, ensuring accurate status reporting. - Updated tests to verify the new behavior, including scenarios where no fallback requester is available and ensuring proper handling of finished subagents. - Enhanced the announce formatting to include an `expectFinal` flag for better clarity in the announcement process. * fix(subagents): enhance announce flow and formatting for user updates - Updated the announce flow to provide clearer instructions for user updates based on active subagent runs and requester context. - Refactored the announcement logic to improve clarity and ensure internal context remains private. - Enhanced tests to verify the new message expectations and formatting, including updated prompts for user-facing updates. - Introduced a new function to build reply instructions based on session context, improving the overall announcement process. * fix: resolve prep blockers and changelog placement (#14447) (thanks @tyler6204) * fix: restore cron delivery-plan import after rebase (#14447) (thanks @tyler6204) * fix: resolve test failures from rebase conflicts (#14447) (thanks @tyler6204) * fix: apply formatting after rebase (#14447) (thanks @tyler6204)
2026-02-14 22:03:45 -08:00
if (matchesErrorPatterns(value, ERROR_PATTERNS.billing)) {
return true;
}
if (!BILLING_ERROR_HEAD_RE.test(raw)) {
return false;
}
return (
value.includes("upgrade") ||
value.includes("credits") ||
value.includes("payment") ||
value.includes("plan")
);
2026-01-14 01:08:15 +00:00
}
/** True when the error indicates a tool call whose input payload is missing. */
export function isMissingToolCallInputError(raw: string): boolean {
  if (!raw) {
    return false;
  }
  return [TOOL_CALL_INPUT_MISSING_RE, TOOL_CALL_INPUT_PATH_RE].some((re) => re.test(raw));
}
export function isBillingAssistantError(msg: AssistantMessage | undefined): boolean {
if (!msg || msg.stopReason !== "error") {
return false;
}
2026-01-14 01:08:15 +00:00
return isBillingErrorMessage(msg.errorMessage ?? "");
}
/** True when the error text matches a known auth/credential fingerprint. */
export function isAuthErrorMessage(raw: string): boolean {
  const { auth } = ERROR_PATTERNS;
  return matchesErrorPatterns(raw, auth);
}
/** True when the error text matches a known overloaded/capacity fingerprint. */
export function isOverloadedErrorMessage(raw: string): boolean {
  const { overloaded } = ERROR_PATTERNS;
  return matchesErrorPatterns(raw, overloaded);
}
/**
 * Detect JSON-wrapped transient 500s from the provider.
 *
 * Anthropic often wraps transient 500s in JSON payloads like:
 *   {"type":"error","error":{"type":"api_error","message":"Internal server error"}}
 *
 * Fix: the previous exact-substring check ('"type":"api_error"') missed
 * pretty-printed payloads that put whitespace around the colon
 * ('"type": "api_error"'); match the key/value pair with a
 * whitespace-tolerant regex instead (a strict superset of the old match).
 */
function isJsonApiInternalServerError(raw: string): boolean {
  if (!raw) {
    return false;
  }
  const value = raw.toLowerCase();
  return /"type"\s*:\s*"api_error"/.test(value) && value.includes("internal server error");
}
2026-01-18 15:19:25 +00:00
export function parseImageDimensionError(raw: string): {
maxDimensionPx?: number;
messageIndex?: number;
contentIndex?: number;
raw: string;
} | null {
if (!raw) {
return null;
}
2026-01-18 15:19:25 +00:00
const lower = raw.toLowerCase();
if (!lower.includes("image dimensions exceed max allowed size")) {
return null;
}
2026-01-18 15:19:25 +00:00
const limitMatch = raw.match(IMAGE_DIMENSION_ERROR_RE);
const pathMatch = raw.match(IMAGE_DIMENSION_PATH_RE);
return {
maxDimensionPx: limitMatch?.[1] ? Number.parseInt(limitMatch[1], 10) : undefined,
messageIndex: pathMatch?.[1] ? Number.parseInt(pathMatch[1], 10) : undefined,
contentIndex: pathMatch?.[2] ? Number.parseInt(pathMatch[2], 10) : undefined,
raw,
};
}
/** True when the error text is an image-dimension validation error. */
export function isImageDimensionErrorMessage(raw: string): boolean {
  return parseImageDimensionError(raw) !== null;
}
/**
 * Parse an "image exceeds N MB" provider error.
 *
 * Returns the MB limit when the text includes one, or null when the text is
 * not an image-size error.
 */
export function parseImageSizeError(raw: string): {
  maxMb?: number;
  raw: string;
} | null {
  if (!raw) {
    return null;
  }
  const lowered = raw.toLowerCase();
  if (!(lowered.includes("image exceeds") && lowered.includes("mb"))) {
    return null;
  }
  const sizeMatch = IMAGE_SIZE_ERROR_RE.exec(raw);
  const maxMb = sizeMatch?.[1] ? Number.parseFloat(sizeMatch[1]) : undefined;
  return { maxMb, raw };
}
/** True when the error text is an image file-size validation error. */
export function isImageSizeError(errorMessage?: string): boolean {
  return errorMessage ? parseImageSizeError(errorMessage) !== null : false;
}
2026-01-14 01:08:15 +00:00
export function isCloudCodeAssistFormatError(raw: string): boolean {
2026-01-18 15:19:25 +00:00
return !isImageDimensionErrorMessage(raw) && matchesErrorPatterns(raw, ERROR_PATTERNS.format);
2026-01-14 01:08:15 +00:00
}
export function isAuthAssistantError(msg: AssistantMessage | undefined): boolean {
if (!msg || msg.stopReason !== "error") {
return false;
}
2026-01-14 01:08:15 +00:00
return isAuthErrorMessage(msg.errorMessage ?? "");
}
/**
 * Detect "model not found"-class errors across providers: direct phrasing
 * (OpenClaw internals and common providers), Gemini's "models/X is not found",
 * and JSON payloads combining a 404 with not-found text.
 */
export function isModelNotFoundErrorMessage(raw: string): boolean {
  if (!raw) {
    return false;
  }
  const lower = raw.toLowerCase();
  // Direct phrasings; "invalid model reference" is excluded because it is a
  // different (reference-syntax) error, not a missing model.
  const phraseHit =
    lower.includes("unknown model") ||
    lower.includes("model not found") ||
    lower.includes("model_not_found") ||
    lower.includes("not_found_error") ||
    (lower.includes("does not exist") && lower.includes("model")) ||
    (lower.includes("invalid model") && !lower.includes("invalid model reference"));
  if (phraseHit) {
    return true;
  }
  // Google Gemini: "models/X is not found for api version"
  if (/models\/[^\s]+ is not found/i.test(raw)) {
    return true;
  }
  // JSON error payloads: a 404 status/code combined with not-found text.
  return /\b404\b/.test(raw) && /not[-_ ]?found/i.test(raw);
}
2026-01-14 01:08:15 +00:00
export function classifyFailoverReason(raw: string): FailoverReason | null {
if (isImageDimensionErrorMessage(raw)) {
return null;
}
if (isImageSizeError(raw)) {
return null;
}
if (isModelNotFoundErrorMessage(raw)) {
return "model_not_found";
}
if (isTransientHttpError(raw)) {
// Treat transient 5xx provider failures as retryable transport issues.
return "timeout";
}
if (isJsonApiInternalServerError(raw)) {
return "timeout";
}
if (isRateLimitErrorMessage(raw)) {
return "rate_limit";
}
if (isOverloadedErrorMessage(raw)) {
return "rate_limit";
}
if (isCloudCodeAssistFormatError(raw)) {
return "format";
}
if (isBillingErrorMessage(raw)) {
return "billing";
}
if (isTimeoutErrorMessage(raw)) {
return "timeout";
}
if (isAuthErrorMessage(raw)) {
return "auth";
}
2026-01-14 01:08:15 +00:00
return null;
}
/** True when the error text maps to any failover-eligible reason. */
export function isFailoverErrorMessage(raw: string): boolean {
  const reason = classifyFailoverReason(raw);
  return reason !== null;
}
export function isFailoverAssistantError(msg: AssistantMessage | undefined): boolean {
if (!msg || msg.stopReason !== "error") {
return false;
}
2026-01-14 01:08:15 +00:00
return isFailoverErrorMessage(msg.errorMessage ?? "");
}