openclaw/src/agents/failover-error.test.ts
AI南柯(KingMo) 30ab9b2068
fix(agents): recognize connection errors as retryable timeout failures (#31697)
* fix(agents): recognize connection errors as retryable timeout failures

## Problem

When a model endpoint becomes unreachable (e.g., local proxy down,
relay server offline), the failover system fails to switch to the
next candidate model. Errors like "Connection error." are not
classified as retryable, causing the session to hang on a broken
endpoint instead of falling back to healthy alternatives.

## Root Cause

The current failover classifier does not recognize connection/network
failures when they appear only as text in the error message:
- Text patterns like "Connection error.", "fetch failed", "network error"
- Error codes like ECONNREFUSED, ENOTFOUND, EAI_AGAIN (in message text)

While `failover-error.ts` handles these as error codes (err.code),
it misses them when they appear as plain text in error messages.

## Solution

Extend timeout error patterns to include connection/network failures:

**In `errors.ts` (ERROR_PATTERNS.timeout):**
- Text: "connection error", "network error", "fetch failed", etc.
- Regex: /\beconn(?:refused|reset|aborted)\b/i, /\benotfound\b/i, /\beai_again\b/i

**In `failover-error.ts` (TIMEOUT_HINT_RE):**
- Same patterns for non-assistant error paths

## Testing

Added test cases covering:
- "Connection error."
- "fetch failed"
- "network error: ECONNREFUSED"
- "ENOTFOUND" / "EAI_AGAIN" in message text

## Impact

- **Compatibility:** High - only expands retryable error detection
- **Behavior:** Connection failures now trigger automatic fallback
- **Risk:** Low - changes are additive and well-tested

* style: fix code formatting for test file
2026-03-03 02:37:23 +00:00

158 lines
6.1 KiB
TypeScript

import { describe, expect, it } from "vitest";
import {
coerceToFailoverError,
describeFailoverError,
isTimeoutError,
resolveFailoverReasonFromError,
resolveFailoverStatus,
} from "./failover-error.js";
/**
 * Test suite for the helpers exported by "./failover-error.js".
 *
 * Covers how error-like values (HTTP status, node error code, message text)
 * are mapped to a failover reason, how a reason maps back to a status, and how
 * raw errors are coerced into / described as failover errors.
 */
describe("failover-error", () => {
  it("infers failover reason from HTTP status", () => {
    // Each row pairs an error-like input with the reason the resolver must return.
    const statusCases = [
      { err: { status: 402 }, reason: "billing" },
      { err: { statusCode: "429" }, reason: "rate_limit" },
      { err: { status: 403 }, reason: "auth" },
      { err: { status: 408 }, reason: "timeout" },
      { err: { status: 400 }, reason: "format" },
      // Transient server errors (502/503/504) are expected to fail over as timeout.
      { err: { status: 502 }, reason: "timeout" },
      { err: { status: 503 }, reason: "timeout" },
      { err: { status: 504 }, reason: "timeout" },
      // Anthropic 529 (overloaded) is expected to fail over as rate_limit.
      { err: { status: 529 }, reason: "rate_limit" },
    ];
    for (const { err, reason } of statusCases) {
      expect(resolveFailoverReasonFromError(err)).toBe(reason);
    }
  });

  it("infers format errors from error messages", () => {
    const malformed = { message: "invalid request format: messages.1.content.1.tool_use.id" };
    expect(resolveFailoverReasonFromError(malformed)).toBe("format");
  });

  it("infers timeout from common node error codes", () => {
    for (const code of ["ETIMEDOUT", "ECONNRESET"]) {
      expect(resolveFailoverReasonFromError({ code })).toBe("timeout");
    }
  });

  it("infers timeout from abort/error stop-reason messages", () => {
    // Same phrase with progressively shorter prefixes; all must resolve to timeout.
    const stopReasonMessages = [
      "Unhandled stop reason: abort",
      "Unhandled stop reason: error",
      "stop reason: abort",
      "stop reason: error",
      "reason: abort",
      "reason: error",
    ];
    for (const message of stopReasonMessages) {
      expect(resolveFailoverReasonFromError({ message })).toBe("timeout");
    }
  });

  it("infers timeout from connection/network error messages", () => {
    // Connection failures that show up only as message text (no `code` field).
    const networkMessages = [
      "Connection error.",
      "fetch failed",
      "Network error: ECONNREFUSED",
      "dial tcp: lookup api.example.com: no such host (ENOTFOUND)",
      "temporary dns failure EAI_AGAIN",
    ];
    for (const message of networkMessages) {
      expect(resolveFailoverReasonFromError({ message })).toBe("timeout");
    }
  });

  it("treats AbortError reason=abort as timeout", () => {
    const abortErr = new Error("aborted");
    // Decorate the plain Error in place so it carries an AbortError name and a reason.
    Object.assign(abortErr, { name: "AbortError", reason: "reason: abort" });
    expect(isTimeoutError(abortErr)).toBe(true);
  });

  it("coerces failover-worthy errors into FailoverError with metadata", () => {
    const target = { provider: "anthropic", model: "claude-opus-4-5" };
    const failover = coerceToFailoverError("credit balance too low", target);
    expect(failover?.name).toBe("FailoverError");
    expect(failover?.reason).toBe("billing");
    expect(failover?.status).toBe(402);
    expect(failover?.provider).toBe("anthropic");
    expect(failover?.model).toBe("claude-opus-4-5");
  });

  it("coerces format errors with a 400 status", () => {
    const target = { provider: "google", model: "cloud-code-assist" };
    const failover = coerceToFailoverError("invalid request format", target);
    expect(failover?.reason).toBe("format");
    expect(failover?.status).toBe(400);
  });

  it("401/403 with generic message still returns auth (backward compat)", () => {
    const genericAuthErrors = [
      { status: 401, message: "Unauthorized" },
      { status: 403, message: "Forbidden" },
    ];
    for (const authErr of genericAuthErrors) {
      expect(resolveFailoverReasonFromError(authErr)).toBe("auth");
    }
  });

  it("401 with permanent auth message returns auth_permanent", () => {
    const invalidKey = { status: 401, message: "invalid_api_key" };
    expect(resolveFailoverReasonFromError(invalidKey)).toBe("auth_permanent");
  });

  it("403 with revoked key message returns auth_permanent", () => {
    const revokedKey = { status: 403, message: "api key revoked" };
    expect(resolveFailoverReasonFromError(revokedKey)).toBe("auth_permanent");
  });

  it("resolveFailoverStatus maps auth_permanent to 403", () => {
    const status = resolveFailoverStatus("auth_permanent");
    expect(status).toBe(403);
  });

  it("coerces permanent auth error with correct reason", () => {
    const rawError = { status: 401, message: "invalid_api_key" };
    const target = { provider: "anthropic", model: "claude-opus-4-6" };
    const failover = coerceToFailoverError(rawError, target);
    expect(failover?.reason).toBe("auth_permanent");
    expect(failover?.provider).toBe("anthropic");
  });

  it("403 permission_error returns auth_permanent", () => {
    const orgBlocked = {
      status: 403,
      message:
        "permission_error: OAuth authentication is currently not allowed for this organization.",
    };
    expect(resolveFailoverReasonFromError(orgBlocked)).toBe("auth_permanent");
  });

  it("permission_error in error message string classifies as auth_permanent", () => {
    // Here the error is a bare string rather than an object with a status field.
    const rawMessage =
      "HTTP 403 permission_error: OAuth authentication is currently not allowed for this organization.";
    const failover = coerceToFailoverError(rawMessage, {
      provider: "anthropic",
      model: "claude-opus-4-6",
    });
    expect(failover?.reason).toBe("auth_permanent");
  });

  it("'not allowed for this organization' classifies as auth_permanent", () => {
    const rawMessage = "OAuth authentication is currently not allowed for this organization";
    const failover = coerceToFailoverError(rawMessage, {
      provider: "anthropic",
      model: "claude-opus-4-6",
    });
    expect(failover?.reason).toBe("auth_permanent");
  });

  it("describes non-Error values consistently", () => {
    // A bare number is stringified for the message and yields no failover reason.
    const { message, reason } = describeFailoverError(123);
    expect(message).toBe("123");
    expect(reason).toBeUndefined();
  });
});