From fc07dee37e95e2cb5f74ab4ee364066edbb4ff70 Mon Sep 17 00:00:00 2001 From: Altay Date: Fri, 6 Mar 2026 15:27:57 +0300 Subject: [PATCH] fix(agents): classify overloaded failures separately --- CHANGELOG.md | 2 +- src/agents/failover-error.test.ts | 22 +++++++++++++---- src/agents/failover-error.ts | 2 ++ ...dded-helpers.isbillingerrormessage.test.ts | 16 ++++++------- src/agents/pi-embedded-helpers/errors.ts | 24 +++++++++---------- src/agents/pi-embedded-helpers/types.ts | 1 + src/agents/pi-embedded-runner/run.ts | 22 ++++------------- 7 files changed, 45 insertions(+), 44 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9e43d7d2b42..3c17ef9c1c0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -118,7 +118,7 @@ Docs: https://docs.openclaw.ai - Security/auth labels: remove token and API-key snippets from user-facing auth status labels so `/status` and `/models` do not expose credential fragments. (#33262) thanks @cu1ch3n. - Auth/credential semantics: align profile eligibility + probe diagnostics with SecretRef/expiry rules and harden browser download atomic writes. (#33733) thanks @joshavant. - Security/audit denyCommands guidance: suggest likely exact node command IDs for unknown `gateway.nodes.denyCommands` entries so ineffective denylist entries are easier to correct. (#29713) thanks @liquidhorizon88-bot. -- Agents/overload auth-profile state: keep overloaded prompt/assistant failures failover-eligible without recording auth-profile failure state, so transient provider overloads do not poison later profile selection on the same provider. +- Agents/overload failover handling: classify overloaded provider failures separately from rate limits/status timeouts and keep overloaded prompt/assistant failures failover-eligible without recording auth-profile failure state, so transient provider overloads do not poison later profile selection on the same provider. - Docs/security hardening guidance: document Docker `DOCKER-USER` + UFW policy and add cross-linking from Docker install docs for VPS/public-host setups. (#27613) thanks @dorukardahan. - Docs/security threat-model links: replace relative `.md` links with Mintlify-compatible root-relative routes in security docs to prevent broken internal navigation. (#27698) thanks @clawdoo. - Plugins/Update integrity drift: avoid false integrity drift prompts when updating npm-installed plugins from unpinned specs, while keeping drift checks for exact pinned versions. (#37179) Thanks @vincentkoc. diff --git a/src/agents/failover-error.test.ts b/src/agents/failover-error.test.ts index 60e7510e67e..828a64cb749 100644 --- a/src/agents/failover-error.test.ts +++ b/src/agents/failover-error.test.ts @@ -69,13 +69,13 @@ describe("failover-error", () => { // Keep the status-only path behavior-preserving and conservative. expect(resolveFailoverReasonFromError({ status: 500 })).toBeNull(); expect(resolveFailoverReasonFromError({ status: 502 })).toBe("timeout"); - expect(resolveFailoverReasonFromError({ status: 503 })).toBe("timeout"); + expect(resolveFailoverReasonFromError({ status: 503 })).toBe("overloaded"); expect(resolveFailoverReasonFromError({ status: 504 })).toBe("timeout"); expect(resolveFailoverReasonFromError({ status: 521 })).toBeNull(); expect(resolveFailoverReasonFromError({ status: 522 })).toBeNull(); expect(resolveFailoverReasonFromError({ status: 523 })).toBeNull(); expect(resolveFailoverReasonFromError({ status: 524 })).toBeNull(); - expect(resolveFailoverReasonFromError({ status: 529 })).toBe("rate_limit"); + expect(resolveFailoverReasonFromError({ status: 529 })).toBe("overloaded"); }); it("classifies documented provider error shapes at the error boundary", () => { @@ -90,7 +90,7 @@ describe("failover-error", () => { status: 529, message: ANTHROPIC_OVERLOADED_PAYLOAD, }), - ).toBe("rate_limit"); + ).toBe("overloaded"); expect( resolveFailoverReasonFromError({ status: 429, @@ -114,7 +114,7 @@ describe("failover-error", () => { status: 503, message: BEDROCK_SERVICE_UNAVAILABLE_MESSAGE, }), - ).toBe("timeout"); + ).toBe("overloaded"); expect( resolveFailoverReasonFromError({ status: 429, @@ -126,7 +126,7 @@ describe("failover-error", () => { status: 503, message: GROQ_SERVICE_UNAVAILABLE_MESSAGE, }), - ).toBe("timeout"); + ).toBe("overloaded"); }); it("treats 400 insufficient_quota payloads as billing instead of format", () => { @@ -151,6 +151,14 @@ describe("failover-error", () => { ).toBe("rate_limit"); }); + it("treats overloaded provider payloads as overloaded", () => { + expect( + resolveFailoverReasonFromError({ + message: ANTHROPIC_OVERLOADED_PAYLOAD, + }), + ).toBe("overloaded"); + }); + it("keeps raw-text 402 weekly/monthly limit errors in billing", () => { expect( resolveFailoverReasonFromError({ @@ -221,6 +229,10 @@ describe("failover-error", () => { expect(err?.model).toBe("claude-opus-4-5"); }); + it("maps overloaded to a 503 fallback status", () => { + expect(resolveFailoverStatus("overloaded")).toBe(503); + }); + it("coerces format errors with a 400 status", () => { const err = coerceToFailoverError("invalid request format", { provider: "google", diff --git a/src/agents/failover-error.ts b/src/agents/failover-error.ts index 5c16d3508fd..a39685e1b16 100644 --- a/src/agents/failover-error.ts +++ b/src/agents/failover-error.ts @@ -49,6 +49,8 @@ export function resolveFailoverStatus(reason: FailoverReason): number | undefine return 402; case "rate_limit": return 429; + case "overloaded": + return 503; case "auth": return 401; case "auth_permanent": diff --git a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts index 9eb2657158b..f3b0c630061 100644 --- a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts +++ b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts @@ -509,12 +509,12 @@ describe("classifyFailoverReason", () => { it("classifies documented provider error messages", () => { expect(classifyFailoverReason(OPENAI_RATE_LIMIT_MESSAGE)).toBe("rate_limit"); expect(classifyFailoverReason(GEMINI_RESOURCE_EXHAUSTED_MESSAGE)).toBe("rate_limit"); - expect(classifyFailoverReason(ANTHROPIC_OVERLOADED_PAYLOAD)).toBe("rate_limit"); + expect(classifyFailoverReason(ANTHROPIC_OVERLOADED_PAYLOAD)).toBe("overloaded"); expect(classifyFailoverReason(OPENROUTER_CREDITS_MESSAGE)).toBe("billing"); expect(classifyFailoverReason(TOGETHER_PAYMENT_REQUIRED_MESSAGE)).toBe("billing"); - expect(classifyFailoverReason(TOGETHER_ENGINE_OVERLOADED_MESSAGE)).toBe("timeout"); + expect(classifyFailoverReason(TOGETHER_ENGINE_OVERLOADED_MESSAGE)).toBe("overloaded"); expect(classifyFailoverReason(GROQ_TOO_MANY_REQUESTS_MESSAGE)).toBe("rate_limit"); - expect(classifyFailoverReason(GROQ_SERVICE_UNAVAILABLE_MESSAGE)).toBe("timeout"); + expect(classifyFailoverReason(GROQ_SERVICE_UNAVAILABLE_MESSAGE)).toBe("overloaded"); }); it("classifies internal and compatibility error messages", () => { @@ -572,20 +572,20 @@ describe("classifyFailoverReason", () => { "rate_limit", ); }); - it("classifies provider high-demand / service-unavailable messages as rate_limit", () => { + it("classifies provider high-demand / service-unavailable messages as overloaded", () => { expect( classifyFailoverReason( "This model is currently experiencing high demand. Please try again later.", ), - ).toBe("rate_limit"); - // "service unavailable" combined with overload/capacity indicator → rate_limit + ).toBe("overloaded"); + // "service unavailable" combined with overload/capacity indicator → overloaded // (exercises the new regex — none of the standalone patterns match here) - expect(classifyFailoverReason("service unavailable due to capacity limits")).toBe("rate_limit"); + expect(classifyFailoverReason("service unavailable due to capacity limits")).toBe("overloaded"); expect( classifyFailoverReason( '{"error":{"code":503,"message":"The model is overloaded. Please try later","status":"UNAVAILABLE"}}', ), - ).toBe("rate_limit"); + ).toBe("overloaded"); }); it("classifies bare 'service unavailable' as timeout instead of rate_limit (#32828)", () => { // A generic "service unavailable" from a proxy/CDN should stay retryable, diff --git a/src/agents/pi-embedded-helpers/errors.ts b/src/agents/pi-embedded-helpers/errors.ts index e7cd440d779..5efe875553b 100644 --- a/src/agents/pi-embedded-helpers/errors.ts +++ b/src/agents/pi-embedded-helpers/errors.ts @@ -293,13 +293,11 @@ export function classifyFailoverReasonFromHttpStatus( if (status === 408) { return "timeout"; } - // Keep the status-only path conservative and behavior-preserving. - // Message-path HTTP heuristics are broader and should not leak in here. - if (status === 502 || status === 503 || status === 504) { + if (status === 502 || status === 504) { return "timeout"; } - if (status === 529) { - return "rate_limit"; + if (status === 503 || status === 529) { + return "overloaded"; } if (status === 400) { // Some providers return quota/balance errors under HTTP 400, so do not @@ -854,13 +852,6 @@ export function classifyFailoverReason(raw: string): FailoverReason | null { if (isModelNotFoundErrorMessage(raw)) { return "model_not_found"; } - if (isTransientHttpError(raw)) { - // Treat transient 5xx provider failures as retryable transport issues. - return "timeout"; - } - if (isJsonApiInternalServerError(raw)) { - return "timeout"; - } if (isPeriodicUsageLimitErrorMessage(raw)) { return isBillingErrorMessage(raw) ? "billing" : "rate_limit"; } @@ -868,7 +859,14 @@ export function classifyFailoverReason(raw: string): FailoverReason | null { return "rate_limit"; } if (isOverloadedErrorMessage(raw)) { - return "rate_limit"; + return "overloaded"; + } + if (isTransientHttpError(raw)) { + // Treat transient 5xx provider failures as retryable transport issues. + return "timeout"; + } + if (isJsonApiInternalServerError(raw)) { + return "timeout"; } if (isCloudCodeAssistFormatError(raw)) { return "format"; diff --git a/src/agents/pi-embedded-helpers/types.ts b/src/agents/pi-embedded-helpers/types.ts index 86ee1c4cda1..5ae47d672d3 100644 --- a/src/agents/pi-embedded-helpers/types.ts +++ b/src/agents/pi-embedded-helpers/types.ts @@ -5,6 +5,7 @@ export type FailoverReason = | "auth_permanent" | "format" | "rate_limit" + | "overloaded" | "billing" | "timeout" | "model_not_found" diff --git a/src/agents/pi-embedded-runner/run.ts b/src/agents/pi-embedded-runner/run.ts index a36410f033c..1e11632d3eb 100644 --- a/src/agents/pi-embedded-runner/run.ts +++ b/src/agents/pi-embedded-runner/run.ts @@ -42,7 +42,6 @@ import { isLikelyContextOverflowError, isFailoverAssistantError, isFailoverErrorMessage, - isOverloadedErrorMessage, parseImageSizeError, parseImageDimensionError, isRateLimitAssistantError, @@ -740,16 +739,9 @@ export async function runEmbeddedPiAgent( }); }; const resolveAuthProfileFailureReason = ( - errorText: string, failoverReason: FailoverReason | null, ): AuthProfileFailureReason | null => { - if (!failoverReason || failoverReason === "timeout") { - return null; - } - // Overloaded provider responses currently stay on the rate_limit failover lane - // so existing retry/failover behavior keeps working, but they should not - // be recorded as auth-profile failures. - if (failoverReason === "rate_limit" && isOverloadedErrorMessage(errorText)) { + if (!failoverReason || failoverReason === "timeout" || failoverReason === "overloaded") { return null; } return failoverReason; @@ -1162,10 +1154,8 @@ export async function runEmbeddedPiAgent( }; } const promptFailoverReason = classifyFailoverReason(errorText); - const promptProfileFailureReason = resolveAuthProfileFailureReason( - errorText, - promptFailoverReason, - ); + const promptProfileFailureReason = + resolveAuthProfileFailureReason(promptFailoverReason); await maybeMarkAuthProfileFailure({ profileId: lastProfileId, reason: promptProfileFailureReason, @@ -1219,10 +1209,8 @@ export async function runEmbeddedPiAgent( const billingFailure = isBillingAssistantError(lastAssistant); const failoverFailure = isFailoverAssistantError(lastAssistant); const assistantFailoverReason = classifyFailoverReason(lastAssistant?.errorMessage ?? ""); - const assistantProfileFailureReason = resolveAuthProfileFailureReason( - lastAssistant?.errorMessage ?? "", - assistantFailoverReason, - ); + const assistantProfileFailureReason = + resolveAuthProfileFailureReason(assistantFailoverReason); const cloudCodeAssistFormatError = attempt.cloudCodeAssistFormatError; const imageDimensionError = parseImageDimensionError(lastAssistant?.errorMessage ?? "");