From fc07dee37e95e2cb5f74ab4ee364066edbb4ff70 Mon Sep 17 00:00:00 2001
From: Altay <altay@hey.com>
Date: Fri, 6 Mar 2026 15:27:57 +0300
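Subject: [PATCH] fix(agents): classify overloaded failures separately

Add an "overloaded" FailoverReason so provider overload responses are no
longer folded into "rate_limit" or "timeout":

- Status-only HTTP 503 and 529 now classify as "overloaded" (previously
  "timeout" and "rate_limit" respectively); 502 and 504 stay "timeout".
- classifyFailoverReason returns "overloaded" for overloaded messages, and
  the transient-HTTP / JSON internal-server-error "timeout" heuristics now
  run after the usage-limit, rate-limit and overloaded checks.
- resolveFailoverStatus maps "overloaded" to a 503 fallback status.
- The embedded runner derives the auth-profile failure reason from the
  failover reason alone: "overloaded" (like "timeout") yields no
  auth-profile failure, so the error text no longer has to be re-matched.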
---
 CHANGELOG.md                                  |  2 +-
 src/agents/failover-error.test.ts             | 22 +++++++++++++----
 src/agents/failover-error.ts                  |  2 ++
 ...dded-helpers.isbillingerrormessage.test.ts | 16 ++++++-------
 src/agents/pi-embedded-helpers/errors.ts      | 24 +++++++++----------
 src/agents/pi-embedded-helpers/types.ts       |  1 +
 src/agents/pi-embedded-runner/run.ts          | 22 ++++-------------
 7 files changed, 45 insertions(+), 44 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9e43d7d2b42..3c17ef9c1c0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -118,7 +118,7 @@ Docs: https://docs.openclaw.ai
 - Security/auth labels: remove token and API-key snippets from user-facing auth status labels so `/status` and `/models` do not expose credential fragments. (#33262) thanks @cu1ch3n.
 - Auth/credential semantics: align profile eligibility + probe diagnostics with SecretRef/expiry rules and harden browser download atomic writes. (#33733) thanks @joshavant.
 - Security/audit denyCommands guidance: suggest likely exact node command IDs for unknown `gateway.nodes.denyCommands` entries so ineffective denylist entries are easier to correct. (#29713) thanks @liquidhorizon88-bot.
-- Agents/overload auth-profile state: keep overloaded prompt/assistant failures failover-eligible without recording auth-profile failure state, so transient provider overloads do not poison later profile selection on the same provider.
+- Agents/overload failover handling: classify overloaded provider failures (status-only HTTP 503/529 and overloaded/high-demand messages) under a dedicated `overloaded` failover reason instead of `rate_limit`/`timeout`, and keep overloaded prompt/assistant failures failover-eligible without recording auth-profile failure state, so transient provider overloads do not poison later profile selection on the same provider.
 - Docs/security hardening guidance: document Docker `DOCKER-USER` + UFW policy and add cross-linking from Docker install docs for VPS/public-host setups. (#27613) thanks @dorukardahan.
 - Docs/security threat-model links: replace relative `.md` links with Mintlify-compatible root-relative routes in security docs to prevent broken internal navigation. (#27698) thanks @clawdoo.
 - Plugins/Update integrity drift: avoid false integrity drift prompts when updating npm-installed plugins from unpinned specs, while keeping drift checks for exact pinned versions. (#37179) Thanks @vincentkoc.
diff --git a/src/agents/failover-error.test.ts b/src/agents/failover-error.test.ts
index 60e7510e67e..828a64cb749 100644
--- a/src/agents/failover-error.test.ts
+++ b/src/agents/failover-error.test.ts
@@ -69,13 +69,13 @@ describe("failover-error", () => {
     // Keep the status-only path behavior-preserving and conservative.
     expect(resolveFailoverReasonFromError({ status: 500 })).toBeNull();
     expect(resolveFailoverReasonFromError({ status: 502 })).toBe("timeout");
-    expect(resolveFailoverReasonFromError({ status: 503 })).toBe("timeout");
+    expect(resolveFailoverReasonFromError({ status: 503 })).toBe("overloaded");
     expect(resolveFailoverReasonFromError({ status: 504 })).toBe("timeout");
     expect(resolveFailoverReasonFromError({ status: 521 })).toBeNull();
     expect(resolveFailoverReasonFromError({ status: 522 })).toBeNull();
     expect(resolveFailoverReasonFromError({ status: 523 })).toBeNull();
     expect(resolveFailoverReasonFromError({ status: 524 })).toBeNull();
-    expect(resolveFailoverReasonFromError({ status: 529 })).toBe("rate_limit");
+    expect(resolveFailoverReasonFromError({ status: 529 })).toBe("overloaded");
   });
 
   it("classifies documented provider error shapes at the error boundary", () => {
@@ -90,7 +90,7 @@ describe("failover-error", () => {
         status: 529,
         message: ANTHROPIC_OVERLOADED_PAYLOAD,
       }),
-    ).toBe("rate_limit");
+    ).toBe("overloaded");
     expect(
       resolveFailoverReasonFromError({
         status: 429,
@@ -114,7 +114,7 @@ describe("failover-error", () => {
         status: 503,
         message: BEDROCK_SERVICE_UNAVAILABLE_MESSAGE,
       }),
-    ).toBe("timeout");
+    ).toBe("overloaded");
     expect(
       resolveFailoverReasonFromError({
         status: 429,
@@ -126,7 +126,7 @@ describe("failover-error", () => {
         status: 503,
         message: GROQ_SERVICE_UNAVAILABLE_MESSAGE,
       }),
-    ).toBe("timeout");
+    ).toBe("overloaded");
   });
 
   it("treats 400 insufficient_quota payloads as billing instead of format", () => {
@@ -151,6 +151,14 @@ describe("failover-error", () => {
     ).toBe("rate_limit");
   });
 
+  it("treats overloaded provider payloads as overloaded", () => {
+    expect(
+      resolveFailoverReasonFromError({
+        message: ANTHROPIC_OVERLOADED_PAYLOAD,
+      }),
+    ).toBe("overloaded");
+  });
+
   it("keeps raw-text 402 weekly/monthly limit errors in billing", () => {
     expect(
       resolveFailoverReasonFromError({
@@ -221,6 +229,10 @@ describe("failover-error", () => {
     expect(err?.model).toBe("claude-opus-4-5");
   });
 
+  it("maps overloaded to a 503 fallback status", () => {
+    expect(resolveFailoverStatus("overloaded")).toBe(503);
+  });
+
   it("coerces format errors with a 400 status", () => {
     const err = coerceToFailoverError("invalid request format", {
       provider: "google",
diff --git a/src/agents/failover-error.ts b/src/agents/failover-error.ts
index 5c16d3508fd..a39685e1b16 100644
--- a/src/agents/failover-error.ts
+++ b/src/agents/failover-error.ts
@@ -49,6 +49,8 @@ export function resolveFailoverStatus(reason: FailoverReason): number | undefine
       return 402;
     case "rate_limit":
       return 429;
+    case "overloaded":
+      return 503;
     case "auth":
       return 401;
     case "auth_permanent":
diff --git a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts
index 9eb2657158b..f3b0c630061 100644
--- a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts
+++ b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts
@@ -509,12 +509,12 @@ describe("classifyFailoverReason", () => {
   it("classifies documented provider error messages", () => {
     expect(classifyFailoverReason(OPENAI_RATE_LIMIT_MESSAGE)).toBe("rate_limit");
     expect(classifyFailoverReason(GEMINI_RESOURCE_EXHAUSTED_MESSAGE)).toBe("rate_limit");
-    expect(classifyFailoverReason(ANTHROPIC_OVERLOADED_PAYLOAD)).toBe("rate_limit");
+    expect(classifyFailoverReason(ANTHROPIC_OVERLOADED_PAYLOAD)).toBe("overloaded");
     expect(classifyFailoverReason(OPENROUTER_CREDITS_MESSAGE)).toBe("billing");
     expect(classifyFailoverReason(TOGETHER_PAYMENT_REQUIRED_MESSAGE)).toBe("billing");
-    expect(classifyFailoverReason(TOGETHER_ENGINE_OVERLOADED_MESSAGE)).toBe("timeout");
+    expect(classifyFailoverReason(TOGETHER_ENGINE_OVERLOADED_MESSAGE)).toBe("overloaded");
     expect(classifyFailoverReason(GROQ_TOO_MANY_REQUESTS_MESSAGE)).toBe("rate_limit");
-    expect(classifyFailoverReason(GROQ_SERVICE_UNAVAILABLE_MESSAGE)).toBe("timeout");
+    expect(classifyFailoverReason(GROQ_SERVICE_UNAVAILABLE_MESSAGE)).toBe("overloaded");
   });
 
   it("classifies internal and compatibility error messages", () => {
@@ -572,20 +572,20 @@ describe("classifyFailoverReason", () => {
       "rate_limit",
     );
   });
-  it("classifies provider high-demand / service-unavailable messages as rate_limit", () => {
+  it("classifies provider high-demand / service-unavailable messages as overloaded", () => {
     expect(
       classifyFailoverReason(
         "This model is currently experiencing high demand. Please try again later.",
       ),
-    ).toBe("rate_limit");
-    // "service unavailable" combined with overload/capacity indicator → rate_limit
+    ).toBe("overloaded");
+    // "service unavailable" combined with overload/capacity indicator → overloaded
     // (exercises the new regex — none of the standalone patterns match here)
-    expect(classifyFailoverReason("service unavailable due to capacity limits")).toBe("rate_limit");
+    expect(classifyFailoverReason("service unavailable due to capacity limits")).toBe("overloaded");
     expect(
       classifyFailoverReason(
         '{"error":{"code":503,"message":"The model is overloaded. Please try later","status":"UNAVAILABLE"}}',
       ),
-    ).toBe("rate_limit");
+    ).toBe("overloaded");
   });
   it("classifies bare 'service unavailable' as timeout instead of rate_limit (#32828)", () => {
     // A generic "service unavailable" from a proxy/CDN should stay retryable,
diff --git a/src/agents/pi-embedded-helpers/errors.ts b/src/agents/pi-embedded-helpers/errors.ts
index e7cd440d779..5efe875553b 100644
--- a/src/agents/pi-embedded-helpers/errors.ts
+++ b/src/agents/pi-embedded-helpers/errors.ts
@@ -293,13 +293,11 @@ export function classifyFailoverReasonFromHttpStatus(
   if (status === 408) {
     return "timeout";
   }
-  // Keep the status-only path conservative and behavior-preserving.
-  // Message-path HTTP heuristics are broader and should not leak in here.
-  if (status === 502 || status === 503 || status === 504) {
+  if (status === 502 || status === 504) {
     return "timeout";
   }
-  if (status === 529) {
-    return "rate_limit";
+  if (status === 503 || status === 529) {
+    return "overloaded";
   }
   if (status === 400) {
     // Some providers return quota/balance errors under HTTP 400, so do not
@@ -854,13 +852,6 @@ export function classifyFailoverReason(raw: string): FailoverReason | null {
   if (isModelNotFoundErrorMessage(raw)) {
     return "model_not_found";
   }
-  if (isTransientHttpError(raw)) {
-    // Treat transient 5xx provider failures as retryable transport issues.
-    return "timeout";
-  }
-  if (isJsonApiInternalServerError(raw)) {
-    return "timeout";
-  }
   if (isPeriodicUsageLimitErrorMessage(raw)) {
     return isBillingErrorMessage(raw) ? "billing" : "rate_limit";
   }
@@ -868,7 +859,14 @@ export function classifyFailoverReason(raw: string): FailoverReason | null {
     return "rate_limit";
   }
   if (isOverloadedErrorMessage(raw)) {
-    return "rate_limit";
+    return "overloaded";
+  }
+  if (isTransientHttpError(raw)) {
+    // Treat transient 5xx provider failures as retryable transport issues.
+    return "timeout";
+  }
+  if (isJsonApiInternalServerError(raw)) {
+    return "timeout";
   }
   if (isCloudCodeAssistFormatError(raw)) {
     return "format";
diff --git a/src/agents/pi-embedded-helpers/types.ts b/src/agents/pi-embedded-helpers/types.ts
index 86ee1c4cda1..5ae47d672d3 100644
--- a/src/agents/pi-embedded-helpers/types.ts
+++ b/src/agents/pi-embedded-helpers/types.ts
@@ -5,6 +5,7 @@ export type FailoverReason =
   | "auth_permanent"
   | "format"
   | "rate_limit"
+  | "overloaded"
   | "billing"
   | "timeout"
   | "model_not_found"
diff --git a/src/agents/pi-embedded-runner/run.ts b/src/agents/pi-embedded-runner/run.ts
index a36410f033c..1e11632d3eb 100644
--- a/src/agents/pi-embedded-runner/run.ts
+++ b/src/agents/pi-embedded-runner/run.ts
@@ -42,7 +42,6 @@ import {
   isLikelyContextOverflowError,
   isFailoverAssistantError,
   isFailoverErrorMessage,
-  isOverloadedErrorMessage,
   parseImageSizeError,
   parseImageDimensionError,
   isRateLimitAssistantError,
@@ -740,16 +739,9 @@ export async function runEmbeddedPiAgent(
         });
       };
       const resolveAuthProfileFailureReason = (
-        errorText: string,
         failoverReason: FailoverReason | null,
       ): AuthProfileFailureReason | null => {
-        if (!failoverReason || failoverReason === "timeout") {
-          return null;
-        }
-        // Overloaded provider responses currently stay on the rate_limit failover lane
-        // so existing retry/failover behavior keeps working, but they should not
-        // be recorded as auth-profile failures.
-        if (failoverReason === "rate_limit" && isOverloadedErrorMessage(errorText)) {
+        if (!failoverReason || failoverReason === "timeout" || failoverReason === "overloaded") {
           return null;
         }
         return failoverReason;
@@ -1162,10 +1154,8 @@ export async function runEmbeddedPiAgent(
               };
             }
             const promptFailoverReason = classifyFailoverReason(errorText);
-            const promptProfileFailureReason = resolveAuthProfileFailureReason(
-              errorText,
-              promptFailoverReason,
-            );
+            const promptProfileFailureReason =
+              resolveAuthProfileFailureReason(promptFailoverReason);
             await maybeMarkAuthProfileFailure({
               profileId: lastProfileId,
               reason: promptProfileFailureReason,
@@ -1219,10 +1209,8 @@ export async function runEmbeddedPiAgent(
           const billingFailure = isBillingAssistantError(lastAssistant);
           const failoverFailure = isFailoverAssistantError(lastAssistant);
           const assistantFailoverReason = classifyFailoverReason(lastAssistant?.errorMessage ?? "");
-          const assistantProfileFailureReason = resolveAuthProfileFailureReason(
-            lastAssistant?.errorMessage ?? "",
-            assistantFailoverReason,
-          );
+          const assistantProfileFailureReason =
+            resolveAuthProfileFailureReason(assistantFailoverReason);
           const cloudCodeAssistFormatError = attempt.cloudCodeAssistFormatError;
           const imageDimensionError = parseImageDimensionError(lastAssistant?.errorMessage ?? "");