fix(agents): classify overloaded failures separately

2026-03-06 15:27:57 +03:00 · 2026-03-06 15:27:57 +03:00 · fc07dee37e
commit fc07dee37e
parent 89240e1226
7 changed files with 45 additions and 44 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -118,7 +118,7 @@ Docs: https://docs.openclaw.ai
 - Security/auth labels: remove token and API-key snippets from user-facing auth status labels so `/status` and `/models` do not expose credential fragments. (#33262) thanks @cu1ch3n.
 - Auth/credential semantics: align profile eligibility + probe diagnostics with SecretRef/expiry rules and harden browser download atomic writes. (#33733) thanks @joshavant.
 - Security/audit denyCommands guidance: suggest likely exact node command IDs for unknown `gateway.nodes.denyCommands` entries so ineffective denylist entries are easier to correct. (#29713) thanks @liquidhorizon88-bot.
-- Agents/overload auth-profile state: keep overloaded prompt/assistant failures failover-eligible without recording auth-profile failure state, so transient provider overloads do not poison later profile selection on the same provider.
+- Agents/overload failover handling: classify overloaded provider failures separately from rate limits/status timeouts and keep overloaded prompt/assistant failures failover-eligible without recording auth-profile failure state, so transient provider overloads do not poison later profile selection on the same provider.
 - Docs/security hardening guidance: document Docker `DOCKER-USER` + UFW policy and add cross-linking from Docker install docs for VPS/public-host setups. (#27613) thanks @dorukardahan.
 - Docs/security threat-model links: replace relative `.md` links with Mintlify-compatible root-relative routes in security docs to prevent broken internal navigation. (#27698) thanks @clawdoo.
 - Plugins/Update integrity drift: avoid false integrity drift prompts when updating npm-installed plugins from unpinned specs, while keeping drift checks for exact pinned versions. (#37179) Thanks @vincentkoc.
--- a/src/agents/failover-error.test.ts
+++ b/src/agents/failover-error.test.ts
@@ -69,13 +69,13 @@ describe("failover-error", () => {
    // Keep the status-only path behavior-preserving and conservative.
    expect(resolveFailoverReasonFromError({ status: 500 })).toBeNull();
    expect(resolveFailoverReasonFromError({ status: 502 })).toBe("timeout");
-    expect(resolveFailoverReasonFromError({ status: 503 })).toBe("timeout");
+    expect(resolveFailoverReasonFromError({ status: 503 })).toBe("overloaded");
    expect(resolveFailoverReasonFromError({ status: 504 })).toBe("timeout");
    expect(resolveFailoverReasonFromError({ status: 521 })).toBeNull();
    expect(resolveFailoverReasonFromError({ status: 522 })).toBeNull();
    expect(resolveFailoverReasonFromError({ status: 523 })).toBeNull();
    expect(resolveFailoverReasonFromError({ status: 524 })).toBeNull();
-    expect(resolveFailoverReasonFromError({ status: 529 })).toBe("rate_limit");
+    expect(resolveFailoverReasonFromError({ status: 529 })).toBe("overloaded");
  });

  it("classifies documented provider error shapes at the error boundary", () => {
@@ -90,7 +90,7 @@ describe("failover-error", () => {
        status: 529,
        message: ANTHROPIC_OVERLOADED_PAYLOAD,
      }),
-    ).toBe("rate_limit");
+    ).toBe("overloaded");
    expect(
      resolveFailoverReasonFromError({
        status: 429,
@@ -114,7 +114,7 @@ describe("failover-error", () => {
        status: 503,
        message: BEDROCK_SERVICE_UNAVAILABLE_MESSAGE,
      }),
-    ).toBe("timeout");
+    ).toBe("overloaded");
    expect(
      resolveFailoverReasonFromError({
        status: 429,
@@ -126,7 +126,7 @@ describe("failover-error", () => {
        status: 503,
        message: GROQ_SERVICE_UNAVAILABLE_MESSAGE,
      }),
-    ).toBe("timeout");
+    ).toBe("overloaded");
  });

  it("treats 400 insufficient_quota payloads as billing instead of format", () => {
@@ -151,6 +151,14 @@ describe("failover-error", () => {
    ).toBe("rate_limit");
  });

+  it("treats overloaded provider payloads as overloaded", () => {
+    expect(
+      resolveFailoverReasonFromError({
+        message: ANTHROPIC_OVERLOADED_PAYLOAD,
+      }),
+    ).toBe("overloaded");
+  });
+
  it("keeps raw-text 402 weekly/monthly limit errors in billing", () => {
    expect(
      resolveFailoverReasonFromError({
@@ -221,6 +229,10 @@ describe("failover-error", () => {
    expect(err?.model).toBe("claude-opus-4-5");
  });

+  it("maps overloaded to a 503 fallback status", () => {
+    expect(resolveFailoverStatus("overloaded")).toBe(503);
+  });
+
  it("coerces format errors with a 400 status", () => {
    const err = coerceToFailoverError("invalid request format", {
      provider: "google",
--- a/src/agents/failover-error.ts
+++ b/src/agents/failover-error.ts
@@ -49,6 +49,8 @@ export function resolveFailoverStatus(reason: FailoverReason): number | undefine
      return 402;
    case "rate_limit":
      return 429;
+    case "overloaded":
+      return 503;
    case "auth":
      return 401;
    case "auth_permanent":
--- a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts
+++ b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts
@@ -509,12 +509,12 @@ describe("classifyFailoverReason", () => {
  it("classifies documented provider error messages", () => {
    expect(classifyFailoverReason(OPENAI_RATE_LIMIT_MESSAGE)).toBe("rate_limit");
    expect(classifyFailoverReason(GEMINI_RESOURCE_EXHAUSTED_MESSAGE)).toBe("rate_limit");
-    expect(classifyFailoverReason(ANTHROPIC_OVERLOADED_PAYLOAD)).toBe("rate_limit");
+    expect(classifyFailoverReason(ANTHROPIC_OVERLOADED_PAYLOAD)).toBe("overloaded");
    expect(classifyFailoverReason(OPENROUTER_CREDITS_MESSAGE)).toBe("billing");
    expect(classifyFailoverReason(TOGETHER_PAYMENT_REQUIRED_MESSAGE)).toBe("billing");
-    expect(classifyFailoverReason(TOGETHER_ENGINE_OVERLOADED_MESSAGE)).toBe("timeout");
+    expect(classifyFailoverReason(TOGETHER_ENGINE_OVERLOADED_MESSAGE)).toBe("overloaded");
    expect(classifyFailoverReason(GROQ_TOO_MANY_REQUESTS_MESSAGE)).toBe("rate_limit");
-    expect(classifyFailoverReason(GROQ_SERVICE_UNAVAILABLE_MESSAGE)).toBe("timeout");
+    expect(classifyFailoverReason(GROQ_SERVICE_UNAVAILABLE_MESSAGE)).toBe("overloaded");
  });

  it("classifies internal and compatibility error messages", () => {
@@ -572,20 +572,20 @@ describe("classifyFailoverReason", () => {
      "rate_limit",
    );
  });
-  it("classifies provider high-demand / service-unavailable messages as rate_limit", () => {
+  it("classifies provider high-demand / service-unavailable messages as overloaded", () => {
    expect(
      classifyFailoverReason(
        "This model is currently experiencing high demand. Please try again later.",
      ),
-    ).toBe("rate_limit");
-    // "service unavailable" combined with overload/capacity indicator → rate_limit
+    ).toBe("overloaded");
+    // "service unavailable" combined with overload/capacity indicator → overloaded
    // (exercises the new regex — none of the standalone patterns match here)
-    expect(classifyFailoverReason("service unavailable due to capacity limits")).toBe("rate_limit");
+    expect(classifyFailoverReason("service unavailable due to capacity limits")).toBe("overloaded");
    expect(
      classifyFailoverReason(
        '{"error":{"code":503,"message":"The model is overloaded. Please try later","status":"UNAVAILABLE"}}',
      ),
-    ).toBe("rate_limit");
+    ).toBe("overloaded");
  });
  it("classifies bare 'service unavailable' as timeout instead of rate_limit (#32828)", () => {
    // A generic "service unavailable" from a proxy/CDN should stay retryable,
--- a/src/agents/pi-embedded-helpers/errors.ts
+++ b/src/agents/pi-embedded-helpers/errors.ts
@@ -293,13 +293,11 @@ export function classifyFailoverReasonFromHttpStatus(
  if (status === 408) {
    return "timeout";
  }
-  // Keep the status-only path conservative and behavior-preserving.
-  // Message-path HTTP heuristics are broader and should not leak in here.
-  if (status === 502 || status === 503 || status === 504) {
+  if (status === 502 || status === 504) {
    return "timeout";
  }
-  if (status === 529) {
-    return "rate_limit";
+  if (status === 503 || status === 529) {
+    return "overloaded";
  }
  if (status === 400) {
    // Some providers return quota/balance errors under HTTP 400, so do not
@@ -854,13 +852,6 @@ export function classifyFailoverReason(raw: string): FailoverReason | null {
  if (isModelNotFoundErrorMessage(raw)) {
    return "model_not_found";
  }
-  if (isTransientHttpError(raw)) {
-    // Treat transient 5xx provider failures as retryable transport issues.
-    return "timeout";
-  }
-  if (isJsonApiInternalServerError(raw)) {
-    return "timeout";
-  }
  if (isPeriodicUsageLimitErrorMessage(raw)) {
    return isBillingErrorMessage(raw) ? "billing" : "rate_limit";
  }
@@ -868,7 +859,14 @@ export function classifyFailoverReason(raw: string): FailoverReason | null {
    return "rate_limit";
  }
  if (isOverloadedErrorMessage(raw)) {
-    return "rate_limit";
+    return "overloaded";
+  }
+  if (isTransientHttpError(raw)) {
+    // Treat transient 5xx provider failures as retryable transport issues.
+    return "timeout";
+  }
+  if (isJsonApiInternalServerError(raw)) {
+    return "timeout";
  }
  if (isCloudCodeAssistFormatError(raw)) {
    return "format";
--- a/src/agents/pi-embedded-helpers/types.ts
+++ b/src/agents/pi-embedded-helpers/types.ts
@@ -5,6 +5,7 @@ export type FailoverReason =
  | "auth_permanent"
  | "format"
  | "rate_limit"
+  | "overloaded"
  | "billing"
  | "timeout"
  | "model_not_found"
--- a/src/agents/pi-embedded-runner/run.ts
+++ b/src/agents/pi-embedded-runner/run.ts
@@ -42,7 +42,6 @@ import {
  isLikelyContextOverflowError,
  isFailoverAssistantError,
  isFailoverErrorMessage,
-  isOverloadedErrorMessage,
  parseImageSizeError,
  parseImageDimensionError,
  isRateLimitAssistantError,
@@ -740,16 +739,9 @@ export async function runEmbeddedPiAgent(
        });
      };
      const resolveAuthProfileFailureReason = (
-        errorText: string,
        failoverReason: FailoverReason | null,
      ): AuthProfileFailureReason | null => {
-        if (!failoverReason || failoverReason === "timeout") {
-          return null;
-        }
-        // Overloaded provider responses currently stay on the rate_limit failover lane
-        // so existing retry/failover behavior keeps working, but they should not
-        // be recorded as auth-profile failures.
-        if (failoverReason === "rate_limit" && isOverloadedErrorMessage(errorText)) {
+        if (!failoverReason || failoverReason === "timeout" || failoverReason === "overloaded") {
          return null;
        }
        return failoverReason;
@@ -1162,10 +1154,8 @@ export async function runEmbeddedPiAgent(
              };
            }
            const promptFailoverReason = classifyFailoverReason(errorText);
-            const promptProfileFailureReason = resolveAuthProfileFailureReason(
-              errorText,
-              promptFailoverReason,
-            );
+            const promptProfileFailureReason =
+              resolveAuthProfileFailureReason(promptFailoverReason);
            await maybeMarkAuthProfileFailure({
              profileId: lastProfileId,
              reason: promptProfileFailureReason,
@@ -1219,10 +1209,8 @@ export async function runEmbeddedPiAgent(
          const billingFailure = isBillingAssistantError(lastAssistant);
          const failoverFailure = isFailoverAssistantError(lastAssistant);
          const assistantFailoverReason = classifyFailoverReason(lastAssistant?.errorMessage ?? "");
-          const assistantProfileFailureReason = resolveAuthProfileFailureReason(
-            lastAssistant?.errorMessage ?? "",
-            assistantFailoverReason,
-          );
+          const assistantProfileFailureReason =
+            resolveAuthProfileFailureReason(assistantFailoverReason);
          const cloudCodeAssistFormatError = attempt.cloudCodeAssistFormatError;
          const imageDimensionError = parseImageDimensionError(lastAssistant?.errorMessage ?? "");