fix(agents): classify overloaded failures separately

This commit is contained in:
Altay 2026-03-06 15:27:57 +03:00
parent 89240e1226
commit fc07dee37e
No known key found for this signature in database
7 changed files with 45 additions and 44 deletions

View File

@ -118,7 +118,7 @@ Docs: https://docs.openclaw.ai
- Security/auth labels: remove token and API-key snippets from user-facing auth status labels so `/status` and `/models` do not expose credential fragments. (#33262) thanks @cu1ch3n. - Security/auth labels: remove token and API-key snippets from user-facing auth status labels so `/status` and `/models` do not expose credential fragments. (#33262) thanks @cu1ch3n.
- Auth/credential semantics: align profile eligibility + probe diagnostics with SecretRef/expiry rules and harden browser download atomic writes. (#33733) thanks @joshavant. - Auth/credential semantics: align profile eligibility + probe diagnostics with SecretRef/expiry rules and harden browser download atomic writes. (#33733) thanks @joshavant.
- Security/audit denyCommands guidance: suggest likely exact node command IDs for unknown `gateway.nodes.denyCommands` entries so ineffective denylist entries are easier to correct. (#29713) thanks @liquidhorizon88-bot. - Security/audit denyCommands guidance: suggest likely exact node command IDs for unknown `gateway.nodes.denyCommands` entries so ineffective denylist entries are easier to correct. (#29713) thanks @liquidhorizon88-bot.
- Agents/overload auth-profile state: keep overloaded prompt/assistant failures failover-eligible without recording auth-profile failure state, so transient provider overloads do not poison later profile selection on the same provider. - Agents/overload failover handling: classify overloaded provider failures separately from rate limits/status timeouts and keep overloaded prompt/assistant failures failover-eligible without recording auth-profile failure state, so transient provider overloads do not poison later profile selection on the same provider.
- Docs/security hardening guidance: document Docker `DOCKER-USER` + UFW policy and add cross-linking from Docker install docs for VPS/public-host setups. (#27613) thanks @dorukardahan. - Docs/security hardening guidance: document Docker `DOCKER-USER` + UFW policy and add cross-linking from Docker install docs for VPS/public-host setups. (#27613) thanks @dorukardahan.
- Docs/security threat-model links: replace relative `.md` links with Mintlify-compatible root-relative routes in security docs to prevent broken internal navigation. (#27698) thanks @clawdoo. - Docs/security threat-model links: replace relative `.md` links with Mintlify-compatible root-relative routes in security docs to prevent broken internal navigation. (#27698) thanks @clawdoo.
- Plugins/Update integrity drift: avoid false integrity drift prompts when updating npm-installed plugins from unpinned specs, while keeping drift checks for exact pinned versions. (#37179) Thanks @vincentkoc. - Plugins/Update integrity drift: avoid false integrity drift prompts when updating npm-installed plugins from unpinned specs, while keeping drift checks for exact pinned versions. (#37179) Thanks @vincentkoc.

View File

@ -69,13 +69,13 @@ describe("failover-error", () => {
// Keep the status-only path behavior-preserving and conservative. // Keep the status-only path behavior-preserving and conservative.
expect(resolveFailoverReasonFromError({ status: 500 })).toBeNull(); expect(resolveFailoverReasonFromError({ status: 500 })).toBeNull();
expect(resolveFailoverReasonFromError({ status: 502 })).toBe("timeout"); expect(resolveFailoverReasonFromError({ status: 502 })).toBe("timeout");
expect(resolveFailoverReasonFromError({ status: 503 })).toBe("timeout"); expect(resolveFailoverReasonFromError({ status: 503 })).toBe("overloaded");
expect(resolveFailoverReasonFromError({ status: 504 })).toBe("timeout"); expect(resolveFailoverReasonFromError({ status: 504 })).toBe("timeout");
expect(resolveFailoverReasonFromError({ status: 521 })).toBeNull(); expect(resolveFailoverReasonFromError({ status: 521 })).toBeNull();
expect(resolveFailoverReasonFromError({ status: 522 })).toBeNull(); expect(resolveFailoverReasonFromError({ status: 522 })).toBeNull();
expect(resolveFailoverReasonFromError({ status: 523 })).toBeNull(); expect(resolveFailoverReasonFromError({ status: 523 })).toBeNull();
expect(resolveFailoverReasonFromError({ status: 524 })).toBeNull(); expect(resolveFailoverReasonFromError({ status: 524 })).toBeNull();
expect(resolveFailoverReasonFromError({ status: 529 })).toBe("rate_limit"); expect(resolveFailoverReasonFromError({ status: 529 })).toBe("overloaded");
}); });
it("classifies documented provider error shapes at the error boundary", () => { it("classifies documented provider error shapes at the error boundary", () => {
@ -90,7 +90,7 @@ describe("failover-error", () => {
status: 529, status: 529,
message: ANTHROPIC_OVERLOADED_PAYLOAD, message: ANTHROPIC_OVERLOADED_PAYLOAD,
}), }),
).toBe("rate_limit"); ).toBe("overloaded");
expect( expect(
resolveFailoverReasonFromError({ resolveFailoverReasonFromError({
status: 429, status: 429,
@ -114,7 +114,7 @@ describe("failover-error", () => {
status: 503, status: 503,
message: BEDROCK_SERVICE_UNAVAILABLE_MESSAGE, message: BEDROCK_SERVICE_UNAVAILABLE_MESSAGE,
}), }),
).toBe("timeout"); ).toBe("overloaded");
expect( expect(
resolveFailoverReasonFromError({ resolveFailoverReasonFromError({
status: 429, status: 429,
@ -126,7 +126,7 @@ describe("failover-error", () => {
status: 503, status: 503,
message: GROQ_SERVICE_UNAVAILABLE_MESSAGE, message: GROQ_SERVICE_UNAVAILABLE_MESSAGE,
}), }),
).toBe("timeout"); ).toBe("overloaded");
}); });
it("treats 400 insufficient_quota payloads as billing instead of format", () => { it("treats 400 insufficient_quota payloads as billing instead of format", () => {
@ -151,6 +151,14 @@ describe("failover-error", () => {
).toBe("rate_limit"); ).toBe("rate_limit");
}); });
// Pins the expected reason when the error object carries only a `message`
// (no `status` field is supplied here).
it("treats overloaded provider payloads as overloaded", () => {
expect(
resolveFailoverReasonFromError({
message: ANTHROPIC_OVERLOADED_PAYLOAD,
}),
).toBe("overloaded");
});
it("keeps raw-text 402 weekly/monthly limit errors in billing", () => { it("keeps raw-text 402 weekly/monthly limit errors in billing", () => {
expect( expect(
resolveFailoverReasonFromError({ resolveFailoverReasonFromError({
@ -221,6 +229,10 @@ describe("failover-error", () => {
expect(err?.model).toBe("claude-opus-4-5"); expect(err?.model).toBe("claude-opus-4-5");
}); });
// Pins the numeric status that resolveFailoverStatus returns for the
// "overloaded" reason.
it("maps overloaded to a 503 fallback status", () => {
expect(resolveFailoverStatus("overloaded")).toBe(503);
});
it("coerces format errors with a 400 status", () => { it("coerces format errors with a 400 status", () => {
const err = coerceToFailoverError("invalid request format", { const err = coerceToFailoverError("invalid request format", {
provider: "google", provider: "google",

View File

@ -49,6 +49,8 @@ export function resolveFailoverStatus(reason: FailoverReason): number | undefine
return 402; return 402;
case "rate_limit": case "rate_limit":
return 429; return 429;
case "overloaded":
return 503;
case "auth": case "auth":
return 401; return 401;
case "auth_permanent": case "auth_permanent":

View File

@ -509,12 +509,12 @@ describe("classifyFailoverReason", () => {
it("classifies documented provider error messages", () => { it("classifies documented provider error messages", () => {
expect(classifyFailoverReason(OPENAI_RATE_LIMIT_MESSAGE)).toBe("rate_limit"); expect(classifyFailoverReason(OPENAI_RATE_LIMIT_MESSAGE)).toBe("rate_limit");
expect(classifyFailoverReason(GEMINI_RESOURCE_EXHAUSTED_MESSAGE)).toBe("rate_limit"); expect(classifyFailoverReason(GEMINI_RESOURCE_EXHAUSTED_MESSAGE)).toBe("rate_limit");
expect(classifyFailoverReason(ANTHROPIC_OVERLOADED_PAYLOAD)).toBe("rate_limit"); expect(classifyFailoverReason(ANTHROPIC_OVERLOADED_PAYLOAD)).toBe("overloaded");
expect(classifyFailoverReason(OPENROUTER_CREDITS_MESSAGE)).toBe("billing"); expect(classifyFailoverReason(OPENROUTER_CREDITS_MESSAGE)).toBe("billing");
expect(classifyFailoverReason(TOGETHER_PAYMENT_REQUIRED_MESSAGE)).toBe("billing"); expect(classifyFailoverReason(TOGETHER_PAYMENT_REQUIRED_MESSAGE)).toBe("billing");
expect(classifyFailoverReason(TOGETHER_ENGINE_OVERLOADED_MESSAGE)).toBe("timeout"); expect(classifyFailoverReason(TOGETHER_ENGINE_OVERLOADED_MESSAGE)).toBe("overloaded");
expect(classifyFailoverReason(GROQ_TOO_MANY_REQUESTS_MESSAGE)).toBe("rate_limit"); expect(classifyFailoverReason(GROQ_TOO_MANY_REQUESTS_MESSAGE)).toBe("rate_limit");
expect(classifyFailoverReason(GROQ_SERVICE_UNAVAILABLE_MESSAGE)).toBe("timeout"); expect(classifyFailoverReason(GROQ_SERVICE_UNAVAILABLE_MESSAGE)).toBe("overloaded");
}); });
it("classifies internal and compatibility error messages", () => { it("classifies internal and compatibility error messages", () => {
@ -572,20 +572,20 @@ describe("classifyFailoverReason", () => {
"rate_limit", "rate_limit",
); );
}); });
it("classifies provider high-demand / service-unavailable messages as rate_limit", () => { it("classifies provider high-demand / service-unavailable messages as overloaded", () => {
expect( expect(
classifyFailoverReason( classifyFailoverReason(
"This model is currently experiencing high demand. Please try again later.", "This model is currently experiencing high demand. Please try again later.",
), ),
).toBe("rate_limit"); ).toBe("overloaded");
// "service unavailable" combined with overload/capacity indicator → rate_limit // "service unavailable" combined with overload/capacity indicator → overloaded
// (exercises the new regex — none of the standalone patterns match here) // (exercises the new regex — none of the standalone patterns match here)
expect(classifyFailoverReason("service unavailable due to capacity limits")).toBe("rate_limit"); expect(classifyFailoverReason("service unavailable due to capacity limits")).toBe("overloaded");
expect( expect(
classifyFailoverReason( classifyFailoverReason(
'{"error":{"code":503,"message":"The model is overloaded. Please try later","status":"UNAVAILABLE"}}', '{"error":{"code":503,"message":"The model is overloaded. Please try later","status":"UNAVAILABLE"}}',
), ),
).toBe("rate_limit"); ).toBe("overloaded");
}); });
it("classifies bare 'service unavailable' as timeout instead of rate_limit (#32828)", () => { it("classifies bare 'service unavailable' as timeout instead of rate_limit (#32828)", () => {
// A generic "service unavailable" from a proxy/CDN should stay retryable, // A generic "service unavailable" from a proxy/CDN should stay retryable,

View File

@ -293,13 +293,11 @@ export function classifyFailoverReasonFromHttpStatus(
if (status === 408) { if (status === 408) {
return "timeout"; return "timeout";
} }
// Keep the status-only path conservative and behavior-preserving. if (status === 502 || status === 504) {
// Message-path HTTP heuristics are broader and should not leak in here.
if (status === 502 || status === 503 || status === 504) {
return "timeout"; return "timeout";
} }
if (status === 529) { if (status === 503 || status === 529) {
return "rate_limit"; return "overloaded";
} }
if (status === 400) { if (status === 400) {
// Some providers return quota/balance errors under HTTP 400, so do not // Some providers return quota/balance errors under HTTP 400, so do not
@ -854,13 +852,6 @@ export function classifyFailoverReason(raw: string): FailoverReason | null {
if (isModelNotFoundErrorMessage(raw)) { if (isModelNotFoundErrorMessage(raw)) {
return "model_not_found"; return "model_not_found";
} }
if (isTransientHttpError(raw)) {
// Treat transient 5xx provider failures as retryable transport issues.
return "timeout";
}
if (isJsonApiInternalServerError(raw)) {
return "timeout";
}
if (isPeriodicUsageLimitErrorMessage(raw)) { if (isPeriodicUsageLimitErrorMessage(raw)) {
return isBillingErrorMessage(raw) ? "billing" : "rate_limit"; return isBillingErrorMessage(raw) ? "billing" : "rate_limit";
} }
@ -868,7 +859,14 @@ export function classifyFailoverReason(raw: string): FailoverReason | null {
return "rate_limit"; return "rate_limit";
} }
if (isOverloadedErrorMessage(raw)) { if (isOverloadedErrorMessage(raw)) {
return "rate_limit"; return "overloaded";
}
if (isTransientHttpError(raw)) {
// Treat transient 5xx provider failures as retryable transport issues.
return "timeout";
}
if (isJsonApiInternalServerError(raw)) {
return "timeout";
} }
if (isCloudCodeAssistFormatError(raw)) { if (isCloudCodeAssistFormatError(raw)) {
return "format"; return "format";

View File

@ -5,6 +5,7 @@ export type FailoverReason =
| "auth_permanent" | "auth_permanent"
| "format" | "format"
| "rate_limit" | "rate_limit"
| "overloaded"
| "billing" | "billing"
| "timeout" | "timeout"
| "model_not_found" | "model_not_found"

View File

@ -42,7 +42,6 @@ import {
isLikelyContextOverflowError, isLikelyContextOverflowError,
isFailoverAssistantError, isFailoverAssistantError,
isFailoverErrorMessage, isFailoverErrorMessage,
isOverloadedErrorMessage,
parseImageSizeError, parseImageSizeError,
parseImageDimensionError, parseImageDimensionError,
isRateLimitAssistantError, isRateLimitAssistantError,
@ -740,16 +739,9 @@ export async function runEmbeddedPiAgent(
}); });
}; };
const resolveAuthProfileFailureReason = ( const resolveAuthProfileFailureReason = (
errorText: string,
failoverReason: FailoverReason | null, failoverReason: FailoverReason | null,
): AuthProfileFailureReason | null => { ): AuthProfileFailureReason | null => {
if (!failoverReason || failoverReason === "timeout") { if (!failoverReason || failoverReason === "timeout" || failoverReason === "overloaded") {
return null;
}
// Overloaded provider responses currently stay on the rate_limit failover lane
// so existing retry/failover behavior keeps working, but they should not
// be recorded as auth-profile failures.
if (failoverReason === "rate_limit" && isOverloadedErrorMessage(errorText)) {
return null; return null;
} }
return failoverReason; return failoverReason;
@ -1162,10 +1154,8 @@ export async function runEmbeddedPiAgent(
}; };
} }
const promptFailoverReason = classifyFailoverReason(errorText); const promptFailoverReason = classifyFailoverReason(errorText);
const promptProfileFailureReason = resolveAuthProfileFailureReason( const promptProfileFailureReason =
errorText, resolveAuthProfileFailureReason(promptFailoverReason);
promptFailoverReason,
);
await maybeMarkAuthProfileFailure({ await maybeMarkAuthProfileFailure({
profileId: lastProfileId, profileId: lastProfileId,
reason: promptProfileFailureReason, reason: promptProfileFailureReason,
@ -1219,10 +1209,8 @@ export async function runEmbeddedPiAgent(
const billingFailure = isBillingAssistantError(lastAssistant); const billingFailure = isBillingAssistantError(lastAssistant);
const failoverFailure = isFailoverAssistantError(lastAssistant); const failoverFailure = isFailoverAssistantError(lastAssistant);
const assistantFailoverReason = classifyFailoverReason(lastAssistant?.errorMessage ?? ""); const assistantFailoverReason = classifyFailoverReason(lastAssistant?.errorMessage ?? "");
const assistantProfileFailureReason = resolveAuthProfileFailureReason( const assistantProfileFailureReason =
lastAssistant?.errorMessage ?? "", resolveAuthProfileFailureReason(assistantFailoverReason);
assistantFailoverReason,
);
const cloudCodeAssistFormatError = attempt.cloudCodeAssistFormatError; const cloudCodeAssistFormatError = attempt.cloudCodeAssistFormatError;
const imageDimensionError = parseImageDimensionError(lastAssistant?.errorMessage ?? ""); const imageDimensionError = parseImageDimensionError(lastAssistant?.errorMessage ?? "");