Compare commits
5 Commits
main
...
fix/transi
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
890dbf523f | ||
|
|
c7148f1a66 | ||
|
|
fc07dee37e | ||
|
|
89240e1226 | ||
|
|
d389977be4 |
@ -118,6 +118,7 @@ Docs: https://docs.openclaw.ai
|
|||||||
- Security/auth labels: remove token and API-key snippets from user-facing auth status labels so `/status` and `/models` do not expose credential fragments. (#33262) thanks @cu1ch3n.
|
- Security/auth labels: remove token and API-key snippets from user-facing auth status labels so `/status` and `/models` do not expose credential fragments. (#33262) thanks @cu1ch3n.
|
||||||
- Auth/credential semantics: align profile eligibility + probe diagnostics with SecretRef/expiry rules and harden browser download atomic writes. (#33733) thanks @joshavant.
|
- Auth/credential semantics: align profile eligibility + probe diagnostics with SecretRef/expiry rules and harden browser download atomic writes. (#33733) thanks @joshavant.
|
||||||
- Security/audit denyCommands guidance: suggest likely exact node command IDs for unknown `gateway.nodes.denyCommands` entries so ineffective denylist entries are easier to correct. (#29713) thanks @liquidhorizon88-bot.
|
- Security/audit denyCommands guidance: suggest likely exact node command IDs for unknown `gateway.nodes.denyCommands` entries so ineffective denylist entries are easier to correct. (#29713) thanks @liquidhorizon88-bot.
|
||||||
|
- Agents/overload failover handling: classify overloaded provider failures separately from rate limits/status timeouts, add short overload backoff before retry/failover, and keep overloaded prompt/assistant failures out of auth-profile failure state so transient provider overloads do not poison later profile selection on the same provider.
|
||||||
- Docs/security hardening guidance: document Docker `DOCKER-USER` + UFW policy and add cross-linking from Docker install docs for VPS/public-host setups. (#27613) thanks @dorukardahan.
|
- Docs/security hardening guidance: document Docker `DOCKER-USER` + UFW policy and add cross-linking from Docker install docs for VPS/public-host setups. (#27613) thanks @dorukardahan.
|
||||||
- Docs/security threat-model links: replace relative `.md` links with Mintlify-compatible root-relative routes in security docs to prevent broken internal navigation. (#27698) thanks @clawdoo.
|
- Docs/security threat-model links: replace relative `.md` links with Mintlify-compatible root-relative routes in security docs to prevent broken internal navigation. (#27698) thanks @clawdoo.
|
||||||
- Plugins/Update integrity drift: avoid false integrity drift prompts when updating npm-installed plugins from unpinned specs, while keeping drift checks for exact pinned versions. (#37179) Thanks @vincentkoc.
|
- Plugins/Update integrity drift: avoid false integrity drift prompts when updating npm-installed plugins from unpinned specs, while keeping drift checks for exact pinned versions. (#37179) Thanks @vincentkoc.
|
||||||
|
|||||||
@ -69,13 +69,13 @@ describe("failover-error", () => {
|
|||||||
// Keep the status-only path behavior-preserving and conservative.
|
// Keep the status-only path behavior-preserving and conservative.
|
||||||
expect(resolveFailoverReasonFromError({ status: 500 })).toBeNull();
|
expect(resolveFailoverReasonFromError({ status: 500 })).toBeNull();
|
||||||
expect(resolveFailoverReasonFromError({ status: 502 })).toBe("timeout");
|
expect(resolveFailoverReasonFromError({ status: 502 })).toBe("timeout");
|
||||||
expect(resolveFailoverReasonFromError({ status: 503 })).toBe("timeout");
|
expect(resolveFailoverReasonFromError({ status: 503 })).toBe("overloaded");
|
||||||
expect(resolveFailoverReasonFromError({ status: 504 })).toBe("timeout");
|
expect(resolveFailoverReasonFromError({ status: 504 })).toBe("timeout");
|
||||||
expect(resolveFailoverReasonFromError({ status: 521 })).toBeNull();
|
expect(resolveFailoverReasonFromError({ status: 521 })).toBeNull();
|
||||||
expect(resolveFailoverReasonFromError({ status: 522 })).toBeNull();
|
expect(resolveFailoverReasonFromError({ status: 522 })).toBeNull();
|
||||||
expect(resolveFailoverReasonFromError({ status: 523 })).toBeNull();
|
expect(resolveFailoverReasonFromError({ status: 523 })).toBeNull();
|
||||||
expect(resolveFailoverReasonFromError({ status: 524 })).toBeNull();
|
expect(resolveFailoverReasonFromError({ status: 524 })).toBeNull();
|
||||||
expect(resolveFailoverReasonFromError({ status: 529 })).toBe("rate_limit");
|
expect(resolveFailoverReasonFromError({ status: 529 })).toBe("overloaded");
|
||||||
});
|
});
|
||||||
|
|
||||||
it("classifies documented provider error shapes at the error boundary", () => {
|
it("classifies documented provider error shapes at the error boundary", () => {
|
||||||
@ -90,7 +90,7 @@ describe("failover-error", () => {
|
|||||||
status: 529,
|
status: 529,
|
||||||
message: ANTHROPIC_OVERLOADED_PAYLOAD,
|
message: ANTHROPIC_OVERLOADED_PAYLOAD,
|
||||||
}),
|
}),
|
||||||
).toBe("rate_limit");
|
).toBe("overloaded");
|
||||||
expect(
|
expect(
|
||||||
resolveFailoverReasonFromError({
|
resolveFailoverReasonFromError({
|
||||||
status: 429,
|
status: 429,
|
||||||
@ -114,7 +114,7 @@ describe("failover-error", () => {
|
|||||||
status: 503,
|
status: 503,
|
||||||
message: BEDROCK_SERVICE_UNAVAILABLE_MESSAGE,
|
message: BEDROCK_SERVICE_UNAVAILABLE_MESSAGE,
|
||||||
}),
|
}),
|
||||||
).toBe("timeout");
|
).toBe("overloaded");
|
||||||
expect(
|
expect(
|
||||||
resolveFailoverReasonFromError({
|
resolveFailoverReasonFromError({
|
||||||
status: 429,
|
status: 429,
|
||||||
@ -126,7 +126,7 @@ describe("failover-error", () => {
|
|||||||
status: 503,
|
status: 503,
|
||||||
message: GROQ_SERVICE_UNAVAILABLE_MESSAGE,
|
message: GROQ_SERVICE_UNAVAILABLE_MESSAGE,
|
||||||
}),
|
}),
|
||||||
).toBe("timeout");
|
).toBe("overloaded");
|
||||||
});
|
});
|
||||||
|
|
||||||
it("treats 400 insufficient_quota payloads as billing instead of format", () => {
|
it("treats 400 insufficient_quota payloads as billing instead of format", () => {
|
||||||
@ -151,6 +151,14 @@ describe("failover-error", () => {
|
|||||||
).toBe("rate_limit");
|
).toBe("rate_limit");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("treats overloaded provider payloads as overloaded", () => {
|
||||||
|
expect(
|
||||||
|
resolveFailoverReasonFromError({
|
||||||
|
message: ANTHROPIC_OVERLOADED_PAYLOAD,
|
||||||
|
}),
|
||||||
|
).toBe("overloaded");
|
||||||
|
});
|
||||||
|
|
||||||
it("keeps raw-text 402 weekly/monthly limit errors in billing", () => {
|
it("keeps raw-text 402 weekly/monthly limit errors in billing", () => {
|
||||||
expect(
|
expect(
|
||||||
resolveFailoverReasonFromError({
|
resolveFailoverReasonFromError({
|
||||||
@ -221,6 +229,10 @@ describe("failover-error", () => {
|
|||||||
expect(err?.model).toBe("claude-opus-4-5");
|
expect(err?.model).toBe("claude-opus-4-5");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("maps overloaded to a 503 fallback status", () => {
|
||||||
|
expect(resolveFailoverStatus("overloaded")).toBe(503);
|
||||||
|
});
|
||||||
|
|
||||||
it("coerces format errors with a 400 status", () => {
|
it("coerces format errors with a 400 status", () => {
|
||||||
const err = coerceToFailoverError("invalid request format", {
|
const err = coerceToFailoverError("invalid request format", {
|
||||||
provider: "google",
|
provider: "google",
|
||||||
|
|||||||
@ -49,6 +49,8 @@ export function resolveFailoverStatus(reason: FailoverReason): number | undefine
|
|||||||
return 402;
|
return 402;
|
||||||
case "rate_limit":
|
case "rate_limit":
|
||||||
return 429;
|
return 429;
|
||||||
|
case "overloaded":
|
||||||
|
return 503;
|
||||||
case "auth":
|
case "auth":
|
||||||
return 401;
|
return 401;
|
||||||
case "auth_permanent":
|
case "auth_permanent":
|
||||||
|
|||||||
@ -509,12 +509,12 @@ describe("classifyFailoverReason", () => {
|
|||||||
it("classifies documented provider error messages", () => {
|
it("classifies documented provider error messages", () => {
|
||||||
expect(classifyFailoverReason(OPENAI_RATE_LIMIT_MESSAGE)).toBe("rate_limit");
|
expect(classifyFailoverReason(OPENAI_RATE_LIMIT_MESSAGE)).toBe("rate_limit");
|
||||||
expect(classifyFailoverReason(GEMINI_RESOURCE_EXHAUSTED_MESSAGE)).toBe("rate_limit");
|
expect(classifyFailoverReason(GEMINI_RESOURCE_EXHAUSTED_MESSAGE)).toBe("rate_limit");
|
||||||
expect(classifyFailoverReason(ANTHROPIC_OVERLOADED_PAYLOAD)).toBe("rate_limit");
|
expect(classifyFailoverReason(ANTHROPIC_OVERLOADED_PAYLOAD)).toBe("overloaded");
|
||||||
expect(classifyFailoverReason(OPENROUTER_CREDITS_MESSAGE)).toBe("billing");
|
expect(classifyFailoverReason(OPENROUTER_CREDITS_MESSAGE)).toBe("billing");
|
||||||
expect(classifyFailoverReason(TOGETHER_PAYMENT_REQUIRED_MESSAGE)).toBe("billing");
|
expect(classifyFailoverReason(TOGETHER_PAYMENT_REQUIRED_MESSAGE)).toBe("billing");
|
||||||
expect(classifyFailoverReason(TOGETHER_ENGINE_OVERLOADED_MESSAGE)).toBe("timeout");
|
expect(classifyFailoverReason(TOGETHER_ENGINE_OVERLOADED_MESSAGE)).toBe("overloaded");
|
||||||
expect(classifyFailoverReason(GROQ_TOO_MANY_REQUESTS_MESSAGE)).toBe("rate_limit");
|
expect(classifyFailoverReason(GROQ_TOO_MANY_REQUESTS_MESSAGE)).toBe("rate_limit");
|
||||||
expect(classifyFailoverReason(GROQ_SERVICE_UNAVAILABLE_MESSAGE)).toBe("timeout");
|
expect(classifyFailoverReason(GROQ_SERVICE_UNAVAILABLE_MESSAGE)).toBe("overloaded");
|
||||||
});
|
});
|
||||||
|
|
||||||
it("classifies internal and compatibility error messages", () => {
|
it("classifies internal and compatibility error messages", () => {
|
||||||
@ -572,20 +572,20 @@ describe("classifyFailoverReason", () => {
|
|||||||
"rate_limit",
|
"rate_limit",
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
it("classifies provider high-demand / service-unavailable messages as rate_limit", () => {
|
it("classifies provider high-demand / service-unavailable messages as overloaded", () => {
|
||||||
expect(
|
expect(
|
||||||
classifyFailoverReason(
|
classifyFailoverReason(
|
||||||
"This model is currently experiencing high demand. Please try again later.",
|
"This model is currently experiencing high demand. Please try again later.",
|
||||||
),
|
),
|
||||||
).toBe("rate_limit");
|
).toBe("overloaded");
|
||||||
// "service unavailable" combined with overload/capacity indicator → rate_limit
|
// "service unavailable" combined with overload/capacity indicator → overloaded
|
||||||
// (exercises the new regex — none of the standalone patterns match here)
|
// (exercises the new regex — none of the standalone patterns match here)
|
||||||
expect(classifyFailoverReason("service unavailable due to capacity limits")).toBe("rate_limit");
|
expect(classifyFailoverReason("service unavailable due to capacity limits")).toBe("overloaded");
|
||||||
expect(
|
expect(
|
||||||
classifyFailoverReason(
|
classifyFailoverReason(
|
||||||
'{"error":{"code":503,"message":"The model is overloaded. Please try later","status":"UNAVAILABLE"}}',
|
'{"error":{"code":503,"message":"The model is overloaded. Please try later","status":"UNAVAILABLE"}}',
|
||||||
),
|
),
|
||||||
).toBe("rate_limit");
|
).toBe("overloaded");
|
||||||
});
|
});
|
||||||
it("classifies bare 'service unavailable' as timeout instead of rate_limit (#32828)", () => {
|
it("classifies bare 'service unavailable' as timeout instead of rate_limit (#32828)", () => {
|
||||||
// A generic "service unavailable" from a proxy/CDN should stay retryable,
|
// A generic "service unavailable" from a proxy/CDN should stay retryable,
|
||||||
|
|||||||
@ -293,13 +293,11 @@ export function classifyFailoverReasonFromHttpStatus(
|
|||||||
if (status === 408) {
|
if (status === 408) {
|
||||||
return "timeout";
|
return "timeout";
|
||||||
}
|
}
|
||||||
// Keep the status-only path conservative and behavior-preserving.
|
if (status === 502 || status === 504) {
|
||||||
// Message-path HTTP heuristics are broader and should not leak in here.
|
|
||||||
if (status === 502 || status === 503 || status === 504) {
|
|
||||||
return "timeout";
|
return "timeout";
|
||||||
}
|
}
|
||||||
if (status === 529) {
|
if (status === 503 || status === 529) {
|
||||||
return "rate_limit";
|
return "overloaded";
|
||||||
}
|
}
|
||||||
if (status === 400) {
|
if (status === 400) {
|
||||||
// Some providers return quota/balance errors under HTTP 400, so do not
|
// Some providers return quota/balance errors under HTTP 400, so do not
|
||||||
@ -854,13 +852,6 @@ export function classifyFailoverReason(raw: string): FailoverReason | null {
|
|||||||
if (isModelNotFoundErrorMessage(raw)) {
|
if (isModelNotFoundErrorMessage(raw)) {
|
||||||
return "model_not_found";
|
return "model_not_found";
|
||||||
}
|
}
|
||||||
if (isTransientHttpError(raw)) {
|
|
||||||
// Treat transient 5xx provider failures as retryable transport issues.
|
|
||||||
return "timeout";
|
|
||||||
}
|
|
||||||
if (isJsonApiInternalServerError(raw)) {
|
|
||||||
return "timeout";
|
|
||||||
}
|
|
||||||
if (isPeriodicUsageLimitErrorMessage(raw)) {
|
if (isPeriodicUsageLimitErrorMessage(raw)) {
|
||||||
return isBillingErrorMessage(raw) ? "billing" : "rate_limit";
|
return isBillingErrorMessage(raw) ? "billing" : "rate_limit";
|
||||||
}
|
}
|
||||||
@ -868,7 +859,14 @@ export function classifyFailoverReason(raw: string): FailoverReason | null {
|
|||||||
return "rate_limit";
|
return "rate_limit";
|
||||||
}
|
}
|
||||||
if (isOverloadedErrorMessage(raw)) {
|
if (isOverloadedErrorMessage(raw)) {
|
||||||
return "rate_limit";
|
return "overloaded";
|
||||||
|
}
|
||||||
|
if (isTransientHttpError(raw)) {
|
||||||
|
// Treat transient 5xx provider failures as retryable transport issues.
|
||||||
|
return "timeout";
|
||||||
|
}
|
||||||
|
if (isJsonApiInternalServerError(raw)) {
|
||||||
|
return "timeout";
|
||||||
}
|
}
|
||||||
if (isCloudCodeAssistFormatError(raw)) {
|
if (isCloudCodeAssistFormatError(raw)) {
|
||||||
return "format";
|
return "format";
|
||||||
|
|||||||
@ -5,6 +5,7 @@ export type FailoverReason =
|
|||||||
| "auth_permanent"
|
| "auth_permanent"
|
||||||
| "format"
|
| "format"
|
||||||
| "rate_limit"
|
| "rate_limit"
|
||||||
|
| "overloaded"
|
||||||
| "billing"
|
| "billing"
|
||||||
| "timeout"
|
| "timeout"
|
||||||
| "model_not_found"
|
| "model_not_found"
|
||||||
|
|||||||
@ -9,11 +9,28 @@ import type { EmbeddedRunAttemptResult } from "./pi-embedded-runner/run/types.js
|
|||||||
|
|
||||||
const runEmbeddedAttemptMock = vi.fn<(params: unknown) => Promise<EmbeddedRunAttemptResult>>();
|
const runEmbeddedAttemptMock = vi.fn<(params: unknown) => Promise<EmbeddedRunAttemptResult>>();
|
||||||
const resolveCopilotApiTokenMock = vi.fn();
|
const resolveCopilotApiTokenMock = vi.fn();
|
||||||
|
const { computeBackoffMock, sleepWithAbortMock } = vi.hoisted(() => ({
|
||||||
|
computeBackoffMock: vi.fn(
|
||||||
|
(
|
||||||
|
_policy: { initialMs: number; maxMs: number; factor: number; jitter: number },
|
||||||
|
_attempt: number,
|
||||||
|
) => 321,
|
||||||
|
),
|
||||||
|
sleepWithAbortMock: vi.fn(async (_ms: number, _abortSignal?: AbortSignal) => undefined),
|
||||||
|
}));
|
||||||
|
|
||||||
vi.mock("./pi-embedded-runner/run/attempt.js", () => ({
|
vi.mock("./pi-embedded-runner/run/attempt.js", () => ({
|
||||||
runEmbeddedAttempt: (params: unknown) => runEmbeddedAttemptMock(params),
|
runEmbeddedAttempt: (params: unknown) => runEmbeddedAttemptMock(params),
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
vi.mock("../infra/backoff.js", () => ({
|
||||||
|
computeBackoff: (
|
||||||
|
policy: { initialMs: number; maxMs: number; factor: number; jitter: number },
|
||||||
|
attempt: number,
|
||||||
|
) => computeBackoffMock(policy, attempt),
|
||||||
|
sleepWithAbort: (ms: number, abortSignal?: AbortSignal) => sleepWithAbortMock(ms, abortSignal),
|
||||||
|
}));
|
||||||
|
|
||||||
vi.mock("../providers/github-copilot-token.js", () => ({
|
vi.mock("../providers/github-copilot-token.js", () => ({
|
||||||
DEFAULT_COPILOT_API_BASE_URL: "https://api.individual.githubcopilot.com",
|
DEFAULT_COPILOT_API_BASE_URL: "https://api.individual.githubcopilot.com",
|
||||||
resolveCopilotApiToken: (...args: unknown[]) => resolveCopilotApiTokenMock(...args),
|
resolveCopilotApiToken: (...args: unknown[]) => resolveCopilotApiTokenMock(...args),
|
||||||
@ -43,6 +60,8 @@ beforeEach(() => {
|
|||||||
vi.useRealTimers();
|
vi.useRealTimers();
|
||||||
runEmbeddedAttemptMock.mockClear();
|
runEmbeddedAttemptMock.mockClear();
|
||||||
resolveCopilotApiTokenMock.mockReset();
|
resolveCopilotApiTokenMock.mockReset();
|
||||||
|
computeBackoffMock.mockClear();
|
||||||
|
sleepWithAbortMock.mockClear();
|
||||||
});
|
});
|
||||||
|
|
||||||
const baseUsage = {
|
const baseUsage = {
|
||||||
@ -252,6 +271,24 @@ const mockFailedThenSuccessfulAttempt = (errorMessage = "rate limit") => {
|
|||||||
);
|
);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const mockPromptErrorThenSuccessfulAttempt = (errorMessage: string) => {
|
||||||
|
runEmbeddedAttemptMock
|
||||||
|
.mockResolvedValueOnce(
|
||||||
|
makeAttempt({
|
||||||
|
promptError: new Error(errorMessage),
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
.mockResolvedValueOnce(
|
||||||
|
makeAttempt({
|
||||||
|
assistantTexts: ["ok"],
|
||||||
|
lastAssistant: buildAssistant({
|
||||||
|
stopReason: "stop",
|
||||||
|
content: [{ type: "text", text: "ok" }],
|
||||||
|
}),
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
};
|
||||||
|
|
||||||
async function runAutoPinnedOpenAiTurn(params: {
|
async function runAutoPinnedOpenAiTurn(params: {
|
||||||
agentDir: string;
|
agentDir: string;
|
||||||
workspaceDir: string;
|
workspaceDir: string;
|
||||||
@ -320,6 +357,28 @@ async function runAutoPinnedRotationCase(params: {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async function runAutoPinnedPromptErrorRotationCase(params: {
|
||||||
|
errorMessage: string;
|
||||||
|
sessionKey: string;
|
||||||
|
runId: string;
|
||||||
|
}) {
|
||||||
|
runEmbeddedAttemptMock.mockClear();
|
||||||
|
return withAgentWorkspace(async ({ agentDir, workspaceDir }) => {
|
||||||
|
await writeAuthStore(agentDir);
|
||||||
|
mockPromptErrorThenSuccessfulAttempt(params.errorMessage);
|
||||||
|
await runAutoPinnedOpenAiTurn({
|
||||||
|
agentDir,
|
||||||
|
workspaceDir,
|
||||||
|
sessionKey: params.sessionKey,
|
||||||
|
runId: params.runId,
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(runEmbeddedAttemptMock).toHaveBeenCalledTimes(2);
|
||||||
|
const usageStats = await readUsageStats(agentDir);
|
||||||
|
return { usageStats };
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
function mockSingleSuccessfulAttempt() {
|
function mockSingleSuccessfulAttempt() {
|
||||||
runEmbeddedAttemptMock.mockResolvedValueOnce(
|
runEmbeddedAttemptMock.mockResolvedValueOnce(
|
||||||
makeAttempt({
|
makeAttempt({
|
||||||
@ -639,13 +698,48 @@ describe("runEmbeddedPiAgent auth profile rotation", () => {
|
|||||||
expect(typeof usageStats["openai:p2"]?.lastUsed).toBe("number");
|
expect(typeof usageStats["openai:p2"]?.lastUsed).toBe("number");
|
||||||
});
|
});
|
||||||
|
|
||||||
it("rotates for overloaded prompt failures across auto-pinned profiles", async () => {
|
it("rotates for overloaded assistant failures across auto-pinned profiles", async () => {
|
||||||
const { usageStats } = await runAutoPinnedRotationCase({
|
const { usageStats } = await runAutoPinnedRotationCase({
|
||||||
errorMessage: '{"type":"error","error":{"type":"overloaded_error","message":"Overloaded"}}',
|
errorMessage: '{"type":"error","error":{"type":"overloaded_error","message":"Overloaded"}}',
|
||||||
sessionKey: "agent:test:overloaded-rotation",
|
sessionKey: "agent:test:overloaded-rotation",
|
||||||
runId: "run:overloaded-rotation",
|
runId: "run:overloaded-rotation",
|
||||||
});
|
});
|
||||||
expect(typeof usageStats["openai:p2"]?.lastUsed).toBe("number");
|
expect(typeof usageStats["openai:p2"]?.lastUsed).toBe("number");
|
||||||
|
expect(usageStats["openai:p1"]?.cooldownUntil).toBeUndefined();
|
||||||
|
expect(computeBackoffMock).toHaveBeenCalledTimes(1);
|
||||||
|
expect(computeBackoffMock).toHaveBeenCalledWith(
|
||||||
|
expect.objectContaining({
|
||||||
|
initialMs: 250,
|
||||||
|
maxMs: 1500,
|
||||||
|
factor: 2,
|
||||||
|
jitter: 0.2,
|
||||||
|
}),
|
||||||
|
1,
|
||||||
|
);
|
||||||
|
expect(sleepWithAbortMock).toHaveBeenCalledTimes(1);
|
||||||
|
expect(sleepWithAbortMock).toHaveBeenCalledWith(321, undefined);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("rotates for overloaded prompt failures across auto-pinned profiles", async () => {
|
||||||
|
const { usageStats } = await runAutoPinnedPromptErrorRotationCase({
|
||||||
|
errorMessage: '{"type":"error","error":{"type":"overloaded_error","message":"Overloaded"}}',
|
||||||
|
sessionKey: "agent:test:overloaded-prompt-rotation",
|
||||||
|
runId: "run:overloaded-prompt-rotation",
|
||||||
|
});
|
||||||
|
expect(typeof usageStats["openai:p2"]?.lastUsed).toBe("number");
|
||||||
|
expect(usageStats["openai:p1"]?.cooldownUntil).toBeUndefined();
|
||||||
|
expect(computeBackoffMock).toHaveBeenCalledTimes(1);
|
||||||
|
expect(computeBackoffMock).toHaveBeenCalledWith(
|
||||||
|
expect.objectContaining({
|
||||||
|
initialMs: 250,
|
||||||
|
maxMs: 1500,
|
||||||
|
factor: 2,
|
||||||
|
jitter: 0.2,
|
||||||
|
}),
|
||||||
|
1,
|
||||||
|
);
|
||||||
|
expect(sleepWithAbortMock).toHaveBeenCalledTimes(1);
|
||||||
|
expect(sleepWithAbortMock).toHaveBeenCalledWith(321, undefined);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("rotates on timeout without cooling down the timed-out profile", async () => {
|
it("rotates on timeout without cooling down the timed-out profile", async () => {
|
||||||
@ -656,6 +750,8 @@ describe("runEmbeddedPiAgent auth profile rotation", () => {
|
|||||||
});
|
});
|
||||||
expect(typeof usageStats["openai:p2"]?.lastUsed).toBe("number");
|
expect(typeof usageStats["openai:p2"]?.lastUsed).toBe("number");
|
||||||
expect(usageStats["openai:p1"]?.cooldownUntil).toBeUndefined();
|
expect(usageStats["openai:p1"]?.cooldownUntil).toBeUndefined();
|
||||||
|
expect(computeBackoffMock).not.toHaveBeenCalled();
|
||||||
|
expect(sleepWithAbortMock).not.toHaveBeenCalled();
|
||||||
});
|
});
|
||||||
|
|
||||||
it("rotates on bare service unavailable without cooling down the profile", async () => {
|
it("rotates on bare service unavailable without cooling down the profile", async () => {
|
||||||
@ -668,6 +764,54 @@ describe("runEmbeddedPiAgent auth profile rotation", () => {
|
|||||||
expect(usageStats["openai:p1"]?.cooldownUntil).toBeUndefined();
|
expect(usageStats["openai:p1"]?.cooldownUntil).toBeUndefined();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("resets overload failover backoff after a successful turn", async () => {
|
||||||
|
await withAgentWorkspace(async ({ agentDir, workspaceDir }) => {
|
||||||
|
await writeAuthStore(agentDir);
|
||||||
|
|
||||||
|
mockFailedThenSuccessfulAttempt(
|
||||||
|
'{"type":"error","error":{"type":"overloaded_error","message":"Overloaded"}}',
|
||||||
|
);
|
||||||
|
await runAutoPinnedOpenAiTurn({
|
||||||
|
agentDir,
|
||||||
|
workspaceDir,
|
||||||
|
sessionKey: "agent:test:overloaded-backoff-reset-1",
|
||||||
|
runId: "run:overloaded-backoff-reset-1",
|
||||||
|
});
|
||||||
|
|
||||||
|
mockFailedThenSuccessfulAttempt(
|
||||||
|
'{"type":"error","error":{"type":"overloaded_error","message":"Overloaded"}}',
|
||||||
|
);
|
||||||
|
await runAutoPinnedOpenAiTurn({
|
||||||
|
agentDir,
|
||||||
|
workspaceDir,
|
||||||
|
sessionKey: "agent:test:overloaded-backoff-reset-2",
|
||||||
|
runId: "run:overloaded-backoff-reset-2",
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(computeBackoffMock).toHaveBeenCalledTimes(2);
|
||||||
|
expect(computeBackoffMock).toHaveBeenNthCalledWith(
|
||||||
|
1,
|
||||||
|
expect.objectContaining({
|
||||||
|
initialMs: 250,
|
||||||
|
maxMs: 1500,
|
||||||
|
factor: 2,
|
||||||
|
jitter: 0.2,
|
||||||
|
}),
|
||||||
|
1,
|
||||||
|
);
|
||||||
|
expect(computeBackoffMock).toHaveBeenNthCalledWith(
|
||||||
|
2,
|
||||||
|
expect.objectContaining({
|
||||||
|
initialMs: 250,
|
||||||
|
maxMs: 1500,
|
||||||
|
factor: 2,
|
||||||
|
jitter: 0.2,
|
||||||
|
}),
|
||||||
|
1,
|
||||||
|
);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
it("does not rotate for compaction timeouts", async () => {
|
it("does not rotate for compaction timeouts", async () => {
|
||||||
await withAgentWorkspace(async ({ agentDir, workspaceDir }) => {
|
await withAgentWorkspace(async ({ agentDir, workspaceDir }) => {
|
||||||
await writeAuthStore(agentDir);
|
await writeAuthStore(agentDir);
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
import { randomBytes } from "node:crypto";
|
import { randomBytes } from "node:crypto";
|
||||||
import fs from "node:fs/promises";
|
import fs from "node:fs/promises";
|
||||||
import type { ThinkLevel } from "../../auto-reply/thinking.js";
|
import type { ThinkLevel } from "../../auto-reply/thinking.js";
|
||||||
|
import { computeBackoff, sleepWithAbort, type BackoffPolicy } from "../../infra/backoff.js";
|
||||||
import { generateSecureToken } from "../../infra/secure-random.js";
|
import { generateSecureToken } from "../../infra/secure-random.js";
|
||||||
import { getGlobalHookRunner } from "../../plugins/hook-runner-global.js";
|
import { getGlobalHookRunner } from "../../plugins/hook-runner-global.js";
|
||||||
import type { PluginHookBeforeAgentStartResult } from "../../plugins/types.js";
|
import type { PluginHookBeforeAgentStartResult } from "../../plugins/types.js";
|
||||||
@ -10,6 +11,7 @@ import { resolveOpenClawAgentDir } from "../agent-paths.js";
|
|||||||
import { hasConfiguredModelFallbacks } from "../agent-scope.js";
|
import { hasConfiguredModelFallbacks } from "../agent-scope.js";
|
||||||
import {
|
import {
|
||||||
isProfileInCooldown,
|
isProfileInCooldown,
|
||||||
|
type AuthProfileFailureReason,
|
||||||
markAuthProfileFailure,
|
markAuthProfileFailure,
|
||||||
markAuthProfileGood,
|
markAuthProfileGood,
|
||||||
markAuthProfileUsed,
|
markAuthProfileUsed,
|
||||||
@ -76,6 +78,12 @@ type CopilotTokenState = {
|
|||||||
const COPILOT_REFRESH_MARGIN_MS = 5 * 60 * 1000;
|
const COPILOT_REFRESH_MARGIN_MS = 5 * 60 * 1000;
|
||||||
const COPILOT_REFRESH_RETRY_MS = 60 * 1000;
|
const COPILOT_REFRESH_RETRY_MS = 60 * 1000;
|
||||||
const COPILOT_REFRESH_MIN_DELAY_MS = 5 * 1000;
|
const COPILOT_REFRESH_MIN_DELAY_MS = 5 * 1000;
|
||||||
|
const OVERLOAD_FAILOVER_BACKOFF_POLICY: BackoffPolicy = {
|
||||||
|
initialMs: 250,
|
||||||
|
maxMs: 1_500,
|
||||||
|
factor: 2,
|
||||||
|
jitter: 0.2,
|
||||||
|
};
|
||||||
|
|
||||||
// Avoid Anthropic's refusal test token poisoning session transcripts.
|
// Avoid Anthropic's refusal test token poisoning session transcripts.
|
||||||
const ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL = "ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL";
|
const ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL = "ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL";
|
||||||
@ -719,9 +727,10 @@ export async function runEmbeddedPiAgent(
|
|||||||
let lastRunPromptUsage: ReturnType<typeof normalizeUsage> | undefined;
|
let lastRunPromptUsage: ReturnType<typeof normalizeUsage> | undefined;
|
||||||
let autoCompactionCount = 0;
|
let autoCompactionCount = 0;
|
||||||
let runLoopIterations = 0;
|
let runLoopIterations = 0;
|
||||||
|
let overloadFailoverAttempts = 0;
|
||||||
const maybeMarkAuthProfileFailure = async (failure: {
|
const maybeMarkAuthProfileFailure = async (failure: {
|
||||||
profileId?: string;
|
profileId?: string;
|
||||||
reason?: Parameters<typeof markAuthProfileFailure>[0]["reason"] | null;
|
reason?: AuthProfileFailureReason | null;
|
||||||
config?: RunEmbeddedPiAgentParams["config"];
|
config?: RunEmbeddedPiAgentParams["config"];
|
||||||
agentDir?: RunEmbeddedPiAgentParams["agentDir"];
|
agentDir?: RunEmbeddedPiAgentParams["agentDir"];
|
||||||
}) => {
|
}) => {
|
||||||
@ -737,6 +746,22 @@ export async function runEmbeddedPiAgent(
|
|||||||
agentDir,
|
agentDir,
|
||||||
});
|
});
|
||||||
};
|
};
|
||||||
|
const resolveAuthProfileFailureReason = (
|
||||||
|
failoverReason: FailoverReason | null,
|
||||||
|
): AuthProfileFailureReason | null => {
|
||||||
|
if (!failoverReason || failoverReason === "timeout" || failoverReason === "overloaded") {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return failoverReason;
|
||||||
|
};
|
||||||
|
const maybeBackoffBeforeOverloadFailover = async (reason: FailoverReason | null) => {
|
||||||
|
if (reason !== "overloaded") {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
overloadFailoverAttempts += 1;
|
||||||
|
const delayMs = computeBackoff(OVERLOAD_FAILOVER_BACKOFF_POLICY, overloadFailoverAttempts);
|
||||||
|
await sleepWithAbort(delayMs, params.abortSignal);
|
||||||
|
};
|
||||||
try {
|
try {
|
||||||
let authRetryPending = false;
|
let authRetryPending = false;
|
||||||
// Hoisted so the retry-limit error path can use the most recent API total.
|
// Hoisted so the retry-limit error path can use the most recent API total.
|
||||||
@ -1145,15 +1170,19 @@ export async function runEmbeddedPiAgent(
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
const promptFailoverReason = classifyFailoverReason(errorText);
|
const promptFailoverReason = classifyFailoverReason(errorText);
|
||||||
|
const promptProfileFailureReason =
|
||||||
|
resolveAuthProfileFailureReason(promptFailoverReason);
|
||||||
await maybeMarkAuthProfileFailure({
|
await maybeMarkAuthProfileFailure({
|
||||||
profileId: lastProfileId,
|
profileId: lastProfileId,
|
||||||
reason: promptFailoverReason,
|
reason: promptProfileFailureReason,
|
||||||
});
|
});
|
||||||
|
const promptFailoverFailure = isFailoverErrorMessage(errorText);
|
||||||
if (
|
if (
|
||||||
isFailoverErrorMessage(errorText) &&
|
promptFailoverFailure &&
|
||||||
promptFailoverReason !== "timeout" &&
|
promptFailoverReason !== "timeout" &&
|
||||||
(await advanceAuthProfile())
|
(await advanceAuthProfile())
|
||||||
) {
|
) {
|
||||||
|
await maybeBackoffBeforeOverloadFailover(promptFailoverReason);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
const fallbackThinking = pickFallbackThinkingLevel({
|
const fallbackThinking = pickFallbackThinkingLevel({
|
||||||
@ -1169,7 +1198,8 @@ export async function runEmbeddedPiAgent(
|
|||||||
}
|
}
|
||||||
// FIX: Throw FailoverError for prompt errors when fallbacks configured
|
// FIX: Throw FailoverError for prompt errors when fallbacks configured
|
||||||
// This enables model fallback for quota/rate limit errors during prompt submission
|
// This enables model fallback for quota/rate limit errors during prompt submission
|
||||||
if (fallbackConfigured && isFailoverErrorMessage(errorText)) {
|
if (fallbackConfigured && promptFailoverFailure) {
|
||||||
|
await maybeBackoffBeforeOverloadFailover(promptFailoverReason);
|
||||||
throw new FailoverError(errorText, {
|
throw new FailoverError(errorText, {
|
||||||
reason: promptFailoverReason ?? "unknown",
|
reason: promptFailoverReason ?? "unknown",
|
||||||
provider,
|
provider,
|
||||||
@ -1198,6 +1228,8 @@ export async function runEmbeddedPiAgent(
|
|||||||
const billingFailure = isBillingAssistantError(lastAssistant);
|
const billingFailure = isBillingAssistantError(lastAssistant);
|
||||||
const failoverFailure = isFailoverAssistantError(lastAssistant);
|
const failoverFailure = isFailoverAssistantError(lastAssistant);
|
||||||
const assistantFailoverReason = classifyFailoverReason(lastAssistant?.errorMessage ?? "");
|
const assistantFailoverReason = classifyFailoverReason(lastAssistant?.errorMessage ?? "");
|
||||||
|
const assistantProfileFailureReason =
|
||||||
|
resolveAuthProfileFailureReason(assistantFailoverReason);
|
||||||
const cloudCodeAssistFormatError = attempt.cloudCodeAssistFormatError;
|
const cloudCodeAssistFormatError = attempt.cloudCodeAssistFormatError;
|
||||||
const imageDimensionError = parseImageDimensionError(lastAssistant?.errorMessage ?? "");
|
const imageDimensionError = parseImageDimensionError(lastAssistant?.errorMessage ?? "");
|
||||||
|
|
||||||
@ -1237,10 +1269,7 @@ export async function runEmbeddedPiAgent(
|
|||||||
|
|
||||||
if (shouldRotate) {
|
if (shouldRotate) {
|
||||||
if (lastProfileId) {
|
if (lastProfileId) {
|
||||||
const reason =
|
const reason = timedOut ? "timeout" : assistantProfileFailureReason;
|
||||||
timedOut || assistantFailoverReason === "timeout"
|
|
||||||
? "timeout"
|
|
||||||
: (assistantFailoverReason ?? "unknown");
|
|
||||||
// Skip cooldown for timeouts: a timeout is model/network-specific,
|
// Skip cooldown for timeouts: a timeout is model/network-specific,
|
||||||
// not an auth issue. Marking the profile would poison fallback models
|
// not an auth issue. Marking the profile would poison fallback models
|
||||||
// on the same provider (e.g. gpt-5.3 timeout blocks gpt-5.2).
|
// on the same provider (e.g. gpt-5.3 timeout blocks gpt-5.2).
|
||||||
@ -1260,10 +1289,12 @@ export async function runEmbeddedPiAgent(
|
|||||||
|
|
||||||
const rotated = await advanceAuthProfile();
|
const rotated = await advanceAuthProfile();
|
||||||
if (rotated) {
|
if (rotated) {
|
||||||
|
await maybeBackoffBeforeOverloadFailover(assistantFailoverReason);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fallbackConfigured) {
|
if (fallbackConfigured) {
|
||||||
|
await maybeBackoffBeforeOverloadFailover(assistantFailoverReason);
|
||||||
// Prefer formatted error message (user-friendly) over raw errorMessage
|
// Prefer formatted error message (user-friendly) over raw errorMessage
|
||||||
const message =
|
const message =
|
||||||
(lastAssistant
|
(lastAssistant
|
||||||
@ -1369,6 +1400,7 @@ export async function runEmbeddedPiAgent(
|
|||||||
`embedded run done: runId=${params.runId} sessionId=${params.sessionId} durationMs=${Date.now() - started} aborted=${aborted}`,
|
`embedded run done: runId=${params.runId} sessionId=${params.sessionId} durationMs=${Date.now() - started} aborted=${aborted}`,
|
||||||
);
|
);
|
||||||
if (lastProfileId) {
|
if (lastProfileId) {
|
||||||
|
overloadFailoverAttempts = 0;
|
||||||
await markAuthProfileGood({
|
await markAuthProfileGood({
|
||||||
store: authStore,
|
store: authStore,
|
||||||
provider,
|
provider,
|
||||||
|
|||||||
@ -9,6 +9,7 @@ describe("mapFailoverReasonToProbeStatus", () => {
|
|||||||
it("keeps existing failover reason mappings", () => {
|
it("keeps existing failover reason mappings", () => {
|
||||||
expect(mapFailoverReasonToProbeStatus("auth")).toBe("auth");
|
expect(mapFailoverReasonToProbeStatus("auth")).toBe("auth");
|
||||||
expect(mapFailoverReasonToProbeStatus("rate_limit")).toBe("rate_limit");
|
expect(mapFailoverReasonToProbeStatus("rate_limit")).toBe("rate_limit");
|
||||||
|
expect(mapFailoverReasonToProbeStatus("overloaded")).toBe("rate_limit");
|
||||||
expect(mapFailoverReasonToProbeStatus("billing")).toBe("billing");
|
expect(mapFailoverReasonToProbeStatus("billing")).toBe("billing");
|
||||||
expect(mapFailoverReasonToProbeStatus("timeout")).toBe("timeout");
|
expect(mapFailoverReasonToProbeStatus("timeout")).toBe("timeout");
|
||||||
expect(mapFailoverReasonToProbeStatus("format")).toBe("format");
|
expect(mapFailoverReasonToProbeStatus("format")).toBe("format");
|
||||||
|
|||||||
@ -106,7 +106,7 @@ export function mapFailoverReasonToProbeStatus(reason?: string | null): AuthProb
|
|||||||
// surface in the auth bucket instead of showing as unknown.
|
// surface in the auth bucket instead of showing as unknown.
|
||||||
return "auth";
|
return "auth";
|
||||||
}
|
}
|
||||||
if (reason === "rate_limit") {
|
if (reason === "rate_limit" || reason === "overloaded") {
|
||||||
return "rate_limit";
|
return "rate_limit";
|
||||||
}
|
}
|
||||||
if (reason === "billing") {
|
if (reason === "billing") {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user