From dd78c52f17080d9c889f9119dd2f9de2f88a7abf Mon Sep 17 00:00:00 2001 From: Jarvis Date: Fri, 13 Mar 2026 21:12:53 +0800 Subject: [PATCH 1/2] fix(auth): preserve per-model cooldown windows --- ...th-profiles.markauthprofilefailure.test.ts | 15 +++++ src/agents/auth-profiles/types.ts | 2 + src/agents/auth-profiles/usage.test.ts | 58 ++++++++++++++++++- src/agents/auth-profiles/usage.ts | 40 ++++++++++++- src/agents/model-fallback.test.ts | 44 ++++++++++++++ src/agents/model-fallback.ts | 4 +- ...pi-agent.auth-profile-rotation.e2e.test.ts | 41 +++++++++++++ src/agents/pi-embedded-runner/run.ts | 11 +++- 8 files changed, 208 insertions(+), 7 deletions(-) diff --git a/src/agents/auth-profiles.markauthprofilefailure.test.ts b/src/agents/auth-profiles.markauthprofilefailure.test.ts index 5c4d73197b3..dafd588031e 100644 --- a/src/agents/auth-profiles.markauthprofilefailure.test.ts +++ b/src/agents/auth-profiles.markauthprofilefailure.test.ts @@ -114,6 +114,21 @@ describe("markAuthProfileFailure", () => { expect(reloaded.usageStats?.["anthropic:default"]?.cooldownUntil).toBe(firstCooldownUntil); }); }); + it("records the model that triggered a rate-limit cooldown", async () => { + await withAuthProfileStore(async ({ agentDir, store }) => { + await markAuthProfileFailure({ + store, + profileId: "anthropic:default", + reason: "rate_limit", + modelId: "claude-opus-4-6", + agentDir, + }); + + const stats = store.usageStats?.["anthropic:default"]; + expect(stats?.cooldownReason).toBe("rate_limit"); + expect(stats?.cooldownModel).toBe("claude-opus-4-6"); + }); + }); it("records overloaded failures in the cooldown bucket", async () => { await withAuthProfileStore(async ({ agentDir, store }) => { await markAuthProfileFailure({ diff --git a/src/agents/auth-profiles/types.ts b/src/agents/auth-profiles/types.ts index 127a444939b..848268385a2 100644 --- a/src/agents/auth-profiles/types.ts +++ b/src/agents/auth-profiles/types.ts @@ -51,6 +51,8 @@ export type AuthProfileFailureReason = export type ProfileUsageStats = { lastUsed?: number; cooldownUntil?: number; + cooldownReason?: AuthProfileFailureReason; + cooldownModel?: string; disabledUntil?: number; disabledReason?: AuthProfileFailureReason; errorCount?: number; diff --git a/src/agents/auth-profiles/usage.test.ts b/src/agents/auth-profiles/usage.test.ts index 6dd5697cc99..328b729d2e1 100644 --- a/src/agents/auth-profiles/usage.test.ts +++ b/src/agents/auth-profiles/usage.test.ts @@ -94,6 +94,35 @@ describe("isProfileInCooldown", () => { expect(isProfileInCooldown(store, "anthropic:default")).toBe(true); }); + it("does not block a different model when the cooldown came from a rate limit on another model", () => { + const store = makeStore({ + "anthropic:default": { + cooldownUntil: Date.now() + 60_000, + cooldownReason: "rate_limit", + cooldownModel: "claude-opus-4-6", + }, + }); + expect(isProfileInCooldown(store, "anthropic:default", undefined, "claude-sonnet-4-6")).toBe( + false, + ); + expect(isProfileInCooldown(store, "anthropic:default", undefined, "claude-opus-4-6")).toBe( + true, + ); + }); + + it("still blocks a different model for non-rate-limit cooldowns", () => { + const store = makeStore({ + "anthropic:default": { + cooldownUntil: Date.now() + 60_000, + cooldownReason: "overloaded", + cooldownModel: "claude-opus-4-6", + }, + }); + expect(isProfileInCooldown(store, "anthropic:default", undefined, "claude-sonnet-4-6")).toBe( + true, + ); + }); + it("returns false when cooldownUntil has passed", () => { const store = makeStore({ "anthropic:default": { cooldownUntil: Date.now() - 1_000 }, @@ -538,7 +567,8 @@ describe("markAuthProfileFailure — active windows do not extend on retry", () async function markFailureAt(params: { store: ReturnType; now: number; - reason: "rate_limit" | "billing" | "auth_permanent"; + reason: "rate_limit" | "billing" | "auth_permanent" | "overloaded"; + modelId?: string; }): Promise { vi.useFakeTimers(); vi.setSystemTime(params.now); @@ -547,6 +577,7 @@ describe("markAuthProfileFailure — active windows do not extend on retry", () store: params.store, profileId: "anthropic:default", reason: params.reason, + modelId: params.modelId, }); } finally { vi.useRealTimers(); @@ -608,6 +639,31 @@ describe("markAuthProfileFailure — active windows do not extend on retry", () }); } + it("keeps rate-limit cooldown metadata unchanged while the active window is still in effect", async () => { + const now = 1_000_000; + const existingStats: WindowStats = { + cooldownUntil: now + 50 * 60 * 1000, + cooldownReason: "rate_limit", + cooldownModel: "claude-opus-4-6", + errorCount: 3, + failureCounts: { rate_limit: 3 }, + lastFailureAt: now - 10 * 60 * 1000, + }; + const store = makeStore({ "anthropic:default": existingStats }); + + await markFailureAt({ + store, + now, + reason: "overloaded", + modelId: "claude-sonnet-4-6", + }); + + const stats = store.usageStats?.["anthropic:default"]; + expect(stats?.cooldownUntil).toBe(existingStats.cooldownUntil); + expect(stats?.cooldownReason).toBe("rate_limit"); + expect(stats?.cooldownModel).toBe("claude-opus-4-6"); + }); + // When a cooldown/disabled window expires, the error count resets to prevent // stale counters from escalating the next cooldown (the root cause of // infinite cooldown loops — see #40989). The next failure should compute diff --git a/src/agents/auth-profiles/usage.ts b/src/agents/auth-profiles/usage.ts index 20e1cbaa497..57a667dbf00 100644 --- a/src/agents/auth-profiles/usage.ts +++ b/src/agents/auth-profiles/usage.ts @@ -44,6 +44,7 @@ export function isProfileInCooldown( store: AuthProfileStore, profileId: string, now?: number, + forModel?: string, ): boolean { if (isAuthCooldownBypassedForProvider(store.profiles[profileId]?.provider)) { return false; @@ -54,6 +55,16 @@ export function isProfileInCooldown( } const unusableUntil = resolveProfileUnusableUntil(stats); const ts = now ?? Date.now(); + if ( + stats.cooldownReason === "rate_limit" && + typeof forModel === "string" && + forModel.trim().length > 0 && + typeof stats.cooldownModel === "string" && + stats.cooldownModel.trim().length > 0 && + stats.cooldownModel.trim() !== forModel.trim() + ) { + return false; + } return unusableUntil ? ts < unusableUntil : false; } @@ -397,6 +408,7 @@ function computeNextProfileUsageStats(params: { existing: ProfileUsageStats; now: number; reason: AuthProfileFailureReason; + modelId?: string; cfgResolved: ResolvedAuthCooldownConfig; }): ProfileUsageStats { const windowMs = params.cfgResolved.failureWindowMs; @@ -442,15 +454,36 @@ function computeNextProfileUsageStats(params: { recomputedUntil: params.now + backoffMs, }); updatedStats.disabledReason = params.reason; + updatedStats.cooldownReason = undefined; + updatedStats.cooldownModel = undefined; } else { const backoffMs = calculateAuthProfileCooldownMs(nextErrorCount); + const existingCooldownUntil = params.existing.cooldownUntil; + const keepsExistingCooldownWindow = + typeof existingCooldownUntil === "number" && + Number.isFinite(existingCooldownUntil) && + existingCooldownUntil > params.now; // Keep active cooldown windows immutable so retries within the window // cannot push recovery further out. updatedStats.cooldownUntil = keepActiveWindowOrRecompute({ - existingUntil: params.existing.cooldownUntil, + existingUntil: existingCooldownUntil, now: params.now, recomputedUntil: params.now + backoffMs, }); + if (keepsExistingCooldownWindow) { + // Keep metadata aligned with the preserved active window so a later + // transient failure cannot widen a model-scoped cooldown back to profile-wide. + updatedStats.cooldownReason = params.existing.cooldownReason; + updatedStats.cooldownModel = params.existing.cooldownModel; + } else { + updatedStats.cooldownReason = params.reason; + updatedStats.cooldownModel = + params.reason === "rate_limit" && + typeof params.modelId === "string" && + params.modelId.trim().length > 0 + ? params.modelId.trim() + : undefined; + } } return updatedStats; @@ -465,11 +498,12 @@ export async function markAuthProfileFailure(params: { store: AuthProfileStore; profileId: string; reason: AuthProfileFailureReason; + modelId?: string; cfg?: OpenClawConfig; agentDir?: string; runId?: string; }): Promise { - const { store, profileId, reason, agentDir, cfg, runId } = params; + const { store, profileId, reason, modelId, agentDir, cfg, runId } = params; const profile = store.profiles[profileId]; if (!profile || isAuthCooldownBypassedForProvider(profile.provider)) { return; @@ -497,6 +531,7 @@ export async function markAuthProfileFailure(params: { existing: previousStats ?? {}, now, reason, + modelId, cfgResolved, }); nextStats = computed; @@ -535,6 +570,7 @@ export async function markAuthProfileFailure(params: { existing: previousStats ?? {}, now, reason, + modelId, cfgResolved, }); nextStats = computed; diff --git a/src/agents/model-fallback.test.ts b/src/agents/model-fallback.test.ts index f8422b4aa14..0be05f83657 100644 --- a/src/agents/model-fallback.test.ts +++ b/src/agents/model-fallback.test.ts @@ -1211,6 +1211,50 @@ describe("runWithModelFallback", () => { }); }); + it("does not skip a provider when the stored rate-limit cooldown is for a different model", async () => { + const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-test-")); + const store: AuthProfileStore = { + version: AUTH_STORE_VERSION, + profiles: { + "anthropic:default": { type: "api_key", provider: "anthropic", key: "test-key" }, + "groq:default": { type: "api_key", provider: "groq", key: "test-key" }, + }, + usageStats: { + "anthropic:default": { + cooldownUntil: Date.now() + 300000, + cooldownReason: "rate_limit", + cooldownModel: "claude-opus-4-6", + }, + }, + }; + saveAuthProfileStore(store, tmpDir); + + const cfg = makeCfg({ + agents: { + defaults: { + model: { + primary: "anthropic/claude-sonnet-4-5", + fallbacks: ["groq/llama-3.3-70b-versatile"], + }, + }, + }, + }); + + const run = vi.fn().mockResolvedValueOnce("sonnet success"); + + const result = await runWithModelFallback({ + cfg, + provider: "anthropic", + model: "claude-sonnet-4-5", + run, + agentDir: tmpDir, + }); + + expect(result.result).toBe("sonnet success"); + expect(run).toHaveBeenCalledTimes(1); + expect(run).toHaveBeenNthCalledWith(1, "anthropic", "claude-sonnet-4-5"); + }); + it("skips same-provider models on auth cooldown but still tries no-profile fallback providers", async () => { const { dir } = await makeAuthStoreWithCooldown("anthropic", "auth"); const cfg = makeCfg({ diff --git a/src/agents/model-fallback.ts b/src/agents/model-fallback.ts index d14ede7658b..aece96a153b 100644 --- a/src/agents/model-fallback.ts +++ b/src/agents/model-fallback.ts @@ -542,7 +542,9 @@ export async function runWithModelFallback(params: { store: authStore, provider: candidate.provider, }); - const isAnyProfileAvailable = profileIds.some((id) => !isProfileInCooldown(authStore, id)); + const isAnyProfileAvailable = profileIds.some( + (id) => !isProfileInCooldown(authStore, id, undefined, candidate.model), + ); if (profileIds.length > 0 && !isAnyProfileAvailable) { // All profiles for this provider are in cooldown. diff --git a/src/agents/pi-embedded-runner.run-embedded-pi-agent.auth-profile-rotation.e2e.test.ts b/src/agents/pi-embedded-runner.run-embedded-pi-agent.auth-profile-rotation.e2e.test.ts index 0aa665e0635..cb711a48913 100644 --- a/src/agents/pi-embedded-runner.run-embedded-pi-agent.auth-profile-rotation.e2e.test.ts +++ b/src/agents/pi-embedded-runner.run-embedded-pi-agent.auth-profile-rotation.e2e.test.ts @@ -218,6 +218,8 @@ const writeAuthStore = async ( { lastUsed?: number; cooldownUntil?: number; + cooldownReason?: AuthProfileFailureReason; + cooldownModel?: string; disabledUntil?: number; disabledReason?: AuthProfileFailureReason; failureCounts?: Partial>; @@ -332,6 +334,8 @@ async function readUsageStats(agentDir: string) { { lastUsed?: number; cooldownUntil?: number; + cooldownReason?: AuthProfileFailureReason; + cooldownModel?: string; disabledUntil?: number; disabledReason?: AuthProfileFailureReason; } @@ -1126,6 +1130,43 @@ describe("runEmbeddedPiAgent auth profile rotation", () => { }); }); + it("does not block a locked profile when its rate-limit cooldown came from a different model", async () => { + await withTimedAgentWorkspace(async ({ agentDir, workspaceDir, now }) => { + await writeAuthStore(agentDir, { + usageStats: { + "openai:p1": { + lastUsed: 1, + cooldownUntil: now + 60 * 60 * 1000, + cooldownReason: "rate_limit", + cooldownModel: "mock-2", + }, + "openai:p2": { lastUsed: 2 }, + }, + }); + + mockSingleSuccessfulAttempt(); + + const result = await runEmbeddedPiAgent({ + sessionId: "session:test", + sessionKey: "agent:test:model-specific-cooldown-lock", + sessionFile: path.join(workspaceDir, "session.jsonl"), + workspaceDir, + agentDir, + config: makeConfig(), + prompt: "hello", + provider: "openai", + model: "mock-1", + authProfileId: "openai:p1", + authProfileIdSource: "user", + timeoutMs: 5_000, + runId: "run:model-specific-cooldown-lock", + }); + + expect(runEmbeddedAttemptMock).toHaveBeenCalledTimes(1); + expect(result.payloads?.[0]?.text ?? "").toContain("ok"); + }); + }); + it("treats agent-level fallbacks as configured when defaults have none", async () => { await withTimedAgentWorkspace(async ({ agentDir, workspaceDir, now }) => { await writeAuthStore(agentDir, { diff --git a/src/agents/pi-embedded-runner/run.ts b/src/agents/pi-embedded-runner/run.ts index 32afe874442..6db2e3167bd 100644 --- a/src/agents/pi-embedded-runner/run.ts +++ b/src/agents/pi-embedded-runner/run.ts @@ -638,7 +638,7 @@ export async function runEmbeddedPiAgent( let nextIndex = profileIndex + 1; while (nextIndex < profileCandidates.length) { const candidate = profileCandidates[nextIndex]; - if (candidate && isProfileInCooldown(authStore, candidate)) { + if (candidate && isProfileInCooldown(authStore, candidate, undefined, modelId)) { nextIndex += 1; continue; } @@ -665,7 +665,9 @@ export async function runEmbeddedPiAgent( ); const allAutoProfilesInCooldown = autoProfileCandidates.length > 0 && - autoProfileCandidates.every((candidate) => isProfileInCooldown(authStore, candidate)); + autoProfileCandidates.every((candidate) => + isProfileInCooldown(authStore, candidate, undefined, modelId), + ); const unavailableReason = allAutoProfilesInCooldown ? (resolveProfilesUnavailableReason({ store: authStore, @@ -684,7 +686,9 @@ export async function runEmbeddedPiAgent( while (profileIndex < profileCandidates.length) { const candidate = profileCandidates[profileIndex]; const inCooldown = - candidate && candidate !== lockedProfileId && isProfileInCooldown(authStore, candidate); + candidate && + candidate !== lockedProfileId && + isProfileInCooldown(authStore, candidate, undefined, modelId); if (inCooldown) { if (allowTransientCooldownProbe && !didTransientCooldownProbe) { didTransientCooldownProbe = true; @@ -763,6 +767,7 @@ export async function runEmbeddedPiAgent( store: authStore, profileId, reason, + modelId, cfg: params.config, agentDir, runId: params.runId, From 87c4111a09fc632cfe6e59de735f98618338f3fb Mon Sep 17 00:00:00 2001 From: Jarvis Date: Sat, 14 Mar 2026 13:06:51 +0800 Subject: [PATCH 2/2] fix(auth): honor disabled windows in cooldown checks --- src/agents/auth-profiles/usage.test.ts | 20 ++++++++++++++++++++ src/agents/auth-profiles/usage.ts | 3 +++ 2 files changed, 23 insertions(+) diff --git a/src/agents/auth-profiles/usage.test.ts b/src/agents/auth-profiles/usage.test.ts index 328b729d2e1..28309551a6e 100644 --- a/src/agents/auth-profiles/usage.test.ts +++ b/src/agents/auth-profiles/usage.test.ts @@ -123,6 +123,22 @@ describe("isProfileInCooldown", () => { ); }); + it("still blocks when a disabled window is active even if the stored rate-limit cooldown is for another model", () => { + const store = makeStore({ + "anthropic:default": { + cooldownUntil: Date.now() + 60_000, + cooldownReason: "rate_limit", + cooldownModel: "claude-opus-4-6", + disabledUntil: Date.now() + 5 * 60_000, + disabledReason: "billing", + }, + }); + + expect(isProfileInCooldown(store, "anthropic:default", undefined, "claude-sonnet-4-6")).toBe( + true, + ); + }); + it("returns false when cooldownUntil has passed", () => { const store = makeStore({ "anthropic:default": { cooldownUntil: Date.now() - 1_000 }, @@ -368,6 +384,8 @@ describe("clearExpiredCooldowns", () => { cooldownUntil: Date.now() - 1_000, disabledUntil: future, disabledReason: "billing", + cooldownReason: "rate_limit", + cooldownModel: "claude-opus-4-6", errorCount: 5, failureCounts: { rate_limit: 3, billing: 2 }, }, @@ -378,6 +396,8 @@ describe("clearExpiredCooldowns", () => { const stats = store.usageStats?.["anthropic:default"]; // cooldownUntil cleared expect(stats?.cooldownUntil).toBeUndefined(); + expect(stats?.cooldownReason).toBeUndefined(); + expect(stats?.cooldownModel).toBeUndefined(); // disabledUntil still active — not touched expect(stats?.disabledUntil).toBe(future); expect(stats?.disabledReason).toBe("billing"); diff --git a/src/agents/auth-profiles/usage.ts b/src/agents/auth-profiles/usage.ts index 57a667dbf00..a08ea5ecf8d 100644 --- a/src/agents/auth-profiles/usage.ts +++ b/src/agents/auth-profiles/usage.ts @@ -56,6 +56,7 @@ export function isProfileInCooldown( const unusableUntil = resolveProfileUnusableUntil(stats); const ts = now ?? Date.now(); if ( + !isActiveUnusableWindow(stats.disabledUntil, ts) && stats.cooldownReason === "rate_limit" && typeof forModel === "string" && forModel.trim().length > 0 && @@ -223,6 +224,8 @@ export function clearExpiredCooldowns(store: AuthProfileStore, now?: number): bo if (cooldownExpired) { stats.cooldownUntil = undefined; + stats.cooldownReason = undefined; + stats.cooldownModel = undefined; profileMutated = true; } if (disabledExpired) {