diff --git a/CHANGELOG.md b/CHANGELOG.md index 7c5065b2350..d1449580fb0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ Docs: https://docs.openclaw.ai ### Fixes +- Auth/Profiles: prevent cooldown deadline from being reset on every retry when the backoff is already saturated. Previously each failed request overwrote `cooldownUntil` with `now + backoffMs`, so a 60-minute cooldown was perpetually extended by cron or inbound retries, trapping the gateway in an unrecoverable loop that required manual `usageStats` deletion to resolve. (#23516) - Channels/Security: fail closed on missing provider group policy config by defaulting runtime group policy to `allowlist` (instead of inheriting `channels.defaults.groupPolicy`) when `channels.<provider>` is absent across message channels, and align runtime + security warnings/docs to the same fallback behavior (Slack, Discord, iMessage, Telegram, WhatsApp, Signal, LINE, Matrix, Mattermost, Google Chat, IRC, Nextcloud Talk, Feishu, and Zalo user flows; plus Discord message/native-command paths). (#23367) Thanks @bmendonca3. - Gateway/Onboarding: harden remote gateway onboarding defaults and guidance by defaulting discovered direct URLs to `wss://`, rejecting insecure non-loopback `ws://` targets in onboarding validation, and expanding remote-security remediation messaging across gateway client/call/doctor flows. (#23476) Thanks @bmendonca3. - CLI/Sessions: pass the configured sessions directory when resolving transcript paths in `agentCommand`, so custom `session.store` locations resume sessions reliably. Thanks @davidrudduck. 
diff --git a/src/agents/auth-profiles/usage.test.ts b/src/agents/auth-profiles/usage.test.ts index b5c92f64651..597cb2d7ad3 100644 --- a/src/agents/auth-profiles/usage.test.ts +++ b/src/agents/auth-profiles/usage.test.ts @@ -4,6 +4,7 @@ import { clearAuthProfileCooldown, clearExpiredCooldowns, isProfileInCooldown, + markAuthProfileFailure, resolveProfileUnusableUntil, } from "./usage.js"; @@ -347,3 +348,99 @@ describe("clearAuthProfileCooldown", () => { expect(store.usageStats).toBeUndefined(); }); }); + +describe("markAuthProfileFailure — cooldown is never reset to an earlier deadline", () => { + // Regression for https://github.com/openclaw/openclaw/issues/23516 + // When all providers are at saturation backoff (60 min) and retries fire every 30 min, + // each retry was resetting cooldownUntil to now+60m, preventing recovery. + + it("does not shorten an existing cooldown when a retry fires mid-window", async () => { + const now = 1_000_000; + // Profile already has 50 min remaining on its cooldown + const existingCooldownUntil = now + 50 * 60 * 1000; + const store = makeStore({ + "anthropic:default": { + cooldownUntil: existingCooldownUntil, + errorCount: 3, // already at saturation (60-min backoff) + lastFailureAt: now - 10 * 60 * 1000, + }, + }); + + vi.useFakeTimers(); + vi.setSystemTime(now); + try { + await markAuthProfileFailure({ + store, + profileId: "anthropic:default", + reason: "rate_limit", + }); + } finally { + vi.useRealTimers(); + } + + const stats = store.usageStats?.["anthropic:default"]; + // cooldownUntil must never end up earlier than the existing deadline. Note that now+60m (= now+3_600_000) is later than existingCooldownUntil (= now+3_000_000), so this assertion only guards against shortening. + // It should remain at the original deadline or be extended, never shortened. 
+ expect(stats?.cooldownUntil).toBeGreaterThanOrEqual(existingCooldownUntil); + }); + + it("does extend cooldownUntil when the new backoff would end later", async () => { + const now = 1_000_000; + // Profile has only 5 min remaining but the next backoff level gives 60 min + const existingCooldownUntil = now + 5 * 60 * 1000; + const store = makeStore({ + "anthropic:default": { + cooldownUntil: existingCooldownUntil, + errorCount: 2, // next step: 60-min backoff + lastFailureAt: now - 60_000, + }, + }); + + vi.useFakeTimers(); + vi.setSystemTime(now); + try { + await markAuthProfileFailure({ + store, + profileId: "anthropic:default", + reason: "rate_limit", + }); + } finally { + vi.useRealTimers(); + } + + const stats = store.usageStats?.["anthropic:default"]; + // now+60min > existingCooldownUntil (now+5min), so it should be extended + expect(stats?.cooldownUntil).toBeGreaterThan(existingCooldownUntil); + }); + + it("does not shorten an existing disabledUntil on a billing retry", async () => { + const now = 1_000_000; + // Profile already has 20 hours remaining on a billing disable + const existingDisabledUntil = now + 20 * 60 * 60 * 1000; + const store = makeStore({ + "anthropic:default": { + disabledUntil: existingDisabledUntil, + disabledReason: "billing", + errorCount: 5, + failureCounts: { billing: 5 }, + lastFailureAt: now - 60_000, + }, + }); + + vi.useFakeTimers(); + vi.setSystemTime(now); + try { + await markAuthProfileFailure({ + store, + profileId: "anthropic:default", + reason: "billing", + }); + } finally { + vi.useRealTimers(); + } + + const stats = store.usageStats?.["anthropic:default"]; + // disabledUntil must not have been shortened + expect(stats?.disabledUntil).toBeGreaterThanOrEqual(existingDisabledUntil); + }); +}); diff --git a/src/agents/auth-profiles/usage.ts b/src/agents/auth-profiles/usage.ts index 1bfda226873..509710f4f1f 100644 --- a/src/agents/auth-profiles/usage.ts +++ b/src/agents/auth-profiles/usage.ts @@ -287,11 +287,25 @@ function 
computeNextProfileUsageStats(params: { baseMs: params.cfgResolved.billingBackoffMs, maxMs: params.cfgResolved.billingMaxMs, }); - updatedStats.disabledUntil = params.now + backoffMs; + const newDisabledUntil = params.now + backoffMs; + // Only advance disabledUntil — never shorten an existing window. + // A retry that fires while the profile is already disabled must not reset + // the deadline to an earlier time; it may extend it if the new backoff is longer. + if (!params.existing.disabledUntil || newDisabledUntil > params.existing.disabledUntil) { + updatedStats.disabledUntil = newDisabledUntil; + } updatedStats.disabledReason = "billing"; } else { const backoffMs = calculateAuthProfileCooldownMs(nextErrorCount); - updatedStats.cooldownUntil = params.now + backoffMs; + const newCooldownUntil = params.now + backoffMs; + // Only advance cooldownUntil — never shorten an existing window. + // When the backoff saturates (60 min) and retries fire every 30 min, each + // retry was resetting cooldownUntil to now+60m, preventing the profile from + // ever recovering. We only write a new deadline when it is strictly later + // than the one already in the store. + if (!params.existing.cooldownUntil || newCooldownUntil > params.existing.cooldownUntil) { + updatedStats.cooldownUntil = newCooldownUntil; + } } return updatedStats;