diff --git a/CHANGELOG.md b/CHANGELOG.md index 0650d9343ba..bc7f9e97a11 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,6 +36,7 @@ Docs: https://docs.openclaw.ai - Install/Discord Voice: make `@discordjs/opus` an optional dependency so `openclaw` install/update no longer hard-fails when native Opus builds fail, while keeping `opusscript` as the runtime fallback decoder for Discord voice flows. (#23737, #23733, #23703) Thanks @jeadland, @Sheetaa, and @Breakyman. - Agents/Exec: honor explicit agent context when resolving `tools.exec` defaults for runs with opaque/non-agent session keys, so per-agent `host/security/ask` policies are applied consistently. (#11832) +- Agents/Auth profiles: infer `all profiles unavailable` failover reasons from active profile cooldown/disabled stats (instead of hardcoded `rate_limit`) so auth/billing OAuth outages surface accurately in fallback errors. (#23996) Thanks @DerpyNoodlez. - Security/Sessions: redact sensitive token patterns from `sessions_history` tool output and surface `contentRedacted` metadata when masking occurs. (#16928) Thanks @aether-ai-agent. - Sandbox/Docker: default sandbox container user to the workspace owner `uid:gid` when `agents.*.sandbox.docker.user` is unset, fixing non-root gateway file-tool permissions under capability-dropped containers. (#20979) - Doctor/Security: add an explicit warning that `approvals.exec.enabled=false` disables forwarding only, while enforcement remains driven by host-local `exec-approvals.json` policy. 
(#15047) diff --git a/src/agents/auth-profiles.ts b/src/agents/auth-profiles.ts index fc731e87a8b..42941e6b1c8 100644 --- a/src/agents/auth-profiles.ts +++ b/src/agents/auth-profiles.ts @@ -40,5 +40,6 @@ export { markAuthProfileCooldown, markAuthProfileFailure, markAuthProfileUsed, + resolveProfilesUnavailableReason, resolveProfileUnusableUntilForDisplay, } from "./auth-profiles/usage.js"; diff --git a/src/agents/auth-profiles/usage.test.ts b/src/agents/auth-profiles/usage.test.ts index 6baef101f54..3d7c2305d3f 100644 --- a/src/agents/auth-profiles/usage.test.ts +++ b/src/agents/auth-profiles/usage.test.ts @@ -5,6 +5,7 @@ import { clearExpiredCooldowns, isProfileInCooldown, markAuthProfileFailure, + resolveProfilesUnavailableReason, resolveProfileUnusableUntil, } from "./usage.js"; @@ -85,6 +86,101 @@ describe("isProfileInCooldown", () => { }); }); +describe("resolveProfilesUnavailableReason", () => { + it("prefers active disabledReason when profiles are disabled", () => { + const now = Date.now(); + const store = makeStore({ + "anthropic:default": { + disabledUntil: now + 60_000, + disabledReason: "billing", + }, + }); + + expect( + resolveProfilesUnavailableReason({ + store, + profileIds: ["anthropic:default"], + now, + }), + ).toBe("billing"); + }); + + it("uses recorded non-rate-limit failure counts for active cooldown windows", () => { + const now = Date.now(); + const store = makeStore({ + "anthropic:default": { + cooldownUntil: now + 60_000, + failureCounts: { auth: 3, rate_limit: 1 }, + }, + }); + + expect( + resolveProfilesUnavailableReason({ + store, + profileIds: ["anthropic:default"], + now, + }), + ).toBe("auth"); + }); + + it("falls back to rate_limit when active cooldown has no reason history", () => { + const now = Date.now(); + const store = makeStore({ + "anthropic:default": { + cooldownUntil: now + 60_000, + }, + }); + + expect( + resolveProfilesUnavailableReason({ + store, + profileIds: ["anthropic:default"], + now, + }), + ).toBe("rate_limit"); 
/**
 * Known failure reasons in descending tie-break priority: when two reasons
 * accumulate the same score, the one listed first wins.
 */
const FAILURE_REASON_PRIORITY: AuthProfileFailureReason[] = [
  "auth",
  "billing",
  "format",
  "model_not_found",
  "timeout",
  "rate_limit",
  "unknown",
];
/** Membership check used to discard reason strings that are not in the list above. */
const FAILURE_REASON_SET = new Set<AuthProfileFailureReason>(FAILURE_REASON_PRIORITY);
/** Reason -> index in `FAILURE_REASON_PRIORITY` (lower index = higher priority). */
const FAILURE_REASON_ORDER = new Map<AuthProfileFailureReason, number>(
  FAILURE_REASON_PRIORITY.map(
    (reason, index): [AuthProfileFailureReason, number] => [reason, index],
  ),
);

/**
 * Whether `until` describes a window that is still open at `now`.
 *
 * Missing, non-numeric, non-finite, and non-positive timestamps are treated
 * as "no window".
 */
function isActiveUnusableWindow(until: number | undefined, now: number): boolean {
  return typeof until === "number" && Number.isFinite(until) && until > 0 && now < until;
}

/**
 * Infer the most likely reason all candidate profiles are currently unavailable.
 *
 * Each profile is looked up in `store.usageStats` and contributes to a
 * per-reason score:
 * - An active `disabledUntil` window with a recognised `disabledReason` adds
 *   1000 to that reason, and nothing else is counted for that profile.
 * - Otherwise, an active `cooldownUntil` window adds every recognised entry of
 *   `failureCounts` to its reason. If no entry yields a usable score, the
 *   profile counts once as `rate_limit`.
 * - Profiles without stats or without an active window are ignored.
 *
 * @param params.store - Auth profile store whose `usageStats` are inspected.
 * @param params.profileIds - Candidate profile ids to consider.
 * @param params.now - Reference time in ms; defaults to `Date.now()`.
 * @returns The reason with the highest score (ties resolved by
 *   `FAILURE_REASON_PRIORITY` order), or `null` when no profile contributed.
 */
export function resolveProfilesUnavailableReason(params: {
  store: AuthProfileStore;
  profileIds: readonly string[];
  now?: number;
}): AuthProfileFailureReason | null {
  const now = params.now ?? Date.now();
  const scores = new Map<AuthProfileFailureReason, number>();

  // Returns true only when the contribution was actually recorded, so callers
  // can tell a rejected value (unknown reason, non-finite or non-positive
  // weight) from a counted one.
  const addScore = (reason: AuthProfileFailureReason, value: number): boolean => {
    if (!FAILURE_REASON_SET.has(reason) || !Number.isFinite(value) || value <= 0) {
      return false;
    }
    scores.set(reason, (scores.get(reason) ?? 0) + value);
    return true;
  };

  for (const profileId of params.profileIds) {
    const stats = params.store.usageStats?.[profileId];
    if (!stats) {
      continue;
    }

    // Disabled reasons are explicit and high-signal; weight heavily.
    if (
      isActiveUnusableWindow(stats.disabledUntil, now) &&
      stats.disabledReason &&
      addScore(stats.disabledReason, 1_000)
    ) {
      continue;
    }

    if (!isActiveUnusableWindow(stats.cooldownUntil, now)) {
      continue;
    }

    let recordedReason = false;
    for (const [rawReason, rawCount] of Object.entries(stats.failureCounts ?? {})) {
      if (typeof rawCount !== "number") {
        continue;
      }
      // Only a count that addScore accepted may suppress the fallback below;
      // a NaN/Infinity count previously flagged the profile as "recorded"
      // without contributing any score.
      if (addScore(rawReason as AuthProfileFailureReason, rawCount)) {
        recordedReason = true;
      }
    }
    if (!recordedReason) {
      // Active cooldown without usable reason history: count it as rate_limit.
      addScore("rate_limit", 1);
    }
  }

  let best: AuthProfileFailureReason | null = null;
  let bestScore = -1;
  let bestPriority = Number.MAX_SAFE_INTEGER;
  // Iterate recorded scores directly; the explicit priority comparison makes
  // the tie-break independent of iteration order.
  for (const [reason, score] of scores) {
    const priority = FAILURE_REASON_ORDER.get(reason) ?? Number.MAX_SAFE_INTEGER;
    if (score > bestScore || (score === bestScore && priority < bestPriority)) {
      best = reason;
      bestScore = score;
      bestPriority = priority;
    }
  }
  return best;
}
const mockedEnsureAuthProfileStore = vi.mocked(ensureAuthProfileStore); const mockedGetSoonestCooldownExpiry = vi.mocked(getSoonestCooldownExpiry); const mockedIsProfileInCooldown = vi.mocked(isProfileInCooldown); +const mockedResolveProfilesUnavailableReason = vi.mocked(resolveProfilesUnavailableReason); const mockedResolveAuthProfileOrder = vi.mocked(resolveAuthProfileOrder); const makeCfg = makeModelFallbackCfg; @@ -98,6 +101,7 @@ describe("runWithModelFallback – probe logic", () => { mockedIsProfileInCooldown.mockImplementation((_store, profileId: string) => { return profileId.startsWith("openai"); }); + mockedResolveProfilesUnavailableReason.mockReturnValue("rate_limit"); }); afterEach(() => { @@ -119,6 +123,22 @@ describe("runWithModelFallback – probe logic", () => { expectFallbackUsed(result, run); }); + it("uses inferred unavailable reason when skipping a cooldowned primary model", async () => { + const cfg = makeCfg(); + const expiresIn30Min = NOW + 30 * 60 * 1000; + mockedGetSoonestCooldownExpiry.mockReturnValue(expiresIn30Min); + mockedResolveProfilesUnavailableReason.mockReturnValue("billing"); + + const run = vi.fn().mockResolvedValue("ok"); + + const result = await runPrimaryCandidate(cfg, run); + + expect(result.result).toBe("ok"); + expect(run).toHaveBeenCalledTimes(1); + expect(run).toHaveBeenCalledWith("anthropic", "claude-haiku-3-5"); + expect(result.attempts[0]?.reason).toBe("billing"); + }); + it("probes primary model when within 2-min margin of cooldown expiry", async () => { const cfg = makeCfg(); // Cooldown expires in 1 minute — within 2-min probe margin diff --git a/src/agents/model-fallback.test.ts b/src/agents/model-fallback.test.ts index f7404306863..add5560ea24 100644 --- a/src/agents/model-fallback.test.ts +++ b/src/agents/model-fallback.test.ts @@ -348,6 +348,49 @@ describe("runWithModelFallback", () => { expect(result.attempts[0]?.reason).toBe("rate_limit"); }); + it("propagates disabled reason when all profiles are unavailable", 
async () => { + const provider = `disabled-test-${crypto.randomUUID()}`; + const profileId = `${provider}:default`; + const now = Date.now(); + + const store: AuthProfileStore = { + version: AUTH_STORE_VERSION, + profiles: { + [profileId]: { + type: "api_key", + provider, + key: "test-key", + }, + }, + usageStats: { + [profileId]: { + disabledUntil: now + 5 * 60_000, + disabledReason: "billing", + failureCounts: { rate_limit: 4 }, + }, + }, + }; + + const cfg = makeProviderFallbackCfg(provider); + const run = vi.fn().mockImplementation(async (providerId, modelId) => { + if (providerId === "fallback") { + return "ok"; + } + throw new Error(`unexpected provider: ${providerId}/${modelId}`); + }); + + const result = await runWithStoredAuth({ + cfg, + store, + provider, + run, + }); + + expect(result.result).toBe("ok"); + expect(run.mock.calls).toEqual([["fallback", "ok-model"]]); + expect(result.attempts[0]?.reason).toBe("billing"); + }); + it("does not skip when any profile is available", async () => { const provider = `cooldown-mixed-${crypto.randomUUID()}`; const profileA = `${provider}:a`; diff --git a/src/agents/model-fallback.ts b/src/agents/model-fallback.ts index 609966c5b51..7a7a192e8d4 100644 --- a/src/agents/model-fallback.ts +++ b/src/agents/model-fallback.ts @@ -3,6 +3,7 @@ import { ensureAuthProfileStore, getSoonestCooldownExpiry, isProfileInCooldown, + resolveProfilesUnavailableReason, resolveAuthProfileOrder, } from "./auth-profiles.js"; import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js"; @@ -342,12 +343,18 @@ export async function runWithModelFallback(params: { profileIds, }); if (!shouldProbe) { + const inferredReason = + resolveProfilesUnavailableReason({ + store: authStore, + profileIds, + now, + }) ?? 
"rate_limit"; // Skip without attempting attempts.push({ provider: candidate.provider, model: candidate.model, error: `Provider ${candidate.provider} is in cooldown (all profiles unavailable)`, - reason: "rate_limit", + reason: inferredReason, }); continue; } diff --git a/src/agents/pi-embedded-runner.run-embedded-pi-agent.auth-profile-rotation.test.ts b/src/agents/pi-embedded-runner.run-embedded-pi-agent.auth-profile-rotation.test.ts index 974bd181726..b254df7430b 100644 --- a/src/agents/pi-embedded-runner.run-embedded-pi-agent.auth-profile-rotation.test.ts +++ b/src/agents/pi-embedded-runner.run-embedded-pi-agent.auth-profile-rotation.test.ts @@ -4,6 +4,7 @@ import path from "node:path"; import type { AssistantMessage } from "@mariozechner/pi-ai"; import { beforeAll, beforeEach, describe, expect, it, vi } from "vitest"; import type { OpenClawConfig } from "../config/config.js"; +import type { AuthProfileFailureReason } from "./auth-profiles.js"; import type { EmbeddedRunAttemptResult } from "./pi-embedded-runner/run/types.js"; const runEmbeddedAttemptMock = vi.fn<(params: unknown) => Promise>(); @@ -112,7 +113,16 @@ const writeAuthStore = async ( agentDir: string, opts?: { includeAnthropic?: boolean; - usageStats?: Record; + usageStats?: Record< + string, + { + lastUsed?: number; + cooldownUntil?: number; + disabledUntil?: number; + disabledReason?: AuthProfileFailureReason; + failureCounts?: Partial>; + } + >; }, ) => { const authPath = path.join(agentDir, "auth-profiles.json"); @@ -184,7 +194,17 @@ async function runAutoPinnedOpenAiTurn(params: { async function readUsageStats(agentDir: string) { const stored = JSON.parse( await fs.readFile(path.join(agentDir, "auth-profiles.json"), "utf-8"), - ) as { usageStats?: Record }; + ) as { + usageStats?: Record< + string, + { + lastUsed?: number; + cooldownUntil?: number; + disabledUntil?: number; + disabledReason?: AuthProfileFailureReason; + } + >; + }; return stored.usageStats ?? 
{}; } @@ -496,6 +516,50 @@ describe("runEmbeddedPiAgent auth profile rotation", () => { }); }); + it("fails over with disabled reason when all profiles are unavailable", async () => { + await withTimedAgentWorkspace(async ({ agentDir, workspaceDir, now }) => { + await writeAuthStore(agentDir, { + usageStats: { + "openai:p1": { + lastUsed: 1, + disabledUntil: now + 60 * 60 * 1000, + disabledReason: "billing", + failureCounts: { rate_limit: 4 }, + }, + "openai:p2": { + lastUsed: 2, + disabledUntil: now + 60 * 60 * 1000, + disabledReason: "billing", + }, + }, + }); + + await expect( + runEmbeddedPiAgent({ + sessionId: "session:test", + sessionKey: "agent:test:disabled-failover", + sessionFile: path.join(workspaceDir, "session.jsonl"), + workspaceDir, + agentDir, + config: makeConfig({ fallbacks: ["openai/mock-2"] }), + prompt: "hello", + provider: "openai", + model: "mock-1", + authProfileIdSource: "auto", + timeoutMs: 5_000, + runId: "run:disabled-failover", + }), + ).rejects.toMatchObject({ + name: "FailoverError", + reason: "billing", + provider: "openai", + model: "mock-1", + }); + + expect(runEmbeddedAttemptMock).not.toHaveBeenCalled(); + }); + }); + it("fails over when auth is unavailable and fallbacks are configured", async () => { const previousOpenAiKey = process.env.OPENAI_API_KEY; delete process.env.OPENAI_API_KEY; diff --git a/src/agents/pi-embedded-runner/run.ts b/src/agents/pi-embedded-runner/run.ts index d326ec556c2..9ae15591b1b 100644 --- a/src/agents/pi-embedded-runner/run.ts +++ b/src/agents/pi-embedded-runner/run.ts @@ -12,6 +12,7 @@ import { markAuthProfileFailure, markAuthProfileGood, markAuthProfileUsed, + resolveProfilesUnavailableReason, } from "../auth-profiles.js"; import { CONTEXT_WINDOW_HARD_MIN_TOKENS, @@ -364,9 +365,18 @@ export async function runEmbeddedPiAgent( const resolveAuthProfileFailoverReason = (params: { allInCooldown: boolean; message: string; + profileIds?: Array; }): FailoverReason => { if (params.allInCooldown) { - return 
"rate_limit"; + const profileIds = (params.profileIds ?? profileCandidates).filter( + (id): id is string => typeof id === "string" && id.length > 0, + ); + return ( + resolveProfilesUnavailableReason({ + store: authStore, + profileIds, + }) ?? "rate_limit" + ); } const classified = classifyFailoverReason(params.message); return classified ?? "auth"; @@ -385,6 +395,7 @@ export async function runEmbeddedPiAgent( const reason = resolveAuthProfileFailoverReason({ allInCooldown: params.allInCooldown, message, + profileIds: profileCandidates, }); if (fallbackConfigured) { throw new FailoverError(message, {