feat: add fallbackOnErrors config for model fallback behavior

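- Add FallbackOnErrorCodes type: "all", "default", or a number array
- Add fallbackOnErrors field to AgentModelConfig
- Add shouldTriggerFallback() and coerceToFailoverErrorWithConfig() functions
- Support custom HTTP status codes that trigger fallback
- Accept fallbackOnErrors in runWithModelFallback() params (declared only; not yet applied there, see the TODO on the coerceToFailoverErrorWithConfig import)

Users can configure which errors trigger fallback:
- default: Server errors + rate limits + timeouts + not found (500, 502, 503, 504, 429, 408, 404)
- all: All errors including client errors (400, 401, 403)
- number[]: Custom status codes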
2026-03-21 10:54:46 +08:00 · 2026-03-21 10:54:46 +08:00 · e138e90031
commit e138e90031
parent 5e417b44e1
3 changed files with 139 additions and 0 deletions
--- a/src/agents/failover-error.ts
+++ b/src/agents/failover-error.ts
@ -1,3 +1,4 @@
+import type { FallbackOnErrorCodes } from "../config/types.agents-shared.js";
 import { readErrorName } from "../infra/errors.js";
 import {
  classifyFailoverReason,
@ -328,3 +329,112 @@ export function coerceToFailoverError(
    cause: err instanceof Error ? err : undefined,
  });
 }
+
+/**
+ * Default HTTP status codes that trigger fallback.
+ * - Server errors: 500, 502, 503, 504
+ * - Rate limits: 429
+ * - Timeouts: 408
+ * - Not found: 404 (model may have been removed)
+ */
+const DEFAULT_FALLBACK_STATUS_CODES: readonly number[] = [408, 429, 500, 502, 503, 504, 404];
+
+/**
+ * Check if an error should trigger fallback based on the configured error codes.
+ *
+ * Resolution order:
+ * 1. "all" always triggers fallback, whatever status (if any) the error carries.
+ *    There is deliberately no allow-list for this mode: a fixed list would
+ *    silently skip statuses it does not name (e.g. 409, 413, 422, 501) even
+ *    though the setting is documented as "all errors".
+ * 2. When the error exposes no HTTP status:
+ *    - a custom code list cannot match, so fallback is not triggered;
+ *    - otherwise (undefined / "default") fallback is triggered only when
+ *      resolveFailoverReasonFromError() returns a non-null reason.
+ * 3. When a status is present it must appear in the custom list, or in
+ *    DEFAULT_FALLBACK_STATUS_CODES for undefined / "default".
+ *
+ * @param err - The error to check
+ * @param fallbackOnErrors - Configuration for which errors should trigger fallback
+ *   - "default" (or omitted): DEFAULT_FALLBACK_STATUS_CODES, or a recognized
+ *     failover reason when the error has no status
+ *   - "all": Every error triggers fallback
+ *   - number[]: Custom list of status codes; matched by status only
+ * @returns true if the error should trigger fallback
+ */
+export function shouldTriggerFallback(
+  err: unknown,
+  fallbackOnErrors?: FallbackOnErrorCodes,
+): boolean {
+  // "all" is unconditional, so there is no need to inspect the error.
+  if (fallbackOnErrors === "all") {
+    return true;
+  }
+
+  const customCodes = Array.isArray(fallbackOnErrors) ? fallbackOnErrors : undefined;
+  const status = getStatusCode(err);
+
+  if (status === undefined) {
+    // Custom codes can only be compared against a status, which is missing here.
+    if (customCodes !== undefined) {
+      return false;
+    }
+    // Default behavior: a non-null reason means a recognized failover error.
+    return resolveFailoverReasonFromError(err) !== null;
+  }
+
+  // Status present: match against the custom list, or the defaults.
+  const allowedCodes: readonly number[] = customCodes ?? DEFAULT_FALLBACK_STATUS_CODES;
+  return allowedCodes.includes(status);
+}
+
+/**
+ * Build a FailoverError from an arbitrary error, but only when the fallback
+ * configuration says the error should trigger fallback.
+ *
+ * An error that already is a FailoverError is returned unchanged (the context
+ * argument is not applied to it) as long as it passes the same check.
+ *
+ * @param err - The error to check
+ * @param fallbackOnErrors - Configuration for which errors should trigger fallback
+ * @param context - Additional context (provider, model, profileId) copied onto a
+ *   newly created FailoverError
+ * @returns FailoverError if the error should trigger fallback, null otherwise
+ */
+export function coerceToFailoverErrorWithConfig(
+  err: unknown,
+  fallbackOnErrors: FallbackOnErrorCodes | undefined,
+  context?: {
+    provider?: string;
+    model?: string;
+    profileId?: string;
+  },
+): FailoverError | null {
+  // Existing FailoverErrors are still subject to the configured filter.
+  if (isFailoverError(err)) {
+    return shouldTriggerFallback(err, fallbackOnErrors) ? err : null;
+  }
+
+  if (!shouldTriggerFallback(err, fallbackOnErrors)) {
+    return null;
+  }
+
+  // Collect what the raw error exposes before wrapping it.
+  const httpStatus = getStatusCode(err);
+  const detectedReason = resolveFailoverReasonFromError(err);
+  const text = getErrorMessage(err) || String(err);
+  const errorCode = getErrorCode(err);
+
+  // Without a recognized reason, label the failure with the generic "unknown".
+  const wrappedReason: FailoverReason = detectedReason ?? "unknown";
+
+  return new FailoverError(text, {
+    reason: wrappedReason,
+    provider: context?.provider,
+    model: context?.model,
+    profileId: context?.profileId,
+    status: httpStatus,
+    code: errorCode,
+    cause: err instanceof Error ? err : undefined,
+  });
+}
--- a/src/agents/model-fallback.ts
+++ b/src/agents/model-fallback.ts
@ -3,6 +3,7 @@ import {
  resolveAgentModelFallbackValues,
  resolveAgentModelPrimaryValue,
 } from "../config/model-input.js";
+import type { FallbackOnErrorCodes } from "../config/types.agents-shared.js";
 import { createSubsystemLogger } from "../logging/subsystem.js";
 import { sanitizeForLog } from "../terminal/ansi.js";
 import {
@ -15,6 +16,7 @@ import {
 import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js";
 import {
  coerceToFailoverError,
+  // coerceToFailoverErrorWithConfig, // TODO: use in future implementation
  describeFailoverError,
  isFailoverError,
  isTimeoutError,
@ -516,6 +518,8 @@ export async function runWithModelFallback<T>(params: {
  agentDir?: string;
  /** Optional explicit fallbacks list; when provided (even empty), replaces agents.defaults.model.fallbacks. */
  fallbacksOverride?: string[];
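+  /** Which errors should trigger fallback ("default", "all", or a list of HTTP status codes). Declared here but not yet applied; see the TODO on the coerceToFailoverErrorWithConfig import. */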
+  fallbackOnErrors?: FallbackOnErrorCodes;
  run: ModelFallbackRunFn<T>;
  onError?: ModelFallbackErrorHandler;
 }): Promise<ModelFallbackRunResult<T>> {
--- a/src/config/types.agents-shared.ts
+++ b/src/config/types.agents-shared.ts
@ -5,6 +5,16 @@ import type {
  SandboxSshSettings,
 } from "./types.sandbox.js";

+/**
+ * Which errors should trigger model fallback.
+ * By default, fallback triggers on HTTP 404, 408, 429, 500, 502, 503 and 504.
+ * Users can extend this to include client errors like 400, 401, 403, etc.
+ */
+export type FallbackOnErrorCodes =
+  | "all" // All errors trigger fallback
+  | "default" // Server errors + rate limits + timeout + not found (500, 502, 503, 504, 429, 408, 404)
+  | number[]; // Custom list of HTTP status codes
+
 export type AgentModelConfig =
  | string
  | {
@ -12,6 +22,21 @@ export type AgentModelConfig =
      primary?: string;
      /** Per-agent model fallbacks (provider/model). */
      fallbacks?: string[];
+      /**
+       * HTTP status codes that should trigger fallback to next model.
+       * - "default": Server errors (500, 502, 503, 504) + rate limits (429) + timeout (408) + not found (404) [default]
+       * - "all": All errors trigger fallback (including 400, 401, 403, 404)
+       * - number[]: Custom list of status codes (e.g., [400, 401, 403, 429, 500, 502, 503])
+       *
+       * @example
+       * // Enable fallback on all client and server errors
+       * { primary: "openai/gpt-4", fallbacks: ["anthropic/claude-3"], fallbackOnErrors: "all" }
+       *
+       * @example
+       * // Custom error codes
+       * { primary: "openai/gpt-4", fallbacks: ["anthropic/claude-3"], fallbackOnErrors: [400, 429, 500, 502, 503] }
+       */
+      fallbackOnErrors?: FallbackOnErrorCodes;
    };

 export type AgentSandboxConfig = {