fix: wire up fallbackOnErrors parameter and fix edge cases

- Wire fallbackOnErrors through runFallbackCandidate and runFallbackAttempt - Use coerceToFailoverErrorWithConfig when fallbackOnErrors is provided - Fix "all" mode to use status >= 400 instead of fixed list - Fix "all" mode to not trigger fallback for non-HTTP errors - Update comments to match actual default status codes (include 504, 404) - Add fallbackOnErrors to runWithImageModelFallback for consistency Addresses review feedback from Greptile: - Critical: fallbackOnErrors parameter is now actually used - Logic: "all" now correctly covers all 4xx/5xx status codes - Logic: "all" no longer triggers fallback for unrelated runtime errors - Style: Comments now match actual implementation
2026-03-21 11:22:03 +08:00 · 2026-03-21 11:22:03 +08:00 · 1cad2605dd
commit 1cad2605dd
parent e138e90031
3 changed files with 48 additions and 41 deletions
--- a/src/agents/failover-error.ts
+++ b/src/agents/failover-error.ts
@ -339,18 +339,13 @@ export function coerceToFailoverError(
 */
 const DEFAULT_FALLBACK_STATUS_CODES = [408, 429, 500, 502, 503, 504, 404];

-/**
- * All HTTP status codes that could trigger fallback (including client errors).
- */
-const ALL_FALLBACK_STATUS_CODES = [400, 401, 402, 403, 404, 405, 408, 410, 429, 500, 502, 503, 504];
-
 /**
 * Check if an error should trigger fallback based on the configured error codes.
 *
 * @param err - The error to check
 * @param fallbackOnErrors - Configuration for which errors should trigger fallback
- *   - "default": Use default behavior (server errors + rate limits)
- *   - "all": All errors trigger fallback
+ *   - "default": Use default behavior (server errors + rate limits + timeout + not found)
+ *   - "all": All HTTP errors (4xx and 5xx) trigger fallback
 *   - number[]: Custom list of status codes
 * @returns true if the error should trigger fallback
 */
@ -363,29 +358,21 @@ export function shouldTriggerFallback(
  // If no status code found, try to determine from error reason
  if (status === undefined) {
    const reason = resolveFailoverReasonFromError(err);
-    // Default behavior: non-null reason means it's a recognized failover error
-    if (fallbackOnErrors === undefined || fallbackOnErrors === "default") {
-      return reason !== null;
-    }
-    // For "all", any reason (including null) should trigger fallback
-    if (fallbackOnErrors === "all") {
-      return true;
-    }
-    // For custom codes, we can't determine without status
-    return false;
+    // For any mode, only trigger fallback if we have a recognized failover reason
+    // This prevents unrelated runtime errors (parse errors, filesystem errors) from triggering fallback
+    return reason !== null;
  }

-  // Determine which status codes to use
-  let allowedCodes: number[];
+  // Determine if status code should trigger fallback
  if (fallbackOnErrors === undefined || fallbackOnErrors === "default") {
-    allowedCodes = DEFAULT_FALLBACK_STATUS_CODES;
+    return DEFAULT_FALLBACK_STATUS_CODES.includes(status);
  } else if (fallbackOnErrors === "all") {
-    allowedCodes = ALL_FALLBACK_STATUS_CODES;
+    // "all" means all HTTP errors (4xx and 5xx)
+    return status >= 400;
  } else {
-    allowedCodes = fallbackOnErrors;
+    // Custom list of status codes
+    return fallbackOnErrors.includes(status);
  }
-
-  return allowedCodes.includes(status);
 }

 /**
--- a/src/agents/model-fallback.ts
+++ b/src/agents/model-fallback.ts
@ -16,7 +16,7 @@ import {
 import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js";
 import {
  coerceToFailoverError,
-  // coerceToFailoverErrorWithConfig, // TODO: use in future implementation
+  coerceToFailoverErrorWithConfig,
  describeFailoverError,
  isFailoverError,
  isTimeoutError,
@ -132,6 +132,7 @@ async function runFallbackCandidate<T>(params: {
  provider: string;
  model: string;
  options?: ModelFallbackRunOptions;
+  fallbackOnErrors?: FallbackOnErrorCodes;
 }): Promise<{ ok: true; result: T } | { ok: false; error: unknown }> {
  try {
    const result = params.options
@ -144,10 +145,16 @@ async function runFallbackCandidate<T>(params: {
  } catch (err) {
    // Normalize abort-wrapped rate-limit errors (e.g. Google Vertex RESOURCE_EXHAUSTED)
    // so they become FailoverErrors and continue the fallback loop instead of aborting.
-    const normalizedFailover = coerceToFailoverError(err, {
-      provider: params.provider,
-      model: params.model,
-    });
+    // Use config-aware error coercion if fallbackOnErrors is provided.
+    const normalizedFailover = params.fallbackOnErrors
+      ? coerceToFailoverErrorWithConfig(err, params.fallbackOnErrors, {
+          provider: params.provider,
+          model: params.model,
+        })
+      : coerceToFailoverError(err, {
+          provider: params.provider,
+          model: params.model,
+        });
    if (shouldRethrowAbort(err) && !normalizedFailover) {
      throw err;
    }
@ -161,12 +168,14 @@ async function runFallbackAttempt<T>(params: {
  model: string;
  attempts: FallbackAttempt[];
  options?: ModelFallbackRunOptions;
+  fallbackOnErrors?: FallbackOnErrorCodes;
 }): Promise<{ success: ModelFallbackRunResult<T> } | { error: unknown }> {
  const runResult = await runFallbackCandidate({
    run: params.run,
    provider: params.provider,
    model: params.model,
    options: params.options,
+    fallbackOnErrors: params.fallbackOnErrors,
  });
  if (runResult.ok) {
    return {
@ -667,6 +676,7 @@ export async function runWithModelFallback<T>(params: {
      ...candidate,
      attempts,
      options: runOptions,
+      fallbackOnErrors: params.fallbackOnErrors,
    });
    if ("success" in attemptRun) {
      if (i > 0 || attempts.length > 0 || attemptedDuringCooldown) {
@ -715,11 +725,15 @@ export async function runWithModelFallback<T>(params: {
      if (isLikelyContextOverflowError(errMessage)) {
        throw err;
      }
-      const normalized =
-        coerceToFailoverError(err, {
-          provider: candidate.provider,
-          model: candidate.model,
-        }) ?? err;
+      const normalized = params.fallbackOnErrors
+        ? coerceToFailoverErrorWithConfig(err, params.fallbackOnErrors, {
+            provider: candidate.provider,
+            model: candidate.model,
+          }) ?? err
+        : coerceToFailoverError(err, {
+            provider: candidate.provider,
+            model: candidate.model,
+          }) ?? err;

      // Even unrecognized errors should not abort the fallback loop when
      // there are remaining candidates.  Only abort/context-overflow errors
@ -783,6 +797,7 @@ export async function runWithImageModelFallback<T>(params: {
  modelOverride?: string;
  run: (provider: string, model: string) => Promise<T>;
  onError?: ModelFallbackErrorHandler;
+  fallbackOnErrors?: FallbackOnErrorCodes;
 }): Promise<ModelFallbackRunResult<T>> {
  const candidates = resolveImageFallbackCandidates({
    cfg: params.cfg,
@ -800,7 +815,12 @@ export async function runWithImageModelFallback<T>(params: {

  for (let i = 0; i < candidates.length; i += 1) {
    const candidate = candidates[i];
-    const attemptRun = await runFallbackAttempt({ run: params.run, ...candidate, attempts });
+    const attemptRun = await runFallbackAttempt({
+      run: params.run,
+      ...candidate,
+      attempts,
+      fallbackOnErrors: params.fallbackOnErrors,
+    });
    if ("success" in attemptRun) {
      return attemptRun.success;
    }
--- a/src/config/types.agents-shared.ts
+++ b/src/config/types.agents-shared.ts
@ -7,12 +7,12 @@ import type {

 /**
 * HTTP status codes that should trigger model fallback.
- * Default behavior only triggers fallback on server errors (5xx) and rate limits (429).
- * Users can extend this to include client errors like 400, 401, 403, etc.
+ * Default behavior triggers fallback on server errors, rate limits, timeouts, and not-found errors.
+ * Users can extend this to include all client errors with "all" or specify custom codes.
 */
 export type FallbackOnErrorCodes =
-  | "all" // All errors trigger fallback
-  | "default" // Server errors + rate limits only (500, 502, 503, 429, 408)
+  | "all" // All HTTP errors (4xx and 5xx) trigger fallback
+  | "default" // Server errors (500, 502, 503, 504) + rate limits (429) + timeout (408) + not found (404)
  | number[]; // Custom list of HTTP status codes

 export type AgentModelConfig =
@ -24,8 +24,8 @@ export type AgentModelConfig =
      fallbacks?: string[];
      /**
       * HTTP status codes that should trigger fallback to next model.
-       * - "default": Server errors (5xx) + rate limits (429) + timeout (408) [default]
-       * - "all": All errors trigger fallback (including 400, 401, 403, 404)
+       * - "default": Server errors (500, 502, 503, 504) + rate limits (429) + timeout (408) + not found (404) [default]
+       * - "all": All HTTP errors (4xx and 5xx) trigger fallback
       * - number[]: Custom list of status codes (e.g., [400, 401, 403, 429, 500, 502, 503])
       *
       * @example