From 1cad2605ddcbeb6a4aa3d76f562b9a8df6731e58 Mon Sep 17 00:00:00 2001
From: subaochen <subaochen@126.com>
Date: Sat, 21 Mar 2026 11:22:03 +0800
Subject: [PATCH] fix: wire up fallbackOnErrors parameter and fix edge cases

- Wire fallbackOnErrors through runFallbackCandidate and runFallbackAttempt
- Use coerceToFailoverErrorWithConfig when fallbackOnErrors is provided
- Fix "all" mode to use status >= 400 instead of fixed list
- Fix "all" mode to not trigger fallback for non-HTTP errors
- Update comments to match actual default status codes (include 504, 404)
- Add fallbackOnErrors to runWithImageModelFallback for consistency

Addresses review feedback from Greptile:
- Critical: fallbackOnErrors parameter is now actually used
- Logic: "all" now correctly covers all 4xx/5xx status codes
- Logic: "all" no longer triggers fallback for unrelated runtime errors
- Style: Comments now match actual implementation
---
 src/agents/failover-error.ts      | 35 ++++++++------------------
 src/agents/model-fallback.ts      | 42 +++++++++++++++++++++++--------
 src/config/types.agents-shared.ts | 12 ++++-----
 3 files changed, 48 insertions(+), 41 deletions(-)

diff --git a/src/agents/failover-error.ts b/src/agents/failover-error.ts
index 67a9834ab98..ce6f3a3729c 100644
--- a/src/agents/failover-error.ts
+++ b/src/agents/failover-error.ts
@@ -339,18 +339,13 @@ export function coerceToFailoverError(
  */
 const DEFAULT_FALLBACK_STATUS_CODES = [408, 429, 500, 502, 503, 504, 404];
 
-/**
- * All HTTP status codes that could trigger fallback (including client errors).
- */
-const ALL_FALLBACK_STATUS_CODES = [400, 401, 402, 403, 404, 405, 408, 410, 429, 500, 502, 503, 504];
-
 /**
  * Check if an error should trigger fallback based on the configured error codes.
  *
  * @param err - The error to check
  * @param fallbackOnErrors - Configuration for which errors should trigger fallback
- *   - "default": Use default behavior (server errors + rate limits)
- *   - "all": All errors trigger fallback
+ *   - "default": Use default behavior (server errors + rate limits + timeout + not found)
+ *   - "all": All HTTP errors (4xx and 5xx) trigger fallback
  *   - number[]: Custom list of status codes
  * @returns true if the error should trigger fallback
  */
@@ -363,29 +358,21 @@ export function shouldTriggerFallback(
   // If no status code found, try to determine from error reason
   if (status === undefined) {
     const reason = resolveFailoverReasonFromError(err);
-    // Default behavior: non-null reason means it's a recognized failover error
-    if (fallbackOnErrors === undefined || fallbackOnErrors === "default") {
-      return reason !== null;
-    }
-    // For "all", any reason (including null) should trigger fallback
-    if (fallbackOnErrors === "all") {
-      return true;
-    }
-    // For custom codes, we can't determine without status
-    return false;
+    // For any mode, only trigger fallback if we have a recognized failover reason
+    // This prevents unrelated runtime errors (parse errors, filesystem errors) from triggering fallback
+    return reason !== null;
   }
 
-  // Determine which status codes to use
-  let allowedCodes: number[];
+  // Determine if status code should trigger fallback
   if (fallbackOnErrors === undefined || fallbackOnErrors === "default") {
-    allowedCodes = DEFAULT_FALLBACK_STATUS_CODES;
+    return DEFAULT_FALLBACK_STATUS_CODES.includes(status);
   } else if (fallbackOnErrors === "all") {
-    allowedCodes = ALL_FALLBACK_STATUS_CODES;
+    // "all" means all HTTP errors (4xx and 5xx)
+    return status >= 400;
   } else {
-    allowedCodes = fallbackOnErrors;
+    // Custom list of status codes
+    return fallbackOnErrors.includes(status);
   }
-
-  return allowedCodes.includes(status);
 }
 
 /**
diff --git a/src/agents/model-fallback.ts b/src/agents/model-fallback.ts
index 4576982cba9..63225e6a2f9 100644
--- a/src/agents/model-fallback.ts
+++ b/src/agents/model-fallback.ts
@@ -16,7 +16,7 @@ import {
 import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js";
 import {
   coerceToFailoverError,
-  // coerceToFailoverErrorWithConfig, // TODO: use in future implementation
+  coerceToFailoverErrorWithConfig,
   describeFailoverError,
   isFailoverError,
   isTimeoutError,
@@ -132,6 +132,7 @@ async function runFallbackCandidate<T>(params: {
   provider: string;
   model: string;
   options?: ModelFallbackRunOptions;
+  fallbackOnErrors?: FallbackOnErrorCodes;
 }): Promise<{ ok: true; result: T } | { ok: false; error: unknown }> {
   try {
     const result = params.options
@@ -144,10 +145,16 @@ async function runFallbackCandidate<T>(params: {
   } catch (err) {
     // Normalize abort-wrapped rate-limit errors (e.g. Google Vertex RESOURCE_EXHAUSTED)
     // so they become FailoverErrors and continue the fallback loop instead of aborting.
-    const normalizedFailover = coerceToFailoverError(err, {
-      provider: params.provider,
-      model: params.model,
-    });
+    // Use config-aware error coercion if fallbackOnErrors is provided.
+    const normalizedFailover = params.fallbackOnErrors
+      ? coerceToFailoverErrorWithConfig(err, params.fallbackOnErrors, {
+          provider: params.provider,
+          model: params.model,
+        })
+      : coerceToFailoverError(err, {
+          provider: params.provider,
+          model: params.model,
+        });
     if (shouldRethrowAbort(err) && !normalizedFailover) {
       throw err;
     }
@@ -161,12 +168,14 @@ async function runFallbackAttempt<T>(params: {
   model: string;
   attempts: FallbackAttempt[];
   options?: ModelFallbackRunOptions;
+  fallbackOnErrors?: FallbackOnErrorCodes;
 }): Promise<{ success: ModelFallbackRunResult<T> } | { error: unknown }> {
   const runResult = await runFallbackCandidate({
     run: params.run,
     provider: params.provider,
     model: params.model,
     options: params.options,
+    fallbackOnErrors: params.fallbackOnErrors,
   });
   if (runResult.ok) {
     return {
@@ -667,6 +676,7 @@ export async function runWithModelFallback<T>(params: {
       ...candidate,
       attempts,
       options: runOptions,
+      fallbackOnErrors: params.fallbackOnErrors,
     });
     if ("success" in attemptRun) {
       if (i > 0 || attempts.length > 0 || attemptedDuringCooldown) {
@@ -715,11 +725,15 @@ export async function runWithModelFallback<T>(params: {
       if (isLikelyContextOverflowError(errMessage)) {
         throw err;
       }
-      const normalized =
-        coerceToFailoverError(err, {
-          provider: candidate.provider,
-          model: candidate.model,
-        }) ?? err;
+      const normalized = params.fallbackOnErrors
+        ? coerceToFailoverErrorWithConfig(err, params.fallbackOnErrors, {
+            provider: candidate.provider,
+            model: candidate.model,
+          }) ?? err
+        : coerceToFailoverError(err, {
+            provider: candidate.provider,
+            model: candidate.model,
+          }) ?? err;
 
       // Even unrecognized errors should not abort the fallback loop when
       // there are remaining candidates.  Only abort/context-overflow errors
@@ -783,6 +797,7 @@ export async function runWithImageModelFallback<T>(params: {
   modelOverride?: string;
   run: (provider: string, model: string) => Promise<T>;
   onError?: ModelFallbackErrorHandler;
+  fallbackOnErrors?: FallbackOnErrorCodes;
 }): Promise<ModelFallbackRunResult<T>> {
   const candidates = resolveImageFallbackCandidates({
     cfg: params.cfg,
@@ -800,7 +815,12 @@ export async function runWithImageModelFallback<T>(params: {
 
   for (let i = 0; i < candidates.length; i += 1) {
     const candidate = candidates[i];
-    const attemptRun = await runFallbackAttempt({ run: params.run, ...candidate, attempts });
+    const attemptRun = await runFallbackAttempt({
+      run: params.run,
+      ...candidate,
+      attempts,
+      fallbackOnErrors: params.fallbackOnErrors,
+    });
     if ("success" in attemptRun) {
       return attemptRun.success;
     }
diff --git a/src/config/types.agents-shared.ts b/src/config/types.agents-shared.ts
index d8901ba7228..73dca0d6360 100644
--- a/src/config/types.agents-shared.ts
+++ b/src/config/types.agents-shared.ts
@@ -7,12 +7,12 @@ import type {
 
 /**
  * HTTP status codes that should trigger model fallback.
- * Default behavior only triggers fallback on server errors (5xx) and rate limits (429).
- * Users can extend this to include client errors like 400, 401, 403, etc.
+ * Default behavior triggers fallback on server errors, rate limits, timeouts, and not-found errors.
+ * Users can extend this to include all client errors with "all" or specify custom codes.
  */
 export type FallbackOnErrorCodes =
-  | "all" // All errors trigger fallback
-  | "default" // Server errors + rate limits only (500, 502, 503, 429, 408)
+  | "all" // All HTTP errors (4xx and 5xx) trigger fallback
+  | "default" // Server errors (500, 502, 503, 504) + rate limits (429) + timeout (408) + not found (404)
   | number[]; // Custom list of HTTP status codes
 
 export type AgentModelConfig =
@@ -24,8 +24,8 @@ export type AgentModelConfig =
       fallbacks?: string[];
       /**
        * HTTP status codes that should trigger fallback to next model.
-       * - "default": Server errors (5xx) + rate limits (429) + timeout (408) [default]
-       * - "all": All errors trigger fallback (including 400, 401, 403, 404)
+       * - "default": Server errors (500, 502, 503, 504) + rate limits (429) + timeout (408) + not found (404) [default]
+       * - "all": All HTTP errors (4xx and 5xx) trigger fallback
        * - number[]: Custom list of status codes (e.g., [400, 401, 403, 429, 500, 502, 503])
        *
        * @example