openclaw/src/agents/model-fallback.ts

import type { OpenClawConfig } from "../config/config.js";
import {
  resolveAgentModelFallbackValues,
  resolveAgentModelPrimaryValue,
} from "../config/model-input.js";
import { createSubsystemLogger } from "../logging/subsystem.js";
import { sanitizeForLog } from "../terminal/ansi.js";
import {
  ensureAuthProfileStore,
  getSoonestCooldownExpiry,
  isProfileInCooldown,
  resolveProfilesUnavailableReason,
  resolveAuthProfileOrder,
} from "./auth-profiles.js";
import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js";
import {
  coerceToFailoverError,
  describeFailoverError,
  isFailoverError,
  isTimeoutError,
} from "./failover-error.js";
import { logModelFallbackDecision } from "./model-fallback-observation.js";
import type { FallbackAttempt, ModelCandidate } from "./model-fallback.types.js";
import {
  buildConfiguredAllowlistKeys,
  buildModelAliasIndex,
  modelKey,
  normalizeModelRef,
  resolveConfiguredModelRef,
  resolveModelRefFromString,
} from "./model-selection.js";
import type { FailoverReason } from "./pi-embedded-helpers.js";
import { isLikelyContextOverflowError } from "./pi-embedded-helpers.js";

// Subsystem logger for this module; used below to warn when a run succeeded on a
// fallback after an earlier candidate failed with "model_not_found".
const log = createSubsystemLogger("model-fallback");

/** Per-attempt options that the fallback loop may hand to the `run` callback. */
export type ModelFallbackRunOptions = {
  /**
   * Set to `true` by the fallback loop when it attempts a candidate although every
   * auth profile of that provider is in cooldown (reason rate_limit, overloaded,
   * billing or unknown). How the callback honors the flag is up to the caller.
   */
  allowTransientCooldownProbe?: boolean;
};

/**
 * Callback that performs one attempt against a concrete provider/model pair.
 * The fallback helpers pass `options` only when they have something to signal;
 * otherwise the callback is invoked with exactly two arguments.
 */
type ModelFallbackRunFn<T> = (
  provider: string,
  model: string,
  options?: ModelFallbackRunOptions,
) => Promise<T>;

/**
 * Decides whether an error counts as an explicit abort for fallback purposes.
 *
 * Only an object whose `name` stringifies to exactly "AbortError" qualifies.
 * Message text (e.g. "aborted") is deliberately not inspected, because that can
 * mask timeouts and skip fallback. Anything recognized by `isFailoverError` is
 * never treated as an abort.
 */
function isFallbackAbortError(err: unknown): boolean {
  // Primitives, null and undefined cannot carry an error name.
  if (err === null || typeof err !== "object") {
    return false;
  }
  if (isFailoverError(err)) {
    return false;
  }
  const errorName = "name" in err ? String(err.name) : "";
  return errorName === "AbortError";
}

/** True when `err` is an explicit abort that is not also classified as a timeout. */
function shouldRethrowAbort(err: unknown): boolean {
  if (!isFallbackAbortError(err)) {
    return false;
  }
  return !isTimeoutError(err);
}

/**
 * Creates an ordered, de-duplicated accumulator of model candidates.
 *
 * @param allowlist Optional set of model keys; consulted only by
 *   `addAllowlistedCandidate`. `null`/`undefined` disables the check.
 * @returns The live `candidates` array plus two adders:
 *   - `addExplicitCandidate` appends regardless of the allowlist;
 *   - `addAllowlistedCandidate` appends only when the key is allowlisted.
 *   Both ignore candidates with an empty provider or model and candidates whose
 *   key (per `modelKey`) was already accepted.
 */
function createModelCandidateCollector(allowlist: Set<string> | null | undefined): {
  candidates: ModelCandidate[];
  addExplicitCandidate: (candidate: ModelCandidate) => void;
  addAllowlistedCandidate: (candidate: ModelCandidate) => void;
} {
  const candidates: ModelCandidate[] = [];
  const acceptedKeys = new Set<string>();

  function accept(candidate: ModelCandidate, mustBeAllowlisted: boolean): void {
    if (!candidate.provider || !candidate.model) {
      return;
    }
    const key = modelKey(candidate.provider, candidate.model);
    if (acceptedKeys.has(key)) {
      return;
    }
    // A rejected key is not remembered, so a later explicit add can still accept it.
    if (mustBeAllowlisted && allowlist != null && !allowlist.has(key)) {
      return;
    }
    acceptedKeys.add(key);
    candidates.push(candidate);
  }

  return {
    candidates,
    addExplicitCandidate: (candidate: ModelCandidate) => {
      accept(candidate, false);
    },
    addAllowlistedCandidate: (candidate: ModelCandidate) => {
      accept(candidate, true);
    },
  };
}

/**
 * Optional observer invoked (and awaited) by the fallback loops after a candidate
 * has failed and the failure was recorded.
 */
type ModelFallbackErrorHandler = (attempt: {
  /** Provider of the candidate that failed. */
  provider: string;
  /** Model of the candidate that failed. */
  model: string;
  /** The error produced by the attempt (possibly normalized to a failover error). */
  error: unknown;
  /** 1-based position of the candidate in the candidate list. */
  attempt: number;
  /** Total number of candidates in the list. */
  total: number;
}) => void | Promise<void>;

/** Outcome of a successful fallback run. */
type ModelFallbackRunResult<T> = {
  /** Value returned by the `run` callback for the candidate that succeeded. */
  result: T;
  /** Provider of the candidate that succeeded. */
  provider: string;
  /** Model of the candidate that succeeded. */
  model: string;
  /** Skipped or failed attempts recorded before the success, in order. */
  attempts: FallbackAttempt[];
};

/**
 * Packages a successful attempt into the public result shape.
 * Only the four known fields are copied; the `attempts` array is passed through
 * by reference, not cloned.
 */
function buildFallbackSuccess<T>(params: {
  result: T;
  provider: string;
  model: string;
  attempts: FallbackAttempt[];
}): ModelFallbackRunResult<T> {
  const { result, provider, model, attempts } = params;
  return { result, provider, model, attempts };
}

/**
 * Executes the `run` callback once for a single provider/model pair and converts
 * the outcome into a tagged result instead of letting most errors propagate.
 *
 * @returns `{ ok: true, result }` on success, or `{ ok: false, error }` where
 *   `error` is the failover-normalized error when `coerceToFailoverError`
 *   produced one, otherwise the original error.
 * @throws The original error when it is an explicit (non-timeout) abort that
 *   could not be normalized into a failover error.
 */
async function runFallbackCandidate<T>(params: {
  run: ModelFallbackRunFn<T>;
  provider: string;
  model: string;
  options?: ModelFallbackRunOptions;
}): Promise<{ ok: true; result: T } | { ok: false; error: unknown }> {
  const { provider, model, options } = params;
  try {
    // Omit the third argument entirely when no options were supplied.
    const pending = options
      ? params.run(provider, model, options)
      : params.run(provider, model);
    return { ok: true, result: await pending };
  } catch (err) {
    // Abort-wrapped rate-limit errors (e.g. Google Vertex RESOURCE_EXHAUSTED) are
    // normalized into failover errors so the fallback loop continues rather than
    // aborting.
    const failover = coerceToFailoverError(err, { provider, model });
    const isPlainAbort = shouldRethrowAbort(err);
    if (isPlainAbort && !failover) {
      throw err;
    }
    return { ok: false, error: failover ?? err };
  }
}

/**
 * Runs one candidate and maps the tagged outcome onto the shape the fallback
 * loops consume: `{ success }` carrying the public result (including the
 * attempts recorded so far), or `{ error }` with the failure.
 * Errors thrown by `runFallbackCandidate` propagate unchanged.
 */
async function runFallbackAttempt<T>(params: {
  run: ModelFallbackRunFn<T>;
  provider: string;
  model: string;
  attempts: FallbackAttempt[];
  options?: ModelFallbackRunOptions;
}): Promise<{ success: ModelFallbackRunResult<T> } | { error: unknown }> {
  const { run, provider, model, options, attempts } = params;
  const outcome = await runFallbackCandidate({ run, provider, model, options });
  if (!outcome.ok) {
    return { error: outcome.error };
  }
  const success = buildFallbackSuccess({
    result: outcome.result,
    provider,
    model,
    attempts,
  });
  return { success };
}

/** Strict equality of two candidates on both `provider` and `model`. */
function sameModelCandidate(a: ModelCandidate, b: ModelCandidate): boolean {
  if (a.provider !== b.provider) {
    return false;
  }
  return a.model === b.model;
}

/**
 * Terminal step of a fallback loop: always throws.
 *
 * - With at most one recorded attempt and a truthy `lastError`, that error is
 *   rethrown as-is.
 * - Otherwise a summary `Error` is thrown whose message lists every attempt
 *   (formatted by `formatAttempt`, joined with " | ") or "unknown" when no
 *   attempt was recorded. The count in the message is the number of attempts,
 *   falling back to the number of candidates when there were none. `cause` is
 *   `lastError` when it is an `Error`, otherwise `undefined`.
 */
function throwFallbackFailureSummary(params: {
  attempts: FallbackAttempt[];
  candidates: ModelCandidate[];
  lastError: unknown;
  label: string;
  formatAttempt: (attempt: FallbackAttempt) => string;
}): never {
  const { attempts, candidates, lastError, label, formatAttempt } = params;
  const attemptCount = attempts.length;

  if (attemptCount <= 1 && lastError) {
    throw lastError;
  }

  let summary = "unknown";
  if (attemptCount > 0) {
    summary = attempts.map(formatAttempt).join(" | ");
  }
  const reportedCount = attemptCount || candidates.length;
  const cause = lastError instanceof Error ? lastError : undefined;
  throw new Error(`All ${label} failed (${reportedCount}): ${summary}`, {
    cause,
  });
}

/**
 * Builds the ordered candidate list for image-model runs.
 *
 * The lead candidate is `modelOverride` when it is non-blank; otherwise the
 * primary value of `agents.defaults.imageModel` (when non-blank). The configured
 * image fallbacks are appended afterwards in both cases. Raw references that
 * `resolveModelRefFromString` cannot resolve are dropped, and duplicates are
 * removed by the collector.
 *
 * @returns Candidates in attempt order; may be empty.
 */
function resolveImageFallbackCandidates(params: {
  cfg: OpenClawConfig | undefined;
  defaultProvider: string;
  modelOverride?: string;
}): ModelCandidate[] {
  const { cfg, defaultProvider } = params;
  const aliasIndex = buildModelAliasIndex({ cfg: cfg ?? {}, defaultProvider });
  const allowlist = buildConfiguredAllowlistKeys({ cfg, defaultProvider });
  const collector = createModelCandidateCollector(allowlist);

  // Resolves a raw reference and records it; allowlist filtering is opt-in.
  const addRaw = (raw: string, opts?: { allowlist?: boolean }) => {
    const resolved = resolveModelRefFromString({
      raw: String(raw ?? ""),
      defaultProvider,
      aliasIndex,
    });
    if (!resolved) {
      return;
    }
    const add = opts?.allowlist
      ? collector.addAllowlistedCandidate
      : collector.addExplicitCandidate;
    add(resolved.ref);
  };

  const override = params.modelOverride;
  if (override?.trim()) {
    addRaw(override);
  } else {
    const primary = resolveAgentModelPrimaryValue(cfg?.agents?.defaults?.imageModel);
    if (primary?.trim()) {
      addRaw(primary);
    }
  }

  // Explicitly configured image fallbacks should remain reachable even when a
  // model allowlist is present, so they are added without allowlist filtering.
  const imageFallbacks = resolveAgentModelFallbackValues(cfg?.agents?.defaults?.imageModel);
  for (const raw of imageFallbacks) {
    addRaw(raw);
  }

  return collector.candidates;
}

/**
 * Builds the ordered candidate list for a text-model run.
 *
 * Order: the requested provider/model (blank values fall back to the configured
 * or built-in default), then the fallback chain, then, only when no
 * `fallbacksOverride` was given, the configured primary as a final backstop.
 * Duplicates are removed by the collector; unresolvable references are skipped.
 * The model allowlist is never applied here: fallbacks are explicit user intent.
 *
 * Fallback chain selection:
 * - `fallbacksOverride` provided (even empty): used verbatim.
 * - Requested provider equals the configured primary's provider: the configured
 *   fallbacks are used in full.
 * - Different provider: the configured fallbacks are used only when the requested
 *   model is itself one of them; otherwise the chain is empty.
 */
function resolveFallbackCandidates(params: {
  cfg: OpenClawConfig | undefined;
  provider: string;
  model: string;
  /** Optional explicit fallbacks list; when provided (even empty), replaces agents.defaults.model.fallbacks. */
  fallbacksOverride?: string[];
}): ModelCandidate[] {
  const configured = params.cfg
    ? resolveConfiguredModelRef({
        cfg: params.cfg,
        defaultProvider: DEFAULT_PROVIDER,
        defaultModel: DEFAULT_MODEL,
      })
    : null;
  const defaultProvider = configured?.provider ?? DEFAULT_PROVIDER;
  const defaultModel = configured?.model ?? DEFAULT_MODEL;

  const requestedProvider = String(params.provider ?? "").trim() || defaultProvider;
  const requestedModelId = String(params.model ?? "").trim() || defaultModel;
  const requestedRef = normalizeModelRef(requestedProvider, requestedModelId);
  const configuredRef = normalizeModelRef(defaultProvider, defaultModel);

  const aliasIndex = buildModelAliasIndex({
    cfg: params.cfg ?? {},
    defaultProvider,
  });
  const allowlist = buildConfiguredAllowlistKeys({
    cfg: params.cfg,
    defaultProvider,
  });
  const { candidates, addExplicitCandidate } = createModelCandidateCollector(allowlist);

  const resolveRaw = (raw: unknown) =>
    resolveModelRefFromString({
      raw: String(raw ?? ""),
      defaultProvider,
      aliasIndex,
    });

  // Chain taken from config when the caller did not override it.
  const selectConfiguredFallbacks = () => {
    const configuredFallbacks = resolveAgentModelFallbackValues(
      params.cfg?.agents?.defaults?.model,
    );
    // Same provider: always use the full chain (model version differences
    // within a provider).
    if (requestedRef.provider === configuredRef.provider) {
      return configuredFallbacks;
    }
    // Different provider than config: only use the configured chain when the
    // current model is already part of it (e.g. session on first fallback).
    const alreadyInChain = configuredFallbacks.some((raw) => {
      const resolved = resolveRaw(raw);
      return resolved ? sameModelCandidate(resolved.ref, requestedRef) : false;
    });
    return alreadyInChain ? configuredFallbacks : [];
  };

  addExplicitCandidate(requestedRef);

  const fallbackRefs =
    params.fallbacksOverride !== undefined
      ? params.fallbacksOverride
      : selectConfiguredFallbacks();

  for (const raw of fallbackRefs) {
    const resolved = resolveRaw(raw);
    if (resolved) {
      // Not filtered by the model allowlist on purpose.
      addExplicitCandidate(resolved.ref);
    }
  }

  const useConfiguredBackstop = params.fallbacksOverride === undefined;
  if (useConfiguredBackstop && configured?.provider && configured.model) {
    addExplicitCandidate({ provider: configured.provider, model: configured.model });
  }

  return candidates;
}

/** Timestamp (ms) of the most recent probe per throttle key. Module-level state. */
const lastProbeAttempt = new Map<string, number>();
/** Minimum spacing between two probes recorded under the same throttle key. */
const MIN_PROBE_INTERVAL_MS = 30_000; // 30 seconds between probes per key
/** A probe becomes eligible this long before the soonest cooldown expiry. */
const PROBE_MARGIN_MS = 2 * 60 * 1000;
/** Separator between the agent-dir scope and the provider in a throttle key. */
const PROBE_SCOPE_DELIMITER = "::";
/** Entries older than this are dropped by `pruneProbeState`. */
const PROBE_STATE_TTL_MS = 24 * 60 * 60 * 1000;
/** Upper bound on the number of keys kept in `lastProbeAttempt`. */
const MAX_PROBE_KEYS = 256;

/**
 * Builds the throttle key for a provider.
 *
 * @param provider Provider identifier.
 * @param agentDir Optional scope; when non-blank after trimming, the key becomes
 *   `<scope>::<provider>`, otherwise just `<provider>`.
 */
function resolveProbeThrottleKey(provider: string, agentDir?: string): string {
  const scope = String(agentDir ?? "").trim();
  return scope ? `${scope}${PROBE_SCOPE_DELIMITER}${provider}` : provider;
}

/**
 * Drops entries whose timestamp is non-finite, non-positive, or older than
 * `PROBE_STATE_TTL_MS` relative to `now`.
 */
function pruneProbeState(now: number): void {
  for (const [key, ts] of lastProbeAttempt) {
    if (!Number.isFinite(ts) || ts <= 0 || now - ts > PROBE_STATE_TTL_MS) {
      lastProbeAttempt.delete(key);
    }
  }
}

/**
 * Evicts entries, oldest timestamp first, until the map holds at most
 * `MAX_PROBE_KEYS` keys.
 *
 * The eviction target is tracked with an explicit `null` sentinel rather than a
 * truthiness check, so an empty-string key can be evicted like any other. The
 * first entry seeds the selection, so an eviction target exists even when
 * timestamps are not comparable (NaN); the cap is therefore always enforced.
 */
function enforceProbeStateCap(): void {
  while (lastProbeAttempt.size > MAX_PROBE_KEYS) {
    let oldestKey: string | null = null;
    let oldestTs = Number.POSITIVE_INFINITY;
    for (const [key, ts] of lastProbeAttempt) {
      // Strict `<` keeps the earliest-inserted entry among equal timestamps.
      if (oldestKey === null || ts < oldestTs) {
        oldestKey = key;
        oldestTs = ts;
      }
    }
    if (oldestKey === null) {
      // Unreachable while size > cap >= 0; kept as a guard against an endless loop.
      break;
    }
    lastProbeAttempt.delete(oldestKey);
  }
}

/**
 * Reports whether at least `MIN_PROBE_INTERVAL_MS` has elapsed since the last
 * probe recorded under `throttleKey` (a key with no record counts as open).
 * Prunes stale state as a side effect.
 */
function isProbeThrottleOpen(now: number, throttleKey: string): boolean {
  pruneProbeState(now);
  const lastProbe = lastProbeAttempt.get(throttleKey) ?? 0;
  return now - lastProbe >= MIN_PROBE_INTERVAL_MS;
}

/**
 * Records a probe for `throttleKey` at time `now`, pruning stale entries first
 * and enforcing the key cap afterwards.
 */
function markProbeAttempt(now: number, throttleKey: string): void {
  pruneProbeState(now);
  lastProbeAttempt.set(throttleKey, now);
  enforceProbeStateCap();
}

/**
 * Decides whether the primary candidate may be probed although all of its
 * provider's auth profiles are in cooldown.
 *
 * A probe requires: the candidate is the primary, at least one fallback
 * candidate exists, and the per-key probe throttle is open. When those hold, the
 * probe is allowed if no finite cooldown expiry is known, or if `now` is within
 * `PROBE_MARGIN_MS` of (or past) the soonest expiry.
 */
function shouldProbePrimaryDuringCooldown(params: {
  isPrimary: boolean;
  hasFallbackCandidates: boolean;
  now: number;
  throttleKey: string;
  authStore: ReturnType<typeof ensureAuthProfileStore>;
  profileIds: string[];
}): boolean {
  const eligible = params.isPrimary && params.hasFallbackCandidates;
  if (!eligible) {
    return false;
  }
  if (!isProbeThrottleOpen(params.now, params.throttleKey)) {
    return false;
  }

  const soonest = getSoonestCooldownExpiry(params.authStore, params.profileIds);
  if (soonest !== null && Number.isFinite(soonest)) {
    // Probe when the cooldown already expired or is within the configured margin.
    return params.now >= soonest - PROBE_MARGIN_MS;
  }
  // No usable expiry information: allow the probe.
  return true;
}

/**
 * @internal – exposed for unit tests only.
 * Bundles the probe-throttle map, its tuning constants and the helper functions.
 * `lastProbeAttempt` is the live module-level map, not a copy.
 */
export const _probeThrottleInternals = {
  lastProbeAttempt,
  MIN_PROBE_INTERVAL_MS,
  PROBE_MARGIN_MS,
  PROBE_STATE_TTL_MS,
  MAX_PROBE_KEYS,
  resolveProbeThrottleKey,
  isProbeThrottleOpen,
  pruneProbeState,
  markProbeAttempt,
} as const;

/**
 * Result of evaluating a candidate whose provider has every auth profile in
 * cooldown: either skip the candidate or attempt it anyway.
 */
type CooldownDecision =
  | {
      // Do not run the candidate; `error` is the text recorded for the skipped attempt.
      type: "skip";
      reason: FailoverReason;
      error: string;
    }
  | {
      // Run the candidate despite the cooldown; `markProbe` asks the caller to
      // record the attempt in the probe throttle.
      type: "attempt";
      reason: FailoverReason;
      markProbe: boolean;
    };

/**
 * Decides what to do with a candidate whose provider has all auth profiles in
 * cooldown.
 *
 * The unavailability reason comes from `resolveProfilesUnavailableReason`
 * (defaulting to "unknown") and drives the outcome:
 * - "auth" / "auth_permanent": always skip.
 * - "billing": attempt (and mark a probe) only for the primary, either when the
 *   regular primary probe is allowed or, with no fallback candidates, when the
 *   probe throttle is open; otherwise skip.
 * - anything else: the primary is attempted when it is not an exact match for
 *   the requested provider/model pair or when the primary probe is allowed;
 *   a non-primary candidate is attempted only for "rate_limit", "overloaded"
 *   or "unknown".
 */
function resolveCooldownDecision(params: {
  candidate: ModelCandidate;
  isPrimary: boolean;
  requestedModel: boolean;
  hasFallbackCandidates: boolean;
  now: number;
  probeThrottleKey: string;
  authStore: ReturnType<typeof ensureAuthProfileStore>;
  profileIds: string[];
}): CooldownDecision {
  const {
    candidate,
    isPrimary,
    hasFallbackCandidates,
    now,
    probeThrottleKey,
    authStore,
    profileIds,
  } = params;

  // Evaluated before the reason lookup, matching the helpers' original call order.
  const probeAllowed = shouldProbePrimaryDuringCooldown({
    isPrimary,
    hasFallbackCandidates,
    now,
    throttleKey: probeThrottleKey,
    authStore,
    profileIds,
  });
  const reason =
    resolveProfilesUnavailableReason({ store: authStore, profileIds, now }) ?? "unknown";

  const skipWholeProvider = (): CooldownDecision => ({
    type: "skip",
    reason,
    error: `Provider ${candidate.provider} has ${reason} issue (skipping all models)`,
  });

  // Persistent auth problems: skip every model of the provider.
  if (reason === "auth" || reason === "auth_permanent") {
    return skipWholeProvider();
  }

  // Billing is semi-persistent: the user may fix their balance, or a transient
  // 402 might have been misclassified. Single-provider setups are probed on the
  // standard throttle so they can recover without a restart; when fallbacks
  // exist, only probe near cooldown expiry so the fallback chain stays preferred.
  if (reason === "billing") {
    const singleProviderProbe =
      isPrimary && !hasFallbackCandidates && isProbeThrottleOpen(now, probeThrottleKey);
    if (isPrimary && (probeAllowed || singleProviderProbe)) {
      return { type: "attempt", reason, markProbe: true };
    }
    return skipWholeProvider();
  }

  // Same-provider fallbacks only relax the cooldown on transient provider
  // limits, which are often model-scoped and can recover on a sibling model.
  const isTransientReason =
    reason === "rate_limit" || reason === "overloaded" || reason === "unknown";
  const attemptDespiteCooldown = isPrimary
    ? !params.requestedModel || probeAllowed
    : isTransientReason;

  if (attemptDespiteCooldown) {
    return {
      type: "attempt",
      reason,
      markProbe: isPrimary && probeAllowed,
    };
  }
  return {
    type: "skip",
    reason,
    error: `Provider ${candidate.provider} is in cooldown (all profiles unavailable)`,
  };
}

/**
 * Runs `params.run` against the requested provider/model and then against each
 * fallback candidate, in the order produced by `resolveFallbackCandidates`, until
 * one attempt succeeds.
 *
 * Cooldown handling (only when `cfg` is supplied): when every auth profile of a
 * candidate's provider is in cooldown, `resolveCooldownDecision` decides whether
 * the candidate is skipped (recorded as an attempt without running it) or probed
 * anyway. Transient-reason probes are limited to one per provider per run.
 *
 * @param params.runId Optional identifier forwarded to the decision log.
 * @param params.agentDir Passed to the auth profile store and used to scope the
 *   probe throttle key.
 * @param params.onError Awaited after each failed candidate that does not end
 *   the loop by throwing.
 * @returns The successful result together with the provider/model that produced
 *   it and the attempts recorded before it.
 * @throws Immediately rethrows an error whose message looks like a context
 *   overflow, a non-failover error from the last candidate, and any abort error
 *   propagated by `runFallbackAttempt`. When all candidates were skipped or
 *   failed, throws via `throwFallbackFailureSummary`.
 */
export async function runWithModelFallback<T>(params: {
  cfg: OpenClawConfig | undefined;
  provider: string;
  model: string;
  runId?: string;
  agentDir?: string;
  /** Optional explicit fallbacks list; when provided (even empty), replaces agents.defaults.model.fallbacks. */
  fallbacksOverride?: string[];
  run: ModelFallbackRunFn<T>;
  onError?: ModelFallbackErrorHandler;
}): Promise<ModelFallbackRunResult<T>> {
  const candidates = resolveFallbackCandidates({
    cfg: params.cfg,
    provider: params.provider,
    model: params.model,
    fallbacksOverride: params.fallbacksOverride,
  });
  // The auth profile store is only loaded when a config is supplied; without it
  // the cooldown checks below are bypassed entirely.
  const authStore = params.cfg
    ? ensureAuthProfileStore(params.agentDir, { allowKeychainPrompt: false })
    : null;
  const attempts: FallbackAttempt[] = [];
  let lastError: unknown;
  // Providers whose once-per-run transient cooldown probe has already been spent.
  const cooldownProbeUsedProviders = new Set<string>();

  const hasFallbackCandidates = candidates.length > 1;

  for (let i = 0; i < candidates.length; i += 1) {
    const candidate = candidates[i];
    const isPrimary = i === 0;
    // True when this candidate is exactly the provider/model pair the caller
    // passed in (plain string comparison against the raw parameters).
    const requestedModel =
      params.provider === candidate.provider && params.model === candidate.model;
    let runOptions: ModelFallbackRunOptions | undefined;
    let attemptedDuringCooldown = false;
    let transientProbeProviderForAttempt: string | null = null;
    if (authStore) {
      const profileIds = resolveAuthProfileOrder({
        cfg: params.cfg,
        store: authStore,
        provider: candidate.provider,
      });
      const isAnyProfileAvailable = profileIds.some((id) => !isProfileInCooldown(authStore, id));

      if (profileIds.length > 0 && !isAnyProfileAvailable) {
        // All profiles for this provider are in cooldown.
        const now = Date.now();
        const probeThrottleKey = resolveProbeThrottleKey(candidate.provider, params.agentDir);
        const decision = resolveCooldownDecision({
          candidate,
          isPrimary,
          requestedModel,
          hasFallbackCandidates,
          now,
          probeThrottleKey,
          authStore,
          profileIds,
        });

        if (decision.type === "skip") {
          // Record the skip as an attempt so it shows up in the failure summary.
          attempts.push({
            provider: candidate.provider,
            model: candidate.model,
            error: decision.error,
            reason: decision.reason,
          });
          logModelFallbackDecision({
            decision: "skip_candidate",
            runId: params.runId,
            requestedProvider: params.provider,
            requestedModel: params.model,
            candidate,
            attempt: i + 1,
            total: candidates.length,
            reason: decision.reason,
            error: decision.error,
            nextCandidate: candidates[i + 1],
            isPrimary,
            requestedModelMatched: requestedModel,
            fallbackConfigured: hasFallbackCandidates,
            profileCount: profileIds.length,
          });
          continue;
        }

        if (decision.markProbe) {
          markProbeAttempt(now, probeThrottleKey);
        }
        if (
          decision.reason === "rate_limit" ||
          decision.reason === "overloaded" ||
          decision.reason === "billing" ||
          decision.reason === "unknown"
        ) {
          // Probe at most once per provider per fallback run when all profiles
          // are cooldowned. Re-probing every same-provider candidate can stall
          // cross-provider fallback on providers with long internal retries.
          const isTransientCooldownReason =
            decision.reason === "rate_limit" ||
            decision.reason === "overloaded" ||
            decision.reason === "unknown";
          if (isTransientCooldownReason && cooldownProbeUsedProviders.has(candidate.provider)) {
            const error = `Provider ${candidate.provider} is in cooldown (probe already attempted this run)`;
            attempts.push({
              provider: candidate.provider,
              model: candidate.model,
              error,
              reason: decision.reason,
            });
            logModelFallbackDecision({
              decision: "skip_candidate",
              runId: params.runId,
              requestedProvider: params.provider,
              requestedModel: params.model,
              candidate,
              attempt: i + 1,
              total: candidates.length,
              reason: decision.reason,
              error,
              nextCandidate: candidates[i + 1],
              isPrimary,
              requestedModelMatched: requestedModel,
              fallbackConfigured: hasFallbackCandidates,
              profileCount: profileIds.length,
            });
            continue;
          }
          // Signal to the run callback that this attempt happens during cooldown.
          runOptions = { allowTransientCooldownProbe: true };
          if (isTransientCooldownReason) {
            // Remembered so a failure below can mark the provider's probe as used.
            transientProbeProviderForAttempt = candidate.provider;
          }
        }
        attemptedDuringCooldown = true;
        logModelFallbackDecision({
          decision: "probe_cooldown_candidate",
          runId: params.runId,
          requestedProvider: params.provider,
          requestedModel: params.model,
          candidate,
          attempt: i + 1,
          total: candidates.length,
          reason: decision.reason,
          nextCandidate: candidates[i + 1],
          isPrimary,
          requestedModelMatched: requestedModel,
          fallbackConfigured: hasFallbackCandidates,
          allowTransientCooldownProbe: runOptions?.allowTransientCooldownProbe,
          profileCount: profileIds.length,
        });
      }
    }

    // Run the candidate. Explicit (non-timeout) aborts that cannot be normalized
    // into a failover error are thrown from inside this call and end the loop.
    const attemptRun = await runFallbackAttempt({
      run: params.run,
      ...candidate,
      attempts,
      options: runOptions,
    });
    if ("success" in attemptRun) {
      // Only log the success when something preceded it: a fallback candidate,
      // an earlier skipped/failed attempt, or a cooldown probe.
      if (i > 0 || attempts.length > 0 || attemptedDuringCooldown) {
        logModelFallbackDecision({
          decision: "candidate_succeeded",
          runId: params.runId,
          requestedProvider: params.provider,
          requestedModel: params.model,
          candidate,
          attempt: i + 1,
          total: candidates.length,
          previousAttempts: attempts,
          isPrimary,
          requestedModelMatched: requestedModel,
          fallbackConfigured: hasFallbackCandidates,
        });
      }
      // Warn when a fallback succeeded after an earlier model was not found.
      const notFoundAttempt =
        i > 0 ? attempts.find((a) => a.reason === "model_not_found") : undefined;
      if (notFoundAttempt) {
        log.warn(
          `Model "${sanitizeForLog(notFoundAttempt.provider)}/${sanitizeForLog(notFoundAttempt.model)}" not found. Fell back to "${sanitizeForLog(candidate.provider)}/${sanitizeForLog(candidate.model)}".`,
        );
      }
      return attemptRun.success;
    }
    const err = attemptRun.error;
    {
      if (transientProbeProviderForAttempt) {
        const probeFailureReason = describeFailoverError(err).reason;
        // These failure reasons leave the provider's once-per-run probe slot
        // unspent; any other failure marks the slot as used.
        const shouldPreserveTransientProbeSlot =
          probeFailureReason === "model_not_found" ||
          probeFailureReason === "format" ||
          probeFailureReason === "auth" ||
          probeFailureReason === "auth_permanent" ||
          probeFailureReason === "session_expired";
        if (!shouldPreserveTransientProbeSlot) {
          cooldownProbeUsedProviders.add(transientProbeProviderForAttempt);
        }
      }
      // Context overflow errors should be handled by the inner runner's
      // compaction/retry logic, not by model fallback.  If one escapes as a
      // throw, rethrow it immediately rather than trying a different model
      // that may have a smaller context window and fail worse.
      const errMessage = err instanceof Error ? err.message : String(err);
      if (isLikelyContextOverflowError(errMessage)) {
        throw err;
      }
      const normalized =
        coerceToFailoverError(err, {
          provider: candidate.provider,
          model: candidate.model,
        }) ?? err;

      // Even unrecognized errors should not abort the fallback loop when
      // there are remaining candidates.  Only abort/context-overflow errors
      // (handled above) are truly non-retryable.
      const isKnownFailover = isFailoverError(normalized);
      if (!isKnownFailover && i === candidates.length - 1) {
        // Last candidate and not a failover error: surface the original error
        // without recording an attempt or calling onError.
        throw err;
      }

      lastError = isKnownFailover ? normalized : err;
      const described = describeFailoverError(normalized);
      attempts.push({
        provider: candidate.provider,
        model: candidate.model,
        error: described.message,
        reason: described.reason ?? "unknown",
        status: described.status,
        code: described.code,
      });
      logModelFallbackDecision({
        decision: "candidate_failed",
        runId: params.runId,
        requestedProvider: params.provider,
        requestedModel: params.model,
        candidate,
        attempt: i + 1,
        total: candidates.length,
        reason: described.reason,
        status: described.status,
        code: described.code,
        error: described.message,
        nextCandidate: candidates[i + 1],
        isPrimary,
        requestedModelMatched: requestedModel,
        fallbackConfigured: hasFallbackCandidates,
      });
      await params.onError?.({
        provider: candidate.provider,
        model: candidate.model,
        error: isKnownFailover ? normalized : err,
        attempt: i + 1,
        total: candidates.length,
      });
    }
  }

  // Every candidate was skipped or failed.
  throwFallbackFailureSummary({
    attempts,
    candidates,
    lastError,
    label: "models",
    formatAttempt: (attempt) =>
      `${attempt.provider}/${attempt.model}: ${attempt.error}${
        attempt.reason ? ` (${attempt.reason})` : ""
      }`,
  });
}

/**
 * Runs `params.run` against each image-model candidate in order until one
 * succeeds. Candidates come from `resolveImageFallbackCandidates` using
 * `DEFAULT_PROVIDER` as the default provider. No cooldown logic is applied here.
 *
 * @param params.modelOverride Optional model reference used instead of the
 *   configured primary image model.
 * @param params.onError Awaited after each failed candidate.
 * @returns The successful result with the provider/model that produced it and
 *   the failed attempts recorded before it.
 * @throws When no candidate is configured; when `runFallbackAttempt` propagates
 *   an abort; or, after all candidates failed, via `throwFallbackFailureSummary`.
 */
export async function runWithImageModelFallback<T>(params: {
  cfg: OpenClawConfig | undefined;
  modelOverride?: string;
  run: (provider: string, model: string) => Promise<T>;
  onError?: ModelFallbackErrorHandler;
}): Promise<ModelFallbackRunResult<T>> {
  const candidates = resolveImageFallbackCandidates({
    cfg: params.cfg,
    defaultProvider: DEFAULT_PROVIDER,
    modelOverride: params.modelOverride,
  });
  const total = candidates.length;
  if (total === 0) {
    throw new Error(
      "No image model configured. Set agents.defaults.imageModel.primary or agents.defaults.imageModel.fallbacks.",
    );
  }

  const attempts: FallbackAttempt[] = [];
  let lastError: unknown;

  for (const [index, candidate] of candidates.entries()) {
    const outcome = await runFallbackAttempt({ run: params.run, ...candidate, attempts });
    if ("success" in outcome) {
      return outcome.success;
    }

    const failure = outcome.error;
    lastError = failure;
    const message = failure instanceof Error ? failure.message : String(failure);
    attempts.push({
      provider: candidate.provider,
      model: candidate.model,
      error: message,
    });
    await params.onError?.({
      provider: candidate.provider,
      model: candidate.model,
      error: failure,
      attempt: index + 1,
      total: candidates.length,
    });
  }

  throwFallbackFailureSummary({
    attempts,
    candidates,
    lastError,
    label: "image models",
    formatAttempt: (attempt) => `${attempt.provider}/${attempt.model}: ${attempt.error}`,
  });
}
-												refactor: rename to openclaw

											
										
										
											2026-01-30 03:15:10 +01:00
+								import type { OpenClawConfig } from "../config/config.js";
-												fix(agents): fall back to agents.defaults.model when agent has no model config (#24210)

Merged via /review-pr -> /prepare-pr -> /merge-pr.

Prepared head SHA: 0f272b102763736001a82cfda23f35ff2ee9cac8
Co-authored-by: bianbiandashen <16240681+bianbiandashen@users.noreply.github.com>
Co-authored-by: gumadeiras <5599352+gumadeiras@users.noreply.github.com>
Reviewed-by: @gumadeiras

											
										
										
											2026-02-23 16:18:55 +08:00
+								import {
 								  resolveAgentModelFallbackValues,
 								  resolveAgentModelPrimaryValue,
 								} from "../config/model-input.js";
-												fix(agents): warn clearly on unresolved model ids (#39215, thanks @ademczuk)

Co-authored-by: ademczuk <andrew.demczuk@gmail.com>

											
										
										
											2026-03-07 22:50:27 +00:00
+								import { createSubsystemLogger } from "../logging/subsystem.js";
 								import { sanitizeForLog } from "../terminal/ansi.js";
-												chore: Enable "experimentalSortImports" in Oxfmt and reformat all imorts.

											
										
										
											2026-02-01 10:03:47 +09:00
+								import {
 								  ensureAuthProfileStore,
-												fix: auto-recover primary model after rate-limit cooldown expires (#17478) (#18045)

Merged via /review-pr -> /prepare-pr -> /merge-pr.

Prepared head SHA: f7a7865727a9aee0aaa3d929cce65dc46c3db234
Co-authored-by: PlayerGhost <28265945+PlayerGhost@users.noreply.github.com>
Co-authored-by: sebslight <19554889+sebslight@users.noreply.github.com>
Reviewed-by: @sebslight

											
										
										
											2026-02-16 10:03:35 -03:00
+								  getSoonestCooldownExpiry,
-												chore: Enable "experimentalSortImports" in Oxfmt and reformat all imorts.

											
										
										
											2026-02-01 10:03:47 +09:00
+								  isProfileInCooldown,
-												Agents: infer auth-profile unavailable failover reason

											
										
										
											2026-02-22 16:10:24 -08:00
+								  resolveProfilesUnavailableReason,
-												chore: Enable "experimentalSortImports" in Oxfmt and reformat all imorts.

											
										
										
											2026-02-01 10:03:47 +09:00
+								  resolveAuthProfileOrder,
 								} from "./auth-profiles.js";
-												feat: add models scan and fallbacks

											
										
										
											2026-01-04 17:50:55 +01:00
+								import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js";
-												fix(fallback): handle timeout aborts

Co-authored-by: Mykyta Bozhenko <21245729+cheeeee@users.noreply.github.com>

											
										
										
											2026-01-18 07:52:19 +00:00
+								import {
 								  coerceToFailoverError,
 								  describeFailoverError,
 								  isFailoverError,
 								  isTimeoutError,
 								} from "./failover-error.js";
-												Agents: add fallback error observations (#41337)

Merged via squash.

Prepared head SHA: 852469c82ff28fb0e1be7f1019f5283e712c4283
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Reviewed-by: @altaywtf
											
										
										
											2026-03-10 01:12:10 +03:00
+								import { logModelFallbackDecision } from "./model-fallback-observation.js";
 								import type { FallbackAttempt, ModelCandidate } from "./model-fallback.types.js";
-												feat: add models scan and fallbacks

											
										
										
											2026-01-04 17:50:55 +01:00
+								import {
-												chore: apply local workspace updates (#9911)

* chore: apply local workspace updates

* fix: resolve prep findings after rebase (#9898) (thanks @gumadeiras)

* refactor: centralize model allowlist normalization (#9898) (thanks @gumadeiras)

* fix: guard model allowlist initialization (#9911)

* docs: update changelog scope for #9911

* docs: remove model names from changelog entry (#9911)

* fix: satisfy type-aware lint in model allowlist (#9911)
											
										
										
											2026-02-05 16:54:44 -05:00
+								  buildConfiguredAllowlistKeys,
-												feat: add models scan and fallbacks

											
										
										
											2026-01-04 17:50:55 +01:00
+								  buildModelAliasIndex,
 								  modelKey,
-												Agents: add nested subagent orchestration controls and reduce subagent token waste (#14447)

* Agents: add subagent orchestration controls

* Agents: add subagent orchestration controls (WIP uncommitted changes)

* feat(subagents): add depth-based spawn gating for sub-sub-agents

* feat(subagents): tool policy, registry, and announce chain for nested agents

* feat(subagents): system prompt, docs, changelog for nested sub-agents

* fix(subagents): prevent model fallback override, show model during active runs, and block context overflow fallback

Bug 1: When a session has an explicit model override (e.g., gpt/openai-codex),
the fallback candidate logic in resolveFallbackCandidates silently appended the
global primary model (opus) as a backstop. On reinjection/steer with a transient
error, the session could fall back to opus which has a smaller context window
and crash. Fix: when storedModelOverride is set, pass fallbacksOverride ?? []
instead of undefined, preventing the implicit primary backstop.

Bug 2: Active subagents showed 'model n/a' in /subagents list because
resolveModelDisplay only read entry.model/modelProvider (populated after run
completes). Fix: fall back to modelOverride/providerOverride fields which are
populated at spawn time via sessions.patch.

Bug 3: Context overflow errors (prompt too long, context_length_exceeded) could
theoretically escape runEmbeddedPiAgent and be treated as failover candidates
in runWithModelFallback, causing a switch to a model with a smaller context
window. Fix: in runWithModelFallback, detect context overflow errors via
isLikelyContextOverflowError and rethrow them immediately instead of trying the
next model candidate.

* fix(subagents): track spawn depth in session store and fix announce routing for nested agents

* Fix compaction status tracking and dedupe overflow compaction triggers

* fix(subagents): enforce depth block via session store and implement cascade kill

* fix: inject group chat context into system prompt

* fix(subagents): always write model to session store at spawn time

* Preserve spawnDepth when agent handler rewrites session entry

* fix(subagents): suppress announce on steer-restart

* fix(subagents): fallback spawned session model to runtime default

* fix(subagents): enforce spawn depth when caller key resolves by sessionId

* feat(subagents): implement active-first ordering for numeric targets and enhance task display

- Added a test to verify that subagents with numeric targets follow an active-first list ordering.
- Updated `resolveSubagentTarget` to sort subagent runs based on active status and recent activity.
- Enhanced task display in command responses to prevent truncation of long task descriptions.
- Introduced new utility functions for compacting task text and managing subagent run states.

* fix(subagents): show model for active runs via run record fallback

When the spawned model matches the agent's default model, the session
store's override fields are intentionally cleared (isDefault: true).
The model/modelProvider fields are only populated after the run
completes. This left active subagents showing 'model n/a'.

Fix: store the resolved model on SubagentRunRecord at registration
time, and use it as a fallback in both display paths (subagents tool
and /subagents command) when the session store entry has no model info.

Changes:
- SubagentRunRecord: add optional model field
- registerSubagentRun: accept and persist model param
- sessions-spawn-tool: pass resolvedModel to registerSubagentRun
- subagents-tool: pass run record model as fallback to resolveModelDisplay
- commands-subagents: pass run record model as fallback to resolveModelDisplay

* feat(chat): implement session key resolution and reset on sidebar navigation

- Added functions to resolve the main session key and reset chat state when switching sessions from the sidebar.
- Updated the `renderTab` function to handle session key changes when navigating to the chat tab.
- Introduced a test to verify that the session resets to "main" when opening chat from the sidebar navigation.

* fix: subagent timeout=0 passthrough and fallback prompt duplication

Bug 1: runTimeoutSeconds=0 now means 'no timeout' instead of applying 600s default
- sessions-spawn-tool: default to undefined (not 0) when neither timeout param
  is provided; use != null check so explicit 0 passes through to gateway
- agent.ts: accept 0 as valid timeout (resolveAgentTimeoutMs already handles
  0 → MAX_SAFE_TIMEOUT_MS)

Bug 2: model fallback no longer re-injects the original prompt as a duplicate
- agent.ts: track fallback attempt index; on retries use a short continuation
  message instead of the full original prompt since the session file already
  contains it from the first attempt
- Also skip re-sending images on fallback retries (already in session)

* feat(subagents): truncate long task descriptions in subagents command output

- Introduced a new utility function to format task previews, limiting their length to improve readability.
- Updated the command handler to use the new formatting function, ensuring task descriptions are truncated appropriately.
- Adjusted related tests to verify that long task descriptions are now truncated in the output.

* refactor(subagents): update subagent registry path resolution and improve command output formatting

- Replaced direct import of STATE_DIR with a utility function to resolve the state directory dynamically.
- Enhanced the formatting of command output for active and recent subagents, adding separators for better readability.
- Updated related tests to reflect changes in command output structure.

* fix(subagent): default sessions_spawn to no timeout when runTimeoutSeconds omitted

The previous fix (75a791106) correctly handled the case where
runTimeoutSeconds was explicitly set to 0 ("no timeout"). However,
when models omit the parameter entirely (which is common since the
schema marks it as optional), runTimeoutSeconds resolved to undefined.

undefined flowed through the chain as:
  sessions_spawn → timeout: undefined (since undefined != null is false)
  → gateway agent handler → agentCommand opts.timeout: undefined
  → resolveAgentTimeoutMs({ overrideSeconds: undefined })
  → DEFAULT_AGENT_TIMEOUT_SECONDS (600s = 10 minutes)

This caused subagents to be killed at exactly 10 minutes even though
the user's intent (via TOOLS.md) was for subagents to run without a
timeout.

Fix: default runTimeoutSeconds to 0 (no timeout) when neither
runTimeoutSeconds nor timeoutSeconds is provided by the caller.
Subagent spawns are long-running by design and should not inherit the
600s agent-command default timeout.

* fix(subagent): accept timeout=0 in agent-via-gateway path (second 600s default)

* fix: thread timeout override through getReplyFromConfig dispatch path

getReplyFromConfig called resolveAgentTimeoutMs({ cfg }) with no override,
always falling back to the config default (600s). Add timeoutOverrideSeconds
to GetReplyOptions and pass it through as overrideSeconds so callers of the
dispatch chain can specify a custom timeout (0 = no timeout).

This complements the existing timeout threading in agentCommand and the
cron isolated-agent runner, which already pass overrideSeconds correctly.

* feat(model-fallback): normalize OpenAI Codex model references and enhance fallback handling

- Added normalization for OpenAI Codex model references, specifically converting "gpt-5.3-codex" to "openai-codex" before execution.
- Updated the `resolveFallbackCandidates` function to utilize the new normalization logic.
- Enhanced tests to verify the correct behavior of model normalization and fallback mechanisms.
- Introduced a new test case to ensure that the normalization process works as expected for various input formats.

* feat(tests): add unit tests for steer failure behavior in openclaw-tools

- Introduced a new test file to validate the behavior of subagents when steer replacement dispatch fails.
- Implemented tests to ensure that the announce behavior is restored correctly and that the suppression reason is cleared as expected.
- Enhanced the subagent registry with a new function to clear steer restart suppression.
- Updated related components to support the new test scenarios.

* fix(subagents): replace stop command with kill in slash commands and documentation

- Updated the `/subagents` command to replace `stop` with `kill` for consistency in controlling sub-agent runs.
- Modified related documentation to reflect the change in command usage.
- Removed legacy timeoutSeconds references from the sessions-spawn-tool schema and tests to streamline timeout handling.
- Enhanced tests to ensure correct behavior of the updated commands and their interactions.

* feat(tests): add unit tests for readLatestAssistantReply function

- Introduced a new test file for the `readLatestAssistantReply` function to validate its behavior with various message scenarios.
- Implemented tests to ensure the function correctly retrieves the latest assistant message and handles cases where the latest message has no text.
- Mocked the gateway call to simulate different message histories for comprehensive testing.

* feat(tests): enhance subagent kill-all cascade tests and announce formatting

- Added a new test to verify that the `kill-all` command cascades through ended parents to active descendants in subagents.
- Updated the subagent announce formatting tests to reflect changes in message structure, including the replacement of "Findings:" with "Result:" and the addition of new expectations for message content.
- Improved the handling of long findings and stats in the announce formatting logic to ensure concise output.
- Refactored related functions to enhance clarity and maintainability in the subagent registry and tools.

* refactor(subagent): update announce formatting and remove unused constants

- Modified the subagent announce formatting to replace "Findings:" with "Result:" and adjusted related expectations in tests.
- Removed constants for maximum announce findings characters and summary words, simplifying the announcement logic.
- Updated the handling of findings to retain full content instead of truncating, ensuring more informative outputs.
- Cleaned up unused imports in the commands-subagents file to enhance code clarity.

* feat(tests): enhance billing error handling in user-facing text

- Added tests to ensure that normal text mentioning billing plans is not rewritten, preserving user context.
- Updated the `isBillingErrorMessage` and `sanitizeUserFacingText` functions to improve handling of billing-related messages.
- Introduced new test cases for various scenarios involving billing messages to ensure accurate processing and output.
- Enhanced the subagent announce flow to correctly manage active descendant runs, preventing premature announcements.

* feat(subagent): enhance workflow guidance and auto-announcement clarity

- Added a new guideline in the subagent system prompt to emphasize trust in push-based completion, discouraging busy polling for status updates.
- Updated documentation to clarify that sub-agents will automatically announce their results, improving user understanding of the workflow.
- Enhanced tests to verify the new guidance on avoiding polling loops and to ensure the accuracy of the updated prompts.

* fix(cron): avoid announcing interim subagent spawn acks

* chore: clean post-rebase imports

* fix(cron): fall back to child replies when parent stays interim

* fix(subagents): make active-run guidance advisory

* fix(subagents): update announce flow to handle active descendants and enhance test coverage

- Modified the announce flow to defer announcements when active descendant runs are present, ensuring accurate status reporting.
- Updated tests to verify the new behavior, including scenarios where no fallback requester is available and ensuring proper handling of finished subagents.
- Enhanced the announce formatting to include an `expectFinal` flag for better clarity in the announcement process.

* fix(subagents): enhance announce flow and formatting for user updates

- Updated the announce flow to provide clearer instructions for user updates based on active subagent runs and requester context.
- Refactored the announcement logic to improve clarity and ensure internal context remains private.
- Enhanced tests to verify the new message expectations and formatting, including updated prompts for user-facing updates.
- Introduced a new function to build reply instructions based on session context, improving the overall announcement process.

* fix: resolve prep blockers and changelog placement (#14447) (thanks @tyler6204)

* fix: restore cron delivery-plan import after rebase (#14447) (thanks @tyler6204)

* fix: resolve test failures from rebase conflicts (#14447) (thanks @tyler6204)

* fix: apply formatting after rebase (#14447) (thanks @tyler6204)
											
										
										
											2026-02-14 22:03:45 -08:00
+								  normalizeModelRef,
-												fix: harden Gmail hook model defaults (#472) (thanks @koala73)

											
										
										
											2026-01-09 19:59:45 +01:00
+								  resolveConfiguredModelRef,
-												feat: add models scan and fallbacks

											
										
										
											2026-01-04 17:50:55 +01:00
+								  resolveModelRefFromString,
 								} from "./model-selection.js";
-												style: align formatting with oxfmt 0.33

											
										
										
											2026-02-18 01:34:35 +00:00
+								import type { FailoverReason } from "./pi-embedded-helpers.js";
-												Agents: add nested subagent orchestration controls and reduce subagent token waste (#14447)

* Agents: add subagent orchestration controls

* Agents: add subagent orchestration controls (WIP uncommitted changes)

* feat(subagents): add depth-based spawn gating for sub-sub-agents

* feat(subagents): tool policy, registry, and announce chain for nested agents

* feat(subagents): system prompt, docs, changelog for nested sub-agents

* fix(subagents): prevent model fallback override, show model during active runs, and block context overflow fallback

Bug 1: When a session has an explicit model override (e.g., gpt/openai-codex),
the fallback candidate logic in resolveFallbackCandidates silently appended the
global primary model (opus) as a backstop. On reinjection/steer with a transient
error, the session could fall back to opus which has a smaller context window
and crash. Fix: when storedModelOverride is set, pass fallbacksOverride ?? []
instead of undefined, preventing the implicit primary backstop.

Bug 2: Active subagents showed 'model n/a' in /subagents list because
resolveModelDisplay only read entry.model/modelProvider (populated after run
completes). Fix: fall back to modelOverride/providerOverride fields which are
populated at spawn time via sessions.patch.

Bug 3: Context overflow errors (prompt too long, context_length_exceeded) could
theoretically escape runEmbeddedPiAgent and be treated as failover candidates
in runWithModelFallback, causing a switch to a model with a smaller context
window. Fix: in runWithModelFallback, detect context overflow errors via
isLikelyContextOverflowError and rethrow them immediately instead of trying the
next model candidate.

* fix(subagents): track spawn depth in session store and fix announce routing for nested agents

* Fix compaction status tracking and dedupe overflow compaction triggers

* fix(subagents): enforce depth block via session store and implement cascade kill

* fix: inject group chat context into system prompt

* fix(subagents): always write model to session store at spawn time

* Preserve spawnDepth when agent handler rewrites session entry

* fix(subagents): suppress announce on steer-restart

* fix(subagents): fallback spawned session model to runtime default

* fix(subagents): enforce spawn depth when caller key resolves by sessionId

* feat(subagents): implement active-first ordering for numeric targets and enhance task display

- Added a test to verify that subagents with numeric targets follow an active-first list ordering.
- Updated `resolveSubagentTarget` to sort subagent runs based on active status and recent activity.
- Enhanced task display in command responses to prevent truncation of long task descriptions.
- Introduced new utility functions for compacting task text and managing subagent run states.

* fix(subagents): show model for active runs via run record fallback

When the spawned model matches the agent's default model, the session
store's override fields are intentionally cleared (isDefault: true).
The model/modelProvider fields are only populated after the run
completes. This left active subagents showing 'model n/a'.

Fix: store the resolved model on SubagentRunRecord at registration
time, and use it as a fallback in both display paths (subagents tool
and /subagents command) when the session store entry has no model info.

Changes:
- SubagentRunRecord: add optional model field
- registerSubagentRun: accept and persist model param
- sessions-spawn-tool: pass resolvedModel to registerSubagentRun
- subagents-tool: pass run record model as fallback to resolveModelDisplay
- commands-subagents: pass run record model as fallback to resolveModelDisplay

* feat(chat): implement session key resolution and reset on sidebar navigation

- Added functions to resolve the main session key and reset chat state when switching sessions from the sidebar.
- Updated the `renderTab` function to handle session key changes when navigating to the chat tab.
- Introduced a test to verify that the session resets to "main" when opening chat from the sidebar navigation.

* fix: subagent timeout=0 passthrough and fallback prompt duplication

Bug 1: runTimeoutSeconds=0 now means 'no timeout' instead of applying 600s default
- sessions-spawn-tool: default to undefined (not 0) when neither timeout param
  is provided; use != null check so explicit 0 passes through to gateway
- agent.ts: accept 0 as valid timeout (resolveAgentTimeoutMs already handles
  0 → MAX_SAFE_TIMEOUT_MS)

Bug 2: model fallback no longer re-injects the original prompt as a duplicate
- agent.ts: track fallback attempt index; on retries use a short continuation
  message instead of the full original prompt since the session file already
  contains it from the first attempt
- Also skip re-sending images on fallback retries (already in session)

* feat(subagents): truncate long task descriptions in subagents command output

- Introduced a new utility function to format task previews, limiting their length to improve readability.
- Updated the command handler to use the new formatting function, ensuring task descriptions are truncated appropriately.
- Adjusted related tests to verify that long task descriptions are now truncated in the output.

* refactor(subagents): update subagent registry path resolution and improve command output formatting

- Replaced direct import of STATE_DIR with a utility function to resolve the state directory dynamically.
- Enhanced the formatting of command output for active and recent subagents, adding separators for better readability.
- Updated related tests to reflect changes in command output structure.

* fix(subagent): default sessions_spawn to no timeout when runTimeoutSeconds omitted

The previous fix (75a791106) correctly handled the case where
runTimeoutSeconds was explicitly set to 0 ("no timeout"). However,
when models omit the parameter entirely (which is common since the
schema marks it as optional), runTimeoutSeconds resolved to undefined.

undefined flowed through the chain as:
  sessions_spawn → timeout: undefined (since undefined != null is false)
  → gateway agent handler → agentCommand opts.timeout: undefined
  → resolveAgentTimeoutMs({ overrideSeconds: undefined })
  → DEFAULT_AGENT_TIMEOUT_SECONDS (600s = 10 minutes)

This caused subagents to be killed at exactly 10 minutes even though
the user's intent (via TOOLS.md) was for subagents to run without a
timeout.

Fix: default runTimeoutSeconds to 0 (no timeout) when neither
runTimeoutSeconds nor timeoutSeconds is provided by the caller.
Subagent spawns are long-running by design and should not inherit the
600s agent-command default timeout.

* fix(subagent): accept timeout=0 in agent-via-gateway path (second 600s default)

* fix: thread timeout override through getReplyFromConfig dispatch path

getReplyFromConfig called resolveAgentTimeoutMs({ cfg }) with no override,
always falling back to the config default (600s). Add timeoutOverrideSeconds
to GetReplyOptions and pass it through as overrideSeconds so callers of the
dispatch chain can specify a custom timeout (0 = no timeout).

This complements the existing timeout threading in agentCommand and the
cron isolated-agent runner, which already pass overrideSeconds correctly.

* feat(model-fallback): normalize OpenAI Codex model references and enhance fallback handling

- Added normalization for OpenAI Codex model references, specifically converting "gpt-5.3-codex" to "openai-codex" before execution.
- Updated the `resolveFallbackCandidates` function to utilize the new normalization logic.
- Enhanced tests to verify the correct behavior of model normalization and fallback mechanisms.
- Introduced a new test case to ensure that the normalization process works as expected for various input formats.

* feat(tests): add unit tests for steer failure behavior in openclaw-tools

- Introduced a new test file to validate the behavior of subagents when steer replacement dispatch fails.
- Implemented tests to ensure that the announce behavior is restored correctly and that the suppression reason is cleared as expected.
- Enhanced the subagent registry with a new function to clear steer restart suppression.
- Updated related components to support the new test scenarios.

* fix(subagents): replace stop command with kill in slash commands and documentation

- Updated the `/subagents` command to replace `stop` with `kill` for consistency in controlling sub-agent runs.
- Modified related documentation to reflect the change in command usage.
- Removed legacy timeoutSeconds references from the sessions-spawn-tool schema and tests to streamline timeout handling.
- Enhanced tests to ensure correct behavior of the updated commands and their interactions.

* feat(tests): add unit tests for readLatestAssistantReply function

- Introduced a new test file for the `readLatestAssistantReply` function to validate its behavior with various message scenarios.
- Implemented tests to ensure the function correctly retrieves the latest assistant message and handles cases where the latest message has no text.
- Mocked the gateway call to simulate different message histories for comprehensive testing.

* feat(tests): enhance subagent kill-all cascade tests and announce formatting

- Added a new test to verify that the `kill-all` command cascades through ended parents to active descendants in subagents.
- Updated the subagent announce formatting tests to reflect changes in message structure, including the replacement of "Findings:" with "Result:" and the addition of new expectations for message content.
- Improved the handling of long findings and stats in the announce formatting logic to ensure concise output.
- Refactored related functions to enhance clarity and maintainability in the subagent registry and tools.

* refactor(subagent): update announce formatting and remove unused constants

- Modified the subagent announce formatting to replace "Findings:" with "Result:" and adjusted related expectations in tests.
- Removed constants for maximum announce findings characters and summary words, simplifying the announcement logic.
- Updated the handling of findings to retain full content instead of truncating, ensuring more informative outputs.
- Cleaned up unused imports in the commands-subagents file to enhance code clarity.

* feat(tests): enhance billing error handling in user-facing text

- Added tests to ensure that normal text mentioning billing plans is not rewritten, preserving user context.
- Updated the `isBillingErrorMessage` and `sanitizeUserFacingText` functions to improve handling of billing-related messages.
- Introduced new test cases for various scenarios involving billing messages to ensure accurate processing and output.
- Enhanced the subagent announce flow to correctly manage active descendant runs, preventing premature announcements.

* feat(subagent): enhance workflow guidance and auto-announcement clarity

- Added a new guideline in the subagent system prompt to emphasize trust in push-based completion, discouraging busy polling for status updates.
- Updated documentation to clarify that sub-agents will automatically announce their results, improving user understanding of the workflow.
- Enhanced tests to verify the new guidance on avoiding polling loops and to ensure the accuracy of the updated prompts.

* fix(cron): avoid announcing interim subagent spawn acks

* chore: clean post-rebase imports

* fix(cron): fall back to child replies when parent stays interim

* fix(subagents): make active-run guidance advisory

* fix(subagents): update announce flow to handle active descendants and enhance test coverage

- Modified the announce flow to defer announcements when active descendant runs are present, ensuring accurate status reporting.
- Updated tests to verify the new behavior, including scenarios where no fallback requester is available and ensuring proper handling of finished subagents.
- Enhanced the announce formatting to include an `expectFinal` flag for better clarity in the announcement process.

* fix(subagents): enhance announce flow and formatting for user updates

- Updated the announce flow to provide clearer instructions for user updates based on active subagent runs and requester context.
- Refactored the announcement logic to improve clarity and ensure internal context remains private.
- Enhanced tests to verify the new message expectations and formatting, including updated prompts for user-facing updates.
- Introduced a new function to build reply instructions based on session context, improving the overall announcement process.

* fix: resolve prep blockers and changelog placement (#14447) (thanks @tyler6204)

* fix: restore cron delivery-plan import after rebase (#14447) (thanks @tyler6204)

* fix: resolve test failures from rebase conflicts (#14447) (thanks @tyler6204)

* fix: apply formatting after rebase (#14447) (thanks @tyler6204)
											
										
										
											2026-02-14 22:03:45 -08:00
+								import { isLikelyContextOverflowError } from "./pi-embedded-helpers.js";
-												feat: add models scan and fallbacks

											
										
										
											2026-01-04 17:50:55 +01:00
-												fix(agents): warn clearly on unresolved model ids (#39215, thanks @ademczuk)

Co-authored-by: ademczuk <andrew.demczuk@gmail.com>

											
										
										
											2026-03-07 22:50:27 +00:00
// Logger for this module, created under the "model-fallback" subsystem name.
const log = createSubsystemLogger("model-fallback");
-												fix(agents): honor explicit rate-limit cooldown probes in fallback runs

											
										
										
											2026-03-05 20:02:36 -08:00
/**
 * Optional settings handed to a fallback run callback as its third argument.
 */
export type ModelFallbackRunOptions = {
  /**
   * NOTE(review): presumably permits the run callback to probe a provider that
   * is currently in a transient (e.g. rate-limit) cooldown — confirm against
   * the callers that set this flag.
   */
  allowTransientCooldownProbe?: boolean;
};
 								/**
 								 * Callback that performs one run against a specific provider/model pair.
 								 *
 								 * It receives the provider id, the model id and optional per-run options,
 								 * and resolves with the run's result of type `T`.
 								 */
 								type ModelFallbackRunFn<T> = (
 								  provider: string,
 								  model: string,
 								  options?: ModelFallbackRunOptions,
 								) => Promise<T>;
-												Deduplicate more

											
										
										
											2026-02-09 18:56:58 -08:00
/**
 * Fallback abort check. Only treats explicit AbortError names as user aborts.
 * Message-based checks (e.g., "aborted") can mask timeouts and skip fallback.
 *
 * @param err - Value caught from a run attempt; may be of any type.
 * @returns `true` only when `err` is a non-null object, is not a failover
 *   error, and has a `name` property that stringifies to `"AbortError"`.
 */
function isFallbackAbortError(err: unknown): boolean {
  // Primitives, null and undefined cannot carry an error name.
  if (!err || typeof err !== "object") {
    return false;
  }
  // Failover errors are never classified as aborts, whatever their name is.
  if (isFailoverError(err)) {
    return false;
  }
  const name = "name" in err ? String(err.name) : "";
  return name === "AbortError";
}
-												fix(fallback): handle timeout aborts

Co-authored-by: Mykyta Bozhenko <21245729+cheeeee@users.noreply.github.com>

											
										
										
											2026-01-18 07:52:19 +00:00
/**
 * Tells whether a caught error is an abort that is not also a timeout.
 *
 * @param err - Value caught from a run attempt.
 * @returns `true` when `isFallbackAbortError(err)` holds and
 *   `isTimeoutError(err)` does not; `false` otherwise.
 */
function shouldRethrowAbort(err: unknown): boolean {
  return isFallbackAbortError(err) && !isTimeoutError(err);
}
-												refactor(agents): dedupe model fallback candidate logic

											
										
										
											2026-02-15 06:07:01 +00:00
/**
 * Creates an insertion-ordered, de-duplicated collector of model candidates.
 *
 * @param allowlist - Set of model keys (as produced by `modelKey`). When it is
 *   a set, candidates added through `addAllowlistedCandidate` are dropped
 *   unless their key is in it. `null`/`undefined` disables that filter.
 * @returns The live `candidates` array (appended to by the two adders) plus:
 *   - `addExplicitCandidate`: adds a candidate without consulting the allowlist;
 *   - `addAllowlistedCandidate`: adds a candidate only if the allowlist permits it.
 */
function createModelCandidateCollector(allowlist: Set<string> | null | undefined): {
  candidates: ModelCandidate[];
  addExplicitCandidate: (candidate: ModelCandidate) => void;
  addAllowlistedCandidate: (candidate: ModelCandidate) => void;
} {
  // Keys of candidates already accepted; used to keep the list free of duplicates.
  const seen = new Set<string>();
  const candidates: ModelCandidate[] = [];

  const addCandidate = (candidate: ModelCandidate, enforceAllowlist: boolean) => {
    // Ignore candidates that lack a provider or a model.
    if (!candidate.provider || !candidate.model) {
      return;
    }
    const key = modelKey(candidate.provider, candidate.model);
    if (seen.has(key)) {
      return;
    }
    // A candidate rejected here is not marked as seen, so a later explicit
    // add of the same key is still accepted.
    if (enforceAllowlist && allowlist && !allowlist.has(key)) {
      return;
    }
    seen.add(key);
    candidates.push(candidate);
  };

  const addExplicitCandidate = (candidate: ModelCandidate) => {
    addCandidate(candidate, false);
  };
  const addAllowlistedCandidate = (candidate: ModelCandidate) => {
    addCandidate(candidate, true);
  };

  return { candidates, addExplicitCandidate, addAllowlistedCandidate };
}
 								/**
 								 * Callback shape that receives the details of one failed fallback attempt.
 								 * It may complete synchronously or return a promise.
 								 */
 								type ModelFallbackErrorHandler = (attempt: {
 								  /** Provider used for the attempt. */
 								  provider: string;
 								  /** Model used for the attempt. */
 								  model: string;
 								  /** The value thrown by the attempt. */
 								  error: unknown;
 								  /** Position of this attempt (NOTE(review): 0- vs 1-based is not visible here; confirm at the call site). */
 								  attempt: number;
 								  /** Total count reported alongside `attempt`. */
 								  total: number;
 								}) => Promise<void> | void;
 								/**
 								 * Successful outcome of a fallback run: the value produced by the run
 								 * function together with the provider/model that produced it and the
 								 * attempt records accumulated along the way.
 								 */
 								type ModelFallbackRunResult<T> = {
 								  /** Value returned by the run function. */
 								  result: T;
 								  /** Provider that produced `result`. */
 								  provider: string;
 								  /** Model that produced `result`. */
 								  model: string;
 								  /** Attempt records associated with this run. */
 								  attempts: FallbackAttempt[];
 								};
-												refactor(agents): dedupe model and tool test helpers

											
										
										
											2026-03-02 21:30:12 +00:00
/**
 * Packs a successful run into a `ModelFallbackRunResult`.
 *
 * @param params - The run's result, the provider/model that produced it, and
 *   the attempt records to attach (the array is passed through, not copied).
 * @returns A new object holding exactly those four fields.
 */
function buildFallbackSuccess<T>(params: {
  result: T;
  provider: string;
  model: string;
  attempts: FallbackAttempt[];
}): ModelFallbackRunResult<T> {
  const { result, provider, model, attempts } = params;
  return { result, provider, model, attempts };
}
 								/**
 								 * Executes `run` once for a single provider/model pair and reports the
 								 * outcome as a tagged result instead of throwing.
 								 *
 								 * @param params - The run function, the provider/model to try, and optional
 								 *   run options (forwarded only when present, so `run` is otherwise called
 								 *   with two arguments).
 								 * @returns `{ ok: true, result }` on success, or `{ ok: false, error }` where
 								 *   `error` is the failover-normalized error when one could be derived and
 								 *   the original thrown value otherwise.
 								 * @throws The original error when `shouldRethrowAbort` accepts it and it could
 								 *   not be normalized into a failover error.
 								 */
 								async function runFallbackCandidate<T>(params: {
 								  run: ModelFallbackRunFn<T>;
 								  provider: string;
 								  model: string;
 								  options?: ModelFallbackRunOptions;
 								}): Promise<{ ok: true; result: T } | { ok: false; error: unknown }> {
 								  let result: T;
 								  try {
 								    if (params.options) {
 								      result = await params.run(params.provider, params.model, params.options);
 								    } else {
 								      result = await params.run(params.provider, params.model);
 								    }
 								  } catch (err) {
 								    // Normalize abort-wrapped rate-limit errors (e.g. Google Vertex RESOURCE_EXHAUSTED)
 								    // so they become FailoverErrors and continue the fallback loop instead of aborting.
 								    const failover = coerceToFailoverError(err, {
 								      provider: params.provider,
 								      model: params.model,
 								    });
 								    const isAbort = shouldRethrowAbort(err);
 								    if (isAbort && !failover) {
 								      throw err;
 								    }
 								    return { ok: false, error: failover ?? err };
 								  }
 								  return { ok: true, result };
 								}
 								/**
 								 * Runs one candidate via `runFallbackCandidate` and reshapes the outcome for
 								 * the fallback loop.
 								 *
 								 * @param params - The run function, provider/model to try, the attempt records
 								 *   to attach to a success, and optional run options.
 								 * @returns `{ success }` wrapping a `ModelFallbackRunResult` when the candidate
 								 *   succeeded, otherwise `{ error }` carrying the reported error.
 								 */
 								async function runFallbackAttempt<T>(params: {
 								  run: ModelFallbackRunFn<T>;
 								  provider: string;
 								  model: string;
 								  attempts: FallbackAttempt[];
 								  options?: ModelFallbackRunOptions;
 								}): Promise<{ success: ModelFallbackRunResult<T> } | { error: unknown }> {
 								  const outcome = await runFallbackCandidate({
 								    run: params.run,
 								    provider: params.provider,
 								    model: params.model,
 								    options: params.options,
 								  });
 								  if (!outcome.ok) {
 								    return { error: outcome.error };
 								  }
 								  const success = buildFallbackSuccess({
 								    result: outcome.result,
 								    provider: params.provider,
 								    model: params.model,
 								    attempts: params.attempts,
 								  });
 								  return { success };
 								}
-												fix(auth/session): preserve override reset behavior and repair oauth profile-id drift (openclaw#18820) thanks @Glucksberg

Verified:
- pnpm build
- pnpm check
- pnpm test:macmini

Co-authored-by: Glucksberg <80581902+Glucksberg@users.noreply.github.com>
Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com>

											
										
										
											2026-02-19 23:16:26 -04:00
/**
 * Reports whether two candidates refer to the same provider/model pair,
 * using strict equality on both fields.
 */
function sameModelCandidate(a: ModelCandidate, b: ModelCandidate): boolean {
  if (a.provider !== b.provider) {
    return false;
  }
  return a.model === b.model;
}
-												refactor(agents): share fallback failure summary builder

											
										
										
											2026-02-19 00:10:08 +00:00
/**
 * Always throws: surfaces the failure of a whole fallback run.
 *
 * @param params.attempts - Recorded attempts, formatted into the summary.
 * @param params.candidates - Candidate list; its length is used as the count
 *   when no attempts were recorded.
 * @param params.lastError - Most recent error; rethrown as-is when at most one
 *   attempt was recorded, otherwise attached as `cause` if it is an `Error`.
 * @param params.label - Noun inserted into the "All … failed" message.
 * @param params.formatAttempt - Renders one attempt for the summary.
 * @throws `lastError` itself, or a new `Error` summarizing every attempt.
 */
function throwFallbackFailureSummary(params: {
  attempts: FallbackAttempt[];
  candidates: ModelCandidate[];
  lastError: unknown;
  label: string;
  formatAttempt: (attempt: FallbackAttempt) => string;
}): never {
  const { attempts, candidates, lastError, label, formatAttempt } = params;

  // With zero or one attempt there is nothing to summarize; keep the original error.
  if (attempts.length <= 1 && lastError) {
    throw lastError;
  }

  const summary = attempts.length === 0 ? "unknown" : attempts.map(formatAttempt).join(" | ");
  const count = attempts.length || candidates.length;
  const cause = lastError instanceof Error ? lastError : undefined;
  throw new Error(`All ${label} failed (${count}): ${summary}`, { cause });
}
-												refactor(agents): dedupe model fallback candidate logic

											
										
										
											2026-02-15 06:07:01 +00:00
/**
 * Resolves the ordered list of image-model candidates.
 *
 * The first entry is `modelOverride` when it is non-blank; otherwise the
 * primary value of `agents.defaults.imageModel` (when non-blank). The
 * configured image fallbacks follow. Raw references that cannot be resolved
 * are skipped, and duplicates are dropped by the candidate collector.
 *
 * @param params.cfg - Configuration to read image model settings from.
 * @param params.defaultProvider - Provider used for alias and reference resolution.
 * @param params.modelOverride - Optional raw model reference that replaces the
 *   configured primary image model.
 * @returns Candidates in the order they should be tried.
 */
function resolveImageFallbackCandidates(params: {
  cfg: OpenClawConfig | undefined;
  defaultProvider: string;
  modelOverride?: string;
}): ModelCandidate[] {
  const { cfg, defaultProvider } = params;
  const aliasIndex = buildModelAliasIndex({ cfg: cfg ?? {}, defaultProvider });
  const allowlist = buildConfiguredAllowlistKeys({ cfg, defaultProvider });
  const collector = createModelCandidateCollector(allowlist);

  // Resolves one raw reference and records it; `opts.allowlist` routes the
  // candidate through the allowlist-enforcing path instead of the explicit one.
  const addRaw = (raw: string, opts?: { allowlist?: boolean }): void => {
    const resolved = resolveModelRefFromString({
      raw: String(raw ?? ""),
      defaultProvider,
      aliasIndex,
    });
    if (!resolved) {
      return;
    }
    const add = opts?.allowlist
      ? collector.addAllowlistedCandidate
      : collector.addExplicitCandidate;
    add(resolved.ref);
  };

  const imageModel = cfg?.agents?.defaults?.imageModel;

  // A non-blank override takes the place of the configured primary image model.
  const leading = params.modelOverride?.trim()
    ? params.modelOverride
    : resolveAgentModelPrimaryValue(imageModel);
  if (leading?.trim()) {
    addRaw(leading);
  }

  // Explicitly configured image fallbacks should remain reachable even when a
  // model allowlist is present.
  for (const raw of resolveAgentModelFallbackValues(imageModel)) {
    addRaw(raw);
  }

  return collector.candidates;
}
-												feat: add models scan and fallbacks

											
										
										
											2026-01-04 17:50:55 +01:00
+								function resolveFallbackCandidates(params: {
-												refactor: rename to openclaw

											
										
										
											2026-01-30 03:15:10 +01:00
+								  cfg: OpenClawConfig | undefined;
-												feat: add models scan and fallbacks

											
										
										
											2026-01-04 17:50:55 +01:00
+								  provider: string;
 								  model: string;
-												Config: support per-agent model fallbacks

											
										
										
											2026-01-09 14:59:02 +01:00
+								  /** Optional explicit fallbacks list; when provided (even empty), replaces agents.defaults.model.fallbacks. */
 								  fallbacksOverride?: string[];
-												feat: add models scan and fallbacks

											
										
										
											2026-01-04 17:50:55 +01:00
+								}): ModelCandidate[] {
-												fix: harden Gmail hook model defaults (#472) (thanks @koala73)

											
										
										
											2026-01-09 19:59:45 +01:00
+								  const primary = params.cfg
 								    ? resolveConfiguredModelRef({
 								        cfg: params.cfg,
 								        defaultProvider: DEFAULT_PROVIDER,
 								        defaultModel: DEFAULT_MODEL,
 								      })
 								    : null;
-												fix: default model fallback when provider/model missing (#954) (thanks @roshanasingh4)

											
										
										
											2026-01-15 16:58:41 +00:00
+								  const defaultProvider = primary?.provider ?? DEFAULT_PROVIDER;
 								  const defaultModel = primary?.model ?? DEFAULT_MODEL;
-												Agents: add nested subagent orchestration controls and reduce subagent token waste (#14447)

* Agents: add subagent orchestration controls

* Agents: add subagent orchestration controls (WIP uncommitted changes)

* feat(subagents): add depth-based spawn gating for sub-sub-agents

* feat(subagents): tool policy, registry, and announce chain for nested agents

* feat(subagents): system prompt, docs, changelog for nested sub-agents

* fix(subagents): prevent model fallback override, show model during active runs, and block context overflow fallback

Bug 1: When a session has an explicit model override (e.g., gpt/openai-codex),
the fallback candidate logic in resolveFallbackCandidates silently appended the
global primary model (opus) as a backstop. On reinjection/steer with a transient
error, the session could fall back to opus which has a smaller context window
and crash. Fix: when storedModelOverride is set, pass fallbacksOverride ?? []
instead of undefined, preventing the implicit primary backstop.

Bug 2: Active subagents showed 'model n/a' in /subagents list because
resolveModelDisplay only read entry.model/modelProvider (populated after run
completes). Fix: fall back to modelOverride/providerOverride fields which are
populated at spawn time via sessions.patch.

Bug 3: Context overflow errors (prompt too long, context_length_exceeded) could
theoretically escape runEmbeddedPiAgent and be treated as failover candidates
in runWithModelFallback, causing a switch to a model with a smaller context
window. Fix: in runWithModelFallback, detect context overflow errors via
isLikelyContextOverflowError and rethrow them immediately instead of trying the
next model candidate.

* fix(subagents): track spawn depth in session store and fix announce routing for nested agents

* Fix compaction status tracking and dedupe overflow compaction triggers

* fix(subagents): enforce depth block via session store and implement cascade kill

* fix: inject group chat context into system prompt

* fix(subagents): always write model to session store at spawn time

* Preserve spawnDepth when agent handler rewrites session entry

* fix(subagents): suppress announce on steer-restart

* fix(subagents): fallback spawned session model to runtime default

* fix(subagents): enforce spawn depth when caller key resolves by sessionId

* feat(subagents): implement active-first ordering for numeric targets and enhance task display

- Added a test to verify that subagents with numeric targets follow an active-first list ordering.
- Updated `resolveSubagentTarget` to sort subagent runs based on active status and recent activity.
- Enhanced task display in command responses to prevent truncation of long task descriptions.
- Introduced new utility functions for compacting task text and managing subagent run states.

* fix(subagents): show model for active runs via run record fallback

When the spawned model matches the agent's default model, the session
store's override fields are intentionally cleared (isDefault: true).
The model/modelProvider fields are only populated after the run
completes. This left active subagents showing 'model n/a'.

Fix: store the resolved model on SubagentRunRecord at registration
time, and use it as a fallback in both display paths (subagents tool
and /subagents command) when the session store entry has no model info.

Changes:
- SubagentRunRecord: add optional model field
- registerSubagentRun: accept and persist model param
- sessions-spawn-tool: pass resolvedModel to registerSubagentRun
- subagents-tool: pass run record model as fallback to resolveModelDisplay
- commands-subagents: pass run record model as fallback to resolveModelDisplay

* feat(chat): implement session key resolution and reset on sidebar navigation

- Added functions to resolve the main session key and reset chat state when switching sessions from the sidebar.
- Updated the `renderTab` function to handle session key changes when navigating to the chat tab.
- Introduced a test to verify that the session resets to "main" when opening chat from the sidebar navigation.

* fix: subagent timeout=0 passthrough and fallback prompt duplication

Bug 1: runTimeoutSeconds=0 now means 'no timeout' instead of applying 600s default
- sessions-spawn-tool: default to undefined (not 0) when neither timeout param
  is provided; use != null check so explicit 0 passes through to gateway
- agent.ts: accept 0 as valid timeout (resolveAgentTimeoutMs already handles
  0 → MAX_SAFE_TIMEOUT_MS)

Bug 2: model fallback no longer re-injects the original prompt as a duplicate
- agent.ts: track fallback attempt index; on retries use a short continuation
  message instead of the full original prompt since the session file already
  contains it from the first attempt
- Also skip re-sending images on fallback retries (already in session)

* feat(subagents): truncate long task descriptions in subagents command output

- Introduced a new utility function to format task previews, limiting their length to improve readability.
- Updated the command handler to use the new formatting function, ensuring task descriptions are truncated appropriately.
- Adjusted related tests to verify that long task descriptions are now truncated in the output.

* refactor(subagents): update subagent registry path resolution and improve command output formatting

- Replaced direct import of STATE_DIR with a utility function to resolve the state directory dynamically.
- Enhanced the formatting of command output for active and recent subagents, adding separators for better readability.
- Updated related tests to reflect changes in command output structure.

* fix(subagent): default sessions_spawn to no timeout when runTimeoutSeconds omitted

The previous fix (75a791106) correctly handled the case where
runTimeoutSeconds was explicitly set to 0 ("no timeout"). However,
when models omit the parameter entirely (which is common since the
schema marks it as optional), runTimeoutSeconds resolved to undefined.

undefined flowed through the chain as:
  sessions_spawn → timeout: undefined (since undefined != null is false)
  → gateway agent handler → agentCommand opts.timeout: undefined
  → resolveAgentTimeoutMs({ overrideSeconds: undefined })
  → DEFAULT_AGENT_TIMEOUT_SECONDS (600s = 10 minutes)

This caused subagents to be killed at exactly 10 minutes even though
the user's intent (via TOOLS.md) was for subagents to run without a
timeout.

Fix: default runTimeoutSeconds to 0 (no timeout) when neither
runTimeoutSeconds nor timeoutSeconds is provided by the caller.
Subagent spawns are long-running by design and should not inherit the
600s agent-command default timeout.

* fix(subagent): accept timeout=0 in agent-via-gateway path (second 600s default)

* fix: thread timeout override through getReplyFromConfig dispatch path

getReplyFromConfig called resolveAgentTimeoutMs({ cfg }) with no override,
always falling back to the config default (600s). Add timeoutOverrideSeconds
to GetReplyOptions and pass it through as overrideSeconds so callers of the
dispatch chain can specify a custom timeout (0 = no timeout).

This complements the existing timeout threading in agentCommand and the
cron isolated-agent runner, which already pass overrideSeconds correctly.

* feat(model-fallback): normalize OpenAI Codex model references and enhance fallback handling

- Added normalization for OpenAI Codex model references, specifically converting "gpt-5.3-codex" to "openai-codex" before execution.
- Updated the `resolveFallbackCandidates` function to utilize the new normalization logic.
- Enhanced tests to verify the correct behavior of model normalization and fallback mechanisms.
- Introduced a new test case to ensure that the normalization process works as expected for various input formats.

* feat(tests): add unit tests for steer failure behavior in openclaw-tools

- Introduced a new test file to validate the behavior of subagents when steer replacement dispatch fails.
- Implemented tests to ensure that the announce behavior is restored correctly and that the suppression reason is cleared as expected.
- Enhanced the subagent registry with a new function to clear steer restart suppression.
- Updated related components to support the new test scenarios.

* fix(subagents): replace stop command with kill in slash commands and documentation

- Updated the `/subagents` command to replace `stop` with `kill` for consistency in controlling sub-agent runs.
- Modified related documentation to reflect the change in command usage.
- Removed legacy timeoutSeconds references from the sessions-spawn-tool schema and tests to streamline timeout handling.
- Enhanced tests to ensure correct behavior of the updated commands and their interactions.

* feat(tests): add unit tests for readLatestAssistantReply function

- Introduced a new test file for the `readLatestAssistantReply` function to validate its behavior with various message scenarios.
- Implemented tests to ensure the function correctly retrieves the latest assistant message and handles cases where the latest message has no text.
- Mocked the gateway call to simulate different message histories for comprehensive testing.

* feat(tests): enhance subagent kill-all cascade tests and announce formatting

- Added a new test to verify that the `kill-all` command cascades through ended parents to active descendants in subagents.
- Updated the subagent announce formatting tests to reflect changes in message structure, including the replacement of "Findings:" with "Result:" and the addition of new expectations for message content.
- Improved the handling of long findings and stats in the announce formatting logic to ensure concise output.
- Refactored related functions to enhance clarity and maintainability in the subagent registry and tools.

* refactor(subagent): update announce formatting and remove unused constants

- Modified the subagent announce formatting to replace "Findings:" with "Result:" and adjusted related expectations in tests.
- Removed constants for maximum announce findings characters and summary words, simplifying the announcement logic.
- Updated the handling of findings to retain full content instead of truncating, ensuring more informative outputs.
- Cleaned up unused imports in the commands-subagents file to enhance code clarity.

* feat(tests): enhance billing error handling in user-facing text

- Added tests to ensure that normal text mentioning billing plans is not rewritten, preserving user context.
- Updated the `isBillingErrorMessage` and `sanitizeUserFacingText` functions to improve handling of billing-related messages.
- Introduced new test cases for various scenarios involving billing messages to ensure accurate processing and output.
- Enhanced the subagent announce flow to correctly manage active descendant runs, preventing premature announcements.

* feat(subagent): enhance workflow guidance and auto-announcement clarity

- Added a new guideline in the subagent system prompt to emphasize trust in push-based completion, discouraging busy polling for status updates.
- Updated documentation to clarify that sub-agents will automatically announce their results, improving user understanding of the workflow.
- Enhanced tests to verify the new guidance on avoiding polling loops and to ensure the accuracy of the updated prompts.

* fix(cron): avoid announcing interim subagent spawn acks

* chore: clean post-rebase imports

* fix(cron): fall back to child replies when parent stays interim

* fix(subagents): make active-run guidance advisory

* fix(subagents): update announce flow to handle active descendants and enhance test coverage

- Modified the announce flow to defer announcements when active descendant runs are present, ensuring accurate status reporting.
- Updated tests to verify the new behavior, including scenarios where no fallback requester is available and ensuring proper handling of finished subagents.
- Enhanced the announce formatting to include an `expectFinal` flag for better clarity in the announcement process.

* fix(subagents): enhance announce flow and formatting for user updates

- Updated the announce flow to provide clearer instructions for user updates based on active subagent runs and requester context.
- Refactored the announcement logic to improve clarity and ensure internal context remains private.
- Enhanced tests to verify the new message expectations and formatting, including updated prompts for user-facing updates.
- Introduced a new function to build reply instructions based on session context, improving the overall announcement process.

* fix: resolve prep blockers and changelog placement (#14447) (thanks @tyler6204)

* fix: restore cron delivery-plan import after rebase (#14447) (thanks @tyler6204)

* fix: resolve test failures from rebase conflicts (#14447) (thanks @tyler6204)

* fix: apply formatting after rebase (#14447) (thanks @tyler6204)
											
										
										
											2026-02-14 22:03:45 -08:00
+								  const providerRaw = String(params.provider ?? "").trim() || defaultProvider;
 								  const modelRaw = String(params.model ?? "").trim() || defaultModel;
 								  const normalizedPrimary = normalizeModelRef(providerRaw, modelRaw);
-												fix(auth/session): preserve override reset behavior and repair oauth profile-id drift (openclaw#18820) thanks @Glucksberg

Verified:
- pnpm build
- pnpm check
- pnpm test:macmini

Co-authored-by: Glucksberg <80581902+Glucksberg@users.noreply.github.com>
Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com>

											
										
										
											2026-02-19 23:16:26 -04:00
+								  const configuredPrimary = normalizeModelRef(defaultProvider, defaultModel);
-												feat: add models scan and fallbacks

											
										
										
											2026-01-04 17:50:55 +01:00
+								  const aliasIndex = buildModelAliasIndex({
 								    cfg: params.cfg ?? {},
-												fix: default model fallback when provider/model missing (#954) (thanks @roshanasingh4)

											
										
										
											2026-01-15 16:58:41 +00:00
+								    defaultProvider,
-												feat: add models scan and fallbacks

											
										
										
											2026-01-04 17:50:55 +01:00
+								  });
-												chore: apply local workspace updates (#9911)

* chore: apply local workspace updates

* fix: resolve prep findings after rebase (#9898) (thanks @gumadeiras)

* refactor: centralize model allowlist normalization (#9898) (thanks @gumadeiras)

* fix: guard model allowlist initialization (#9911)

* docs: update changelog scope for #9911

* docs: remove model names from changelog entry (#9911)

* fix: satisfy type-aware lint in model allowlist (#9911)
											
										
										
											2026-02-05 16:54:44 -05:00
+								  const allowlist = buildConfiguredAllowlistKeys({
 								    cfg: params.cfg,
 								    defaultProvider,
 								  });
-												refactor(agents): centralize model fallback resolution

											
										
										
											2026-02-25 04:32:25 +00:00
+								  const { candidates, addExplicitCandidate } = createModelCandidateCollector(allowlist);
-												feat: add models scan and fallbacks

											
										
										
											2026-01-04 17:50:55 +01:00
-												refactor(agents): centralize model fallback resolution

											
										
										
											2026-02-25 04:32:25 +00:00
+								  addExplicitCandidate(normalizedPrimary);
-												feat: add models scan and fallbacks

											
										
										
											2026-01-04 17:50:55 +01:00
-												feat!: redesign model config + auth profiles

											
										
										
											2026-01-06 00:56:29 +00:00
+								  const modelFallbacks = (() => {
-												chore: Enable "curly" rule to avoid single-statement if confusion/errors.

											
										
										
											2026-01-31 16:19:20 +09:00
+								    if (params.fallbacksOverride !== undefined) {
 								      return params.fallbacksOverride;
 								    }
-												fix(agents): keep fallback chain reachable on configured fallback models (#25922)

											
										
										
											2026-02-25 01:46:20 +00:00
+								    const configuredFallbacks = resolveAgentModelFallbackValues(
 								      params.cfg?.agents?.defaults?.model,
 								    );
-												fix(agents): comprehensive quota fallback fixes - session overrides + surgical cooldown logic (#23816)

Merged via /review-pr -> /prepare-pr -> /merge-pr.

Prepared head SHA: e6f2b4742b82b9fe44a7e103170c2f96565b09c5
Co-authored-by: ramezgaberiel <844893+ramezgaberiel@users.noreply.github.com>
Co-authored-by: gumadeiras <5599352+gumadeiras@users.noreply.github.com>
Reviewed-by: @gumadeiras

											
										
										
											2026-02-25 19:35:40 -06:00
+								    // When user runs a different provider than config, only use configured fallbacks
 								    // if the current model is already in that chain (e.g. session on first fallback).
 								    if (normalizedPrimary.provider !== configuredPrimary.provider) {
 								      const isConfiguredFallback = configuredFallbacks.some((raw) => {
 								        const resolved = resolveModelRefFromString({
 								          raw: String(raw ?? ""),
 								          defaultProvider,
 								          aliasIndex,
 								        });
 								        return resolved ? sameModelCandidate(resolved.ref, normalizedPrimary) : false;
-												fix(agents): keep fallback chain reachable on configured fallback models (#25922)

											
										
										
											2026-02-25 01:46:20 +00:00
+								      });
-												fix(agents): comprehensive quota fallback fixes - session overrides + surgical cooldown logic (#23816)

Merged via /review-pr -> /prepare-pr -> /merge-pr.

Prepared head SHA: e6f2b4742b82b9fe44a7e103170c2f96565b09c5
Co-authored-by: ramezgaberiel <844893+ramezgaberiel@users.noreply.github.com>
Co-authored-by: gumadeiras <5599352+gumadeiras@users.noreply.github.com>
Reviewed-by: @gumadeiras

											
										
										
											2026-02-25 19:35:40 -06:00
+								      return isConfiguredFallback ? configuredFallbacks : [];
 								    }
 								    // Same provider: always use full fallback chain (model version differences within provider).
 								    return configuredFallbacks;
-												feat!: redesign model config + auth profiles

											
										
										
											2026-01-06 00:56:29 +00:00
+								  })();
 								  for (const raw of modelFallbacks) {
-												feat: add models scan and fallbacks

											
										
										
											2026-01-04 17:50:55 +01:00
+								    const resolved = resolveModelRefFromString({
 								      raw: String(raw ?? ""),
-												fix: default model fallback when provider/model missing (#954) (thanks @roshanasingh4)

											
										
										
											2026-01-15 16:58:41 +00:00
+								      defaultProvider,
-												feat: add models scan and fallbacks

											
										
										
											2026-01-04 17:50:55 +01:00
+								      aliasIndex,
 								    });
-												chore: Enable "curly" rule to avoid single-statement if confusion/errors.

											
										
										
											2026-01-31 16:19:20 +09:00
+								    if (!resolved) {
 								      continue;
 								    }
-												fix(agents): harden model fallback failover paths

											
										
										
											2026-02-25 03:46:34 +00:00
+								    // Fallbacks are explicit user intent; do not silently filter them by the
 								    // model allowlist.
-												refactor(agents): centralize model fallback resolution

											
										
										
											2026-02-25 04:32:25 +00:00
+								    addExplicitCandidate(resolved.ref);
-												feat: add models scan and fallbacks

											
										
										
											2026-01-04 17:50:55 +01:00
+								  }
-												chore: migrate to oxlint and oxfmt

Co-authored-by: Christoph Nakazawa <christoph.pojer@gmail.com>

											
										
										
											2026-01-14 14:31:43 +00:00
+								  if (params.fallbacksOverride === undefined && primary?.provider && primary.model) {
-												refactor(agents): centralize model fallback resolution

											
										
										
											2026-02-25 04:32:25 +00:00
+								    addExplicitCandidate({ provider: primary.provider, model: primary.model });
-												fix: harden Gmail hook model defaults (#472) (thanks @koala73)

											
										
										
											2026-01-09 19:59:45 +01:00
+								  }
-												feat: add models scan and fallbacks

											
										
										
											2026-01-04 17:50:55 +01:00
+								  return candidates;
 								}
-												fix: auto-recover primary model after rate-limit cooldown expires (#17478) (#18045)

Merged via /review-pr -> /prepare-pr -> /merge-pr.

Prepared head SHA: f7a7865727a9aee0aaa3d929cce65dc46c3db234
Co-authored-by: PlayerGhost <28265945+PlayerGhost@users.noreply.github.com>
Co-authored-by: sebslight <19554889+sebslight@users.noreply.github.com>
Reviewed-by: @sebslight

											
										
										
											2026-02-16 10:03:35 -03:00
+								const lastProbeAttempt = new Map<string, number>(); // throttle key -> `now` value (ms) recorded by the most recent markProbeAttempt call
-												refactor(agents): extract cooldown probe decision helper

											
										
										
											2026-02-16 08:07:14 -05:00
+								const MIN_PROBE_INTERVAL_MS = 30_000; // 30 seconds between probes per key
 								const PROBE_MARGIN_MS = 2 * 60 * 1000; // 2 minutes: a primary probe is allowed once `now` is within this margin of the soonest cooldown expiry
 								const PROBE_SCOPE_DELIMITER = "::"; // joins the agentDir scope and the provider inside a throttle key
-												fix(agents): probe single-provider billing cooldowns (#41422)

Merged via squash.

Prepared head SHA: bbc4254b94559f95c34e11734a679cbe852aba52
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Reviewed-by: @altaywtf
											
										
										
											2026-03-10 00:58:51 +03:00
+								const PROBE_STATE_TTL_MS = 24 * 60 * 60 * 1000; // 24 hours: probe timestamps older than this are pruned
 								const MAX_PROBE_KEYS = 256; // cap on tracked throttle keys; entries with the smallest timestamp are evicted beyond it
-												refactor(agents): extract cooldown probe decision helper

											
										
										
											2026-02-16 08:07:14 -05:00
 								/**
 								 * Builds the key under which probe attempts for a provider are throttled.
 								 *
 								 * @param provider - Provider identifier; used verbatim as the key when no scope applies.
 								 * @param agentDir - Optional scope. It is stringified and trimmed; a blank result means "unscoped".
 								 * @returns `<scope><PROBE_SCOPE_DELIMITER><provider>` when a scope is present, otherwise `provider`.
 								 */
 								function resolveProbeThrottleKey(provider: string, agentDir?: string): string {
 								  const trimmedScope = String(agentDir ?? "").trim();
 								  if (trimmedScope === "") {
 								    // No usable scope: throttle on the provider alone.
 								    return provider;
 								  }
 								  return `${trimmedScope}${PROBE_SCOPE_DELIMITER}${provider}`;
 								}
-												fix(agents): probe single-provider billing cooldowns (#41422)

Merged via squash.

Prepared head SHA: bbc4254b94559f95c34e11734a679cbe852aba52
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Reviewed-by: @altaywtf
											
										
										
											2026-03-10 00:58:51 +03:00
/**
 * Removes probe-throttle entries that are invalid or expired.
 *
 * An entry is discarded when its timestamp is not a finite number, is zero or
 * negative, or lies more than `PROBE_STATE_TTL_MS` before `now`.
 *
 * @param now - Reference time in milliseconds against which entry age is measured.
 */
function pruneProbeState(now: number): void {
  const isDiscardable = (probedAt: number): boolean =>
    !Number.isFinite(probedAt) || probedAt <= 0 || now - probedAt > PROBE_STATE_TTL_MS;

  // Collect first, delete afterwards, so the map is not modified mid-scan.
  const staleKeys: string[] = [];
  lastProbeAttempt.forEach((probedAt, throttleKey) => {
    if (isDiscardable(probedAt)) {
      staleKeys.push(throttleKey);
    }
  });
  for (const throttleKey of staleKeys) {
    lastProbeAttempt.delete(throttleKey);
  }
}
 								/**
 								 * Keeps `lastProbeAttempt` at or below `MAX_PROBE_KEYS` entries by repeatedly
 								 * evicting the entry with the smallest timestamp (first one wins on ties).
 								 *
 								 * "Nothing selected yet" is tracked with `undefined` instead of truthiness, so an
 								 * empty-string key can still be chosen for eviction. The scan is seeded with the
 								 * first entry, which guarantees a victim even when timestamps are not comparable
 								 * (NaN / Infinity); without that the loop would exit with the map still over the cap.
 								 */
 								function enforceProbeStateCap(): void {
 								  while (lastProbeAttempt.size > MAX_PROBE_KEYS) {
 								    let oldestKey: string | undefined;
 								    let oldestTs = Number.POSITIVE_INFINITY;
 								    for (const [key, ts] of lastProbeAttempt) {
 								      // Seed with the first entry, then keep the strictly smaller timestamp.
 								      if (oldestKey === undefined || ts < oldestTs) {
 								        oldestKey = key;
 								        oldestTs = ts;
 								      }
 								    }
 								    if (oldestKey === undefined) {
 								      // Only reachable when the map is empty (e.g. a negative cap); nothing to evict.
 								      break;
 								    }
 								    lastProbeAttempt.delete(oldestKey);
 								  }
 								}
 								/**
 								 * Reports whether enough time has passed since the last recorded probe for
 								 * `throttleKey` to allow another one.
 								 *
 								 * Stale probe state is pruned first. A key with no recorded probe is treated
 								 * as last probed at time 0.
 								 *
 								 * @param now - Current time in milliseconds.
 								 * @param throttleKey - Key whose last probe timestamp is consulted.
 								 * @returns `true` when at least `MIN_PROBE_INTERVAL_MS` has elapsed.
 								 */
 								function isProbeThrottleOpen(now: number, throttleKey: string): boolean {
 								  pruneProbeState(now);
 								  const elapsedSinceProbeMs = now - (lastProbeAttempt.get(throttleKey) ?? 0);
 								  return elapsedSinceProbeMs >= MIN_PROBE_INTERVAL_MS;
 								}
 								/**
 								 * Records that a probe was attempted for `throttleKey` at time `now`.
 								 *
 								 * @param now - Timestamp (ms) to store as the key's most recent probe.
 								 * @param throttleKey - Key whose probe timestamp is updated.
 								 */
 								function markProbeAttempt(now: number, throttleKey: string): void {
 								  // Drop invalid/expired entries before adding the new one.
 								  pruneProbeState(now);
 								  lastProbeAttempt.set(throttleKey, now);
 								  // The insert may have pushed the map past MAX_PROBE_KEYS; evict if so.
 								  enforceProbeStateCap();
 								}
-												refactor(agents): extract cooldown probe decision helper

											
										
										
											2026-02-16 08:07:14 -05:00
/**
 * Decides whether the primary candidate may be probed while all of its
 * provider's profiles are in cooldown.
 *
 * A probe is considered only for the primary candidate, only when fallback
 * candidates exist, and only when the per-key throttle is open. Given those,
 * it is allowed when no finite cooldown expiry is known, or when `now` is at
 * or past the soonest expiry minus `PROBE_MARGIN_MS`.
 *
 * @returns `true` when the primary should be probed now.
 */
function shouldProbePrimaryDuringCooldown(params: {
  isPrimary: boolean;
  hasFallbackCandidates: boolean;
  now: number;
  throttleKey: string;
  authStore: ReturnType<typeof ensureAuthProfileStore>;
  profileIds: string[];
}): boolean {
  const { isPrimary, hasFallbackCandidates, now, throttleKey, authStore, profileIds } = params;

  // Short-circuit order matters: the throttle check (which prunes probe state)
  // only runs for a primary candidate that has fallbacks.
  const probeEligible =
    isPrimary && hasFallbackCandidates && isProbeThrottleOpen(now, throttleKey);
  if (!probeEligible) {
    return false;
  }

  const cooldownEndsAt = getSoonestCooldownExpiry(authStore, profileIds);
  if (cooldownEndsAt === null || !Number.isFinite(cooldownEndsAt)) {
    // No usable expiry to wait for, so probing is allowed.
    return true;
  }
  // Probe when cooldown already expired or within the configured margin.
  return now >= cooldownEndsAt - PROBE_MARGIN_MS;
}
-												fix: auto-recover primary model after rate-limit cooldown expires (#17478) (#18045)

Merged via /review-pr -> /prepare-pr -> /merge-pr.

Prepared head SHA: f7a7865727a9aee0aaa3d929cce65dc46c3db234
Co-authored-by: PlayerGhost <28265945+PlayerGhost@users.noreply.github.com>
Co-authored-by: sebslight <19554889+sebslight@users.noreply.github.com>
Reviewed-by: @sebslight

											
										
										
											2026-02-16 10:03:35 -03:00
 								/**
 								 * @internal Probe-throttle state, tuning constants, and helpers bundled into a
 								 * single object; exposed for unit tests only.
 								 */
 								export const _probeThrottleInternals = {
 								  // Mutable state
 								  lastProbeAttempt,
 								  // Tuning constants
 								  MIN_PROBE_INTERVAL_MS,
 								  PROBE_MARGIN_MS,
 								  PROBE_STATE_TTL_MS,
 								  MAX_PROBE_KEYS,
 								  // Helpers
 								  resolveProbeThrottleKey,
 								  isProbeThrottleOpen,
 								  pruneProbeState,
 								  markProbeAttempt,
 								} as const;
-												fix(agents): comprehensive quota fallback fixes - session overrides + surgical cooldown logic (#23816)

Merged via /review-pr -> /prepare-pr -> /merge-pr.

Prepared head SHA: e6f2b4742b82b9fe44a7e103170c2f96565b09c5
Co-authored-by: ramezgaberiel <844893+ramezgaberiel@users.noreply.github.com>
Co-authored-by: gumadeiras <5599352+gumadeiras@users.noreply.github.com>
Reviewed-by: @gumadeiras

											
										
										
											2026-02-25 19:35:40 -06:00
+								type CooldownDecision =
 								  // Outcome of resolveCooldownDecision for a candidate whose provider is in cooldown.
 								  // "skip": do not run the candidate; `error` is the message recorded on the attempt and in the log.
 								  | {
 								      type: "skip";
 								      reason: FailoverReason;
 								      error: string;
 								    }
 								  // "attempt": run the candidate despite the cooldown.
 								  // NOTE(review): `markProbe` presumably asks the caller to record the attempt in the
 								  // probe throttle; the consuming code is outside this view, so confirm there.
 								  | {
 								      type: "attempt";
 								      reason: FailoverReason;
 								      markProbe: boolean;
 								    };
 								/**
 								 * Decides whether a candidate whose provider has no available auth profile
 								 * should be skipped or attempted anyway.
 								 *
 								 * Rules, in order:
 								 * - "auth" / "auth_permanent": always skip.
 								 * - "billing": attempt (and mark the probe) only for the primary candidate, either
 								 *   when the regular primary probe is allowed or, with no fallbacks configured,
 								 *   when the probe throttle is open; otherwise skip.
 								 * - anything else: the primary is attempted when it is not the literally requested
 								 *   model or when the primary probe is allowed; a non-primary candidate is attempted
 								 *   only for "rate_limit", "overloaded" or "unknown".
 								 *
 								 * The unavailable reason defaults to "unknown" when none can be resolved.
 								 *
 								 * @returns A "skip" decision with an error message, or an "attempt" decision.
 								 */
 								function resolveCooldownDecision(params: {
 								  candidate: ModelCandidate;
 								  isPrimary: boolean;
 								  requestedModel: boolean;
 								  hasFallbackCandidates: boolean;
 								  now: number;
 								  probeThrottleKey: string;
 								  authStore: ReturnType<typeof ensureAuthProfileStore>;
 								  profileIds: string[];
 								}): CooldownDecision {
 								  const {
 								    candidate,
 								    isPrimary,
 								    requestedModel,
 								    hasFallbackCandidates,
 								    now,
 								    probeThrottleKey,
 								    authStore,
 								    profileIds,
 								  } = params;

 								  // Evaluated first and unconditionally: the probe check prunes throttle state.
 								  const primaryProbeAllowed = shouldProbePrimaryDuringCooldown({
 								    isPrimary,
 								    hasFallbackCandidates,
 								    now,
 								    throttleKey: probeThrottleKey,
 								    authStore,
 								    profileIds,
 								  });
 								  const reason =
 								    resolveProfilesUnavailableReason({ store: authStore, profileIds, now }) ?? "unknown";

 								  const skipWholeProvider = (): CooldownDecision => ({
 								    type: "skip",
 								    reason,
 								    error: `Provider ${candidate.provider} has ${reason} issue (skipping all models)`,
 								  });

 								  switch (reason) {
 								    case "auth":
 								    case "auth_permanent":
 								      // Persistent auth problems: no model on this provider is worth trying.
 								      return skipWholeProvider();
 								    case "billing": {
 								      // Billing is semi-persistent: the user may fix their balance, or a transient
 								      // 402 might have been misclassified. Probe single-provider setups on the
 								      // standard throttle so they can recover without a restart; when fallbacks
 								      // exist, only probe near cooldown expiry so the fallback chain stays preferred.
 								      const singleProviderBillingProbe =
 								        isPrimary && !hasFallbackCandidates && isProbeThrottleOpen(now, probeThrottleKey);
 								      if (isPrimary && (primaryProbeAllowed || singleProviderBillingProbe)) {
 								        return { type: "attempt", reason, markProbe: true };
 								      }
 								      return skipWholeProvider();
 								    }
 								    default:
 								      break;
 								  }

 								  // For same-provider fallbacks: only relax cooldown on transient provider
 								  // limits, which are often model-scoped and can recover on a sibling model.
 								  const isTransientReason =
 								    reason === "rate_limit" || reason === "overloaded" || reason === "unknown";
 								  const attemptDespiteCooldown = isPrimary
 								    ? !requestedModel || primaryProbeAllowed
 								    : isTransientReason;

 								  if (attemptDespiteCooldown) {
 								    return {
 								      type: "attempt",
 								      reason,
 								      markProbe: isPrimary && primaryProbeAllowed,
 								    };
 								  }
 								  return {
 								    type: "skip",
 								    reason,
 								    error: `Provider ${candidate.provider} is in cooldown (all profiles unavailable)`,
 								  };
 								}
-												feat: add models scan and fallbacks

											
										
										
											2026-01-04 17:50:55 +01:00
+								export async function runWithModelFallback<T>(params: {
-												refactor: rename to openclaw

											
										
										
											2026-01-30 03:15:10 +01:00
+								  cfg: OpenClawConfig | undefined;
-												feat: add models scan and fallbacks

											
										
										
											2026-01-04 17:50:55 +01:00
+								  provider: string;
 								  model: string;
-												Agents: add fallback error observations (#41337)

Merged via squash.

Prepared head SHA: 852469c82ff28fb0e1be7f1019f5283e712c4283
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Reviewed-by: @altaywtf
											
										
										
											2026-03-10 01:12:10 +03:00
+								  runId?: string;
-												Agents: finish cooldowned provider skip (#2534)

* Agents: skip cooldowned providers in fallback

* fix: skip cooldowned providers during model failover (#2143) (thanks @YiWang24)
											
										
										
											2026-01-26 22:05:31 -05:00
+								  agentDir?: string;
-												Config: support per-agent model fallbacks

											
										
										
											2026-01-09 14:59:02 +01:00
+								  /** Optional explicit fallbacks list; when provided (even empty), replaces agents.defaults.model.fallbacks. */
 								  fallbacksOverride?: string[];
-												fix(agents): honor explicit rate-limit cooldown probes in fallback runs

											
										
										
											2026-03-05 20:02:36 -08:00
+								  run: ModelFallbackRunFn<T>;
-												refactor(agents): dedupe model fallback candidate logic

											
										
										
											2026-02-15 06:07:01 +00:00
+								  onError?: ModelFallbackErrorHandler;
 								}): Promise<ModelFallbackRunResult<T>> {
-												Config: support per-agent model fallbacks

											
										
										
											2026-01-09 14:59:02 +01:00
+								  const candidates = resolveFallbackCandidates({
 								    cfg: params.cfg,
 								    provider: params.provider,
 								    model: params.model,
 								    fallbacksOverride: params.fallbacksOverride,
 								  });
-												Agents: finish cooldowned provider skip (#2534)

* Agents: skip cooldowned providers in fallback

* fix: skip cooldowned providers during model failover (#2143) (thanks @YiWang24)
											
										
										
											2026-01-26 22:05:31 -05:00
+								  const authStore = params.cfg
 								    ? ensureAuthProfileStore(params.agentDir, { allowKeychainPrompt: false })
 								    : null;
-												feat: add models scan and fallbacks

											
										
										
											2026-01-04 17:50:55 +01:00
+								  const attempts: FallbackAttempt[] = [];
 								  let lastError: unknown;
-												fix(agents): avoid duplicate same-provider cooldown probes in fallback runs (#41711)

Merged via squash.

Prepared head SHA: 8be8967bcb4be81f6abc5ff078644ec4efcfe7a0
Co-authored-by: cgdusek <38732970+cgdusek@users.noreply.github.com>
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Reviewed-by: @altaywtf
											
										
										
											2026-03-10 07:26:47 -05:00
+								  const cooldownProbeUsedProviders = new Set<string>();
-												feat: add models scan and fallbacks

											
										
										
											2026-01-04 17:50:55 +01:00
-												fix: auto-recover primary model after rate-limit cooldown expires (#17478) (#18045)

Merged via /review-pr -> /prepare-pr -> /merge-pr.

Prepared head SHA: f7a7865727a9aee0aaa3d929cce65dc46c3db234
Co-authored-by: PlayerGhost <28265945+PlayerGhost@users.noreply.github.com>
Co-authored-by: sebslight <19554889+sebslight@users.noreply.github.com>
Reviewed-by: @sebslight

											
										
										
											2026-02-16 10:03:35 -03:00
+								  const hasFallbackCandidates = candidates.length > 1;
-												feat: add models scan and fallbacks

											
										
										
											2026-01-04 17:50:55 +01:00
+								  for (let i = 0; i < candidates.length; i += 1) {
-												chore: Enable more lint rules, disable some that trigger a lot. Will clean up later.

											
										
										
											2026-01-31 16:03:28 +09:00
+								    const candidate = candidates[i];
-												Agents: add fallback error observations (#41337)

Merged via squash.

Prepared head SHA: 852469c82ff28fb0e1be7f1019f5283e712c4283
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Reviewed-by: @altaywtf
											
										
										
											2026-03-10 01:12:10 +03:00
+								    const isPrimary = i === 0;
 								    const requestedModel =
 								      params.provider === candidate.provider && params.model === candidate.model;
-												fix(agents): honor explicit rate-limit cooldown probes in fallback runs

											
										
										
											2026-03-05 20:02:36 -08:00
+								    let runOptions: ModelFallbackRunOptions | undefined;
-												Agents: add fallback error observations (#41337)

Merged via squash.

Prepared head SHA: 852469c82ff28fb0e1be7f1019f5283e712c4283
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Reviewed-by: @altaywtf
											
										
										
											2026-03-10 01:12:10 +03:00
+								    let attemptedDuringCooldown = false;
-												fix(agents): avoid duplicate same-provider cooldown probes in fallback runs (#41711)

Merged via squash.

Prepared head SHA: 8be8967bcb4be81f6abc5ff078644ec4efcfe7a0
Co-authored-by: cgdusek <38732970+cgdusek@users.noreply.github.com>
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Reviewed-by: @altaywtf
											
										
										
											2026-03-10 07:26:47 -05:00
+								    let transientProbeProviderForAttempt: string | null = null;
-												Skip cooldowned providers during model failover (#2143)

* feat(agents): skip cooldowned providers during failover

When all auth profiles for a provider are in cooldown, the failover
mechanism now skips that provider immediately rather than attempting
and waiting for the cooldown error. This prevents long delays when
multiple OAuth providers fail in sequence.

* fix(agents): correct imports and API usage for cooldown check
											
										
										
											2026-01-26 21:59:38 -05:00
+								    if (authStore) {
 								      const profileIds = resolveAuthProfileOrder({
 								        cfg: params.cfg,
 								        store: authStore,
 								        provider: candidate.provider,
 								      });
 								      const isAnyProfileAvailable = profileIds.some((id) => !isProfileInCooldown(authStore, id));
 								      if (profileIds.length > 0 && !isAnyProfileAvailable) {
-												fix: auto-recover primary model after rate-limit cooldown expires (#17478) (#18045)

Merged via /review-pr -> /prepare-pr -> /merge-pr.

Prepared head SHA: f7a7865727a9aee0aaa3d929cce65dc46c3db234
Co-authored-by: PlayerGhost <28265945+PlayerGhost@users.noreply.github.com>
Co-authored-by: sebslight <19554889+sebslight@users.noreply.github.com>
Reviewed-by: @sebslight

											
										
										
											2026-02-16 10:03:35 -03:00
+								        // All profiles for this provider are in cooldown.
-												refactor(agents): extract cooldown probe decision helper

											
										
										
											2026-02-16 08:07:14 -05:00
+								        const now = Date.now();
 								        const probeThrottleKey = resolveProbeThrottleKey(candidate.provider, params.agentDir);
-												fix(agents): comprehensive quota fallback fixes - session overrides + surgical cooldown logic (#23816)

Merged via /review-pr -> /prepare-pr -> /merge-pr.

Prepared head SHA: e6f2b4742b82b9fe44a7e103170c2f96565b09c5
Co-authored-by: ramezgaberiel <844893+ramezgaberiel@users.noreply.github.com>
Co-authored-by: gumadeiras <5599352+gumadeiras@users.noreply.github.com>
Reviewed-by: @gumadeiras

											
										
										
											2026-02-25 19:35:40 -06:00
+								        const decision = resolveCooldownDecision({
 								          candidate,
 								          isPrimary,
 								          requestedModel,
-												refactor(agents): extract cooldown probe decision helper

											
										
										
											2026-02-16 08:07:14 -05:00
+								          hasFallbackCandidates,
 								          now,
-												fix(agents): comprehensive quota fallback fixes - session overrides + surgical cooldown logic (#23816)

Merged via /review-pr -> /prepare-pr -> /merge-pr.

Prepared head SHA: e6f2b4742b82b9fe44a7e103170c2f96565b09c5
Co-authored-by: ramezgaberiel <844893+ramezgaberiel@users.noreply.github.com>
Co-authored-by: gumadeiras <5599352+gumadeiras@users.noreply.github.com>
Reviewed-by: @gumadeiras

											
										
										
											2026-02-25 19:35:40 -06:00
+								          probeThrottleKey,
-												refactor(agents): extract cooldown probe decision helper

											
										
										
											2026-02-16 08:07:14 -05:00
+								          authStore,
 								          profileIds,
 								        });
-												fix(agents): comprehensive quota fallback fixes - session overrides + surgical cooldown logic (#23816)

Merged via /review-pr -> /prepare-pr -> /merge-pr.

Prepared head SHA: e6f2b4742b82b9fe44a7e103170c2f96565b09c5
Co-authored-by: ramezgaberiel <844893+ramezgaberiel@users.noreply.github.com>
Co-authored-by: gumadeiras <5599352+gumadeiras@users.noreply.github.com>
Reviewed-by: @gumadeiras

											
										
										
											2026-02-25 19:35:40 -06:00
 								        if (decision.type === "skip") {
-												fix: auto-recover primary model after rate-limit cooldown expires (#17478) (#18045)

Merged via /review-pr -> /prepare-pr -> /merge-pr.

Prepared head SHA: f7a7865727a9aee0aaa3d929cce65dc46c3db234
Co-authored-by: PlayerGhost <28265945+PlayerGhost@users.noreply.github.com>
Co-authored-by: sebslight <19554889+sebslight@users.noreply.github.com>
Reviewed-by: @sebslight

											
										
										
											2026-02-16 10:03:35 -03:00
+								          attempts.push({
 								            provider: candidate.provider,
 								            model: candidate.model,
-												fix(agents): comprehensive quota fallback fixes - session overrides + surgical cooldown logic (#23816)

Merged via /review-pr -> /prepare-pr -> /merge-pr.

Prepared head SHA: e6f2b4742b82b9fe44a7e103170c2f96565b09c5
Co-authored-by: ramezgaberiel <844893+ramezgaberiel@users.noreply.github.com>
Co-authored-by: gumadeiras <5599352+gumadeiras@users.noreply.github.com>
Reviewed-by: @gumadeiras

											
										
										
											2026-02-25 19:35:40 -06:00
+								            error: decision.error,
 								            reason: decision.reason,
-												fix: auto-recover primary model after rate-limit cooldown expires (#17478) (#18045)

Merged via /review-pr -> /prepare-pr -> /merge-pr.

Prepared head SHA: f7a7865727a9aee0aaa3d929cce65dc46c3db234
Co-authored-by: PlayerGhost <28265945+PlayerGhost@users.noreply.github.com>
Co-authored-by: sebslight <19554889+sebslight@users.noreply.github.com>
Reviewed-by: @sebslight

											
										
										
											2026-02-16 10:03:35 -03:00
+								          });
-												Agents: add fallback error observations (#41337)

Merged via squash.

Prepared head SHA: 852469c82ff28fb0e1be7f1019f5283e712c4283
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Reviewed-by: @altaywtf
											
										
										
											2026-03-10 01:12:10 +03:00
+								          logModelFallbackDecision({
 								            decision: "skip_candidate",
 								            runId: params.runId,
 								            requestedProvider: params.provider,
 								            requestedModel: params.model,
 								            candidate,
 								            attempt: i + 1,
 								            total: candidates.length,
 								            reason: decision.reason,
 								            error: decision.error,
 								            nextCandidate: candidates[i + 1],
 								            isPrimary,
 								            requestedModelMatched: requestedModel,
 								            fallbackConfigured: hasFallbackCandidates,
 								            profileCount: profileIds.length,
 								          });
-												fix: auto-recover primary model after rate-limit cooldown expires (#17478) (#18045)

Merged via /review-pr -> /prepare-pr -> /merge-pr.

Prepared head SHA: f7a7865727a9aee0aaa3d929cce65dc46c3db234
Co-authored-by: PlayerGhost <28265945+PlayerGhost@users.noreply.github.com>
Co-authored-by: sebslight <19554889+sebslight@users.noreply.github.com>
Reviewed-by: @sebslight

											
										
										
											2026-02-16 10:03:35 -03:00
+								          continue;
 								        }
-												fix(agents): comprehensive quota fallback fixes - session overrides + surgical cooldown logic (#23816)

Merged via /review-pr -> /prepare-pr -> /merge-pr.

Prepared head SHA: e6f2b4742b82b9fe44a7e103170c2f96565b09c5
Co-authored-by: ramezgaberiel <844893+ramezgaberiel@users.noreply.github.com>
Co-authored-by: gumadeiras <5599352+gumadeiras@users.noreply.github.com>
Reviewed-by: @gumadeiras

											
										
										
											2026-02-25 19:35:40 -06:00
 								        if (decision.markProbe) {
-												fix(agents): probe single-provider billing cooldowns (#41422)

Merged via squash.

Prepared head SHA: bbc4254b94559f95c34e11734a679cbe852aba52
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Reviewed-by: @altaywtf
											
										
										
											2026-03-10 00:58:51 +03:00
+								          markProbeAttempt(now, probeThrottleKey);
-												fix(agents): comprehensive quota fallback fixes - session overrides + surgical cooldown logic (#23816)

Merged via /review-pr -> /prepare-pr -> /merge-pr.

Prepared head SHA: e6f2b4742b82b9fe44a7e103170c2f96565b09c5
Co-authored-by: ramezgaberiel <844893+ramezgaberiel@users.noreply.github.com>
Co-authored-by: gumadeiras <5599352+gumadeiras@users.noreply.github.com>
Reviewed-by: @gumadeiras

											
										
										
											2026-02-25 19:35:40 -06:00
+								        }
-												fix(agents): broaden 402 temporary-limit detection and allow billing cooldown probe (#38533)

Merged via squash.

Prepared head SHA: 282b9186c6f48fcdbf0c81c49f739e5e9ed2df23
Co-authored-by: xialonglee <22994703+xialonglee@users.noreply.github.com>
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Reviewed-by: @altaywtf
											
										
										
											2026-03-08 01:27:01 -06:00
+								        if (
 								          decision.reason === "rate_limit" ||
 								          decision.reason === "overloaded" ||
-												fix: use unknown instead of rate_limit as default cooldown reason (#42911)

Merged via squash.

Prepared head SHA: bebf6704d7b02b9a32935c0006eac2d76694fec0
Co-authored-by: VibhorGautam <55019395+VibhorGautam@users.noreply.github.com>
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Reviewed-by: @altaywtf
											
										
										
											2026-03-12 00:04:14 +05:30
+								          decision.reason === "billing" ||
 								          decision.reason === "unknown"
-												fix(agents): broaden 402 temporary-limit detection and allow billing cooldown probe (#38533)

Merged via squash.

Prepared head SHA: 282b9186c6f48fcdbf0c81c49f739e5e9ed2df23
Co-authored-by: xialonglee <22994703+xialonglee@users.noreply.github.com>
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Reviewed-by: @altaywtf
											
										
										
											2026-03-08 01:27:01 -06:00
+								        ) {
-												fix(agents): avoid duplicate same-provider cooldown probes in fallback runs (#41711)

Merged via squash.

Prepared head SHA: 8be8967bcb4be81f6abc5ff078644ec4efcfe7a0
Co-authored-by: cgdusek <38732970+cgdusek@users.noreply.github.com>
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Reviewed-by: @altaywtf
											
										
										
											2026-03-10 07:26:47 -05:00
+								          // Probe at most once per provider per fallback run when all profiles
 								          // are cooldowned. Re-probing every same-provider candidate can stall
 								          // cross-provider fallback on providers with long internal retries.
 								          const isTransientCooldownReason =
-												fix: use unknown instead of rate_limit as default cooldown reason (#42911)

Merged via squash.

Prepared head SHA: bebf6704d7b02b9a32935c0006eac2d76694fec0
Co-authored-by: VibhorGautam <55019395+VibhorGautam@users.noreply.github.com>
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Reviewed-by: @altaywtf
											
										
										
											2026-03-12 00:04:14 +05:30
+								            decision.reason === "rate_limit" ||
 								            decision.reason === "overloaded" ||
 								            decision.reason === "unknown";
-												fix(agents): avoid duplicate same-provider cooldown probes in fallback runs (#41711)

Merged via squash.

Prepared head SHA: 8be8967bcb4be81f6abc5ff078644ec4efcfe7a0
Co-authored-by: cgdusek <38732970+cgdusek@users.noreply.github.com>
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Reviewed-by: @altaywtf
											
										
										
											2026-03-10 07:26:47 -05:00
+								          if (isTransientCooldownReason && cooldownProbeUsedProviders.has(candidate.provider)) {
 								            const error = `Provider ${candidate.provider} is in cooldown (probe already attempted this run)`;
 								            attempts.push({
 								              provider: candidate.provider,
 								              model: candidate.model,
 								              error,
 								              reason: decision.reason,
 								            });
 								            logModelFallbackDecision({
 								              decision: "skip_candidate",
 								              runId: params.runId,
 								              requestedProvider: params.provider,
 								              requestedModel: params.model,
 								              candidate,
 								              attempt: i + 1,
 								              total: candidates.length,
 								              reason: decision.reason,
 								              error,
 								              nextCandidate: candidates[i + 1],
 								              isPrimary,
 								              requestedModelMatched: requestedModel,
 								              fallbackConfigured: hasFallbackCandidates,
 								              profileCount: profileIds.length,
 								            });
 								            continue;
 								          }
-												fix(agents): handle overloaded failover separately (#38301)

* fix(agents): skip auth-profile failure on overload

* fix(agents): note overload auth-profile fallback fix

* fix(agents): classify overloaded failures separately

* fix(agents): back off before overload failover

* fix(agents): tighten overload probe and backoff state

* fix(agents): persist overloaded cooldown across runs

* fix(agents): tighten overloaded status handling

* test(agents): add overload regression coverage

* fix(agents): restore runner imports after rebase

* test(agents): add overload fallback integration coverage

* fix(agents): harden overloaded failover abort handling

* test(agents): tighten overload classifier coverage

* test(agents): cover all-overloaded fallback exhaustion

* fix(cron): retry overloaded fallback summaries

* fix(cron): treat HTTP 529 as overloaded retry
											
										
										
											2026-03-07 01:42:11 +03:00
+								          runOptions = { allowTransientCooldownProbe: true };
-												fix(agents): avoid duplicate same-provider cooldown probes in fallback runs (#41711)

Merged via squash.

Prepared head SHA: 8be8967bcb4be81f6abc5ff078644ec4efcfe7a0
Co-authored-by: cgdusek <38732970+cgdusek@users.noreply.github.com>
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Reviewed-by: @altaywtf
											
										
										
											2026-03-10 07:26:47 -05:00
+								          if (isTransientCooldownReason) {
 								            transientProbeProviderForAttempt = candidate.provider;
 								          }
-												fix(agents): honor explicit rate-limit cooldown probes in fallback runs

											
										
										
											2026-03-05 20:02:36 -08:00
+								        }
-												Agents: add fallback error observations (#41337)

Merged via squash.

Prepared head SHA: 852469c82ff28fb0e1be7f1019f5283e712c4283
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Reviewed-by: @altaywtf
											
										
										
											2026-03-10 01:12:10 +03:00
+								        attemptedDuringCooldown = true;
 								        logModelFallbackDecision({
 								          decision: "probe_cooldown_candidate",
 								          runId: params.runId,
 								          requestedProvider: params.provider,
 								          requestedModel: params.model,
 								          candidate,
 								          attempt: i + 1,
 								          total: candidates.length,
 								          reason: decision.reason,
 								          nextCandidate: candidates[i + 1],
 								          isPrimary,
 								          requestedModelMatched: requestedModel,
 								          fallbackConfigured: hasFallbackCandidates,
 								          allowTransientCooldownProbe: runOptions?.allowTransientCooldownProbe,
 								          profileCount: profileIds.length,
 								        });
-												Skip cooldowned providers during model failover (#2143)

* feat(agents): skip cooldowned providers during failover

When all auth profiles for a provider are in cooldown, the failover
mechanism now skips that provider immediately rather than attempting
and waiting for the cooldown error. This prevents long delays when
multiple OAuth providers fail in sequence.

* fix(agents): correct imports and API usage for cooldown check
											
										
										
											2026-01-26 21:59:38 -05:00
+								      }
 								    }
-												fix(agents): comprehensive quota fallback fixes - session overrides + surgical cooldown logic (#23816)

Merged via /review-pr -> /prepare-pr -> /merge-pr.

Prepared head SHA: e6f2b4742b82b9fe44a7e103170c2f96565b09c5
Co-authored-by: ramezgaberiel <844893+ramezgaberiel@users.noreply.github.com>
Co-authored-by: gumadeiras <5599352+gumadeiras@users.noreply.github.com>
Reviewed-by: @gumadeiras

											
										
										
											2026-02-25 19:35:40 -06:00
-												fix(agents): honor explicit rate-limit cooldown probes in fallback runs

											
										
										
											2026-03-05 20:02:36 -08:00
+								    const attemptRun = await runFallbackAttempt({
 								      run: params.run,
 								      ...candidate,
 								      attempts,
 								      options: runOptions,
 								    });
-												refactor(agents): dedupe model and tool test helpers

											
										
										
											2026-03-02 21:30:12 +00:00
+								    if ("success" in attemptRun) {
-												Agents: add fallback error observations (#41337)

Merged via squash.

Prepared head SHA: 852469c82ff28fb0e1be7f1019f5283e712c4283
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Reviewed-by: @altaywtf
											
										
										
											2026-03-10 01:12:10 +03:00
+								      if (i > 0 || attempts.length > 0 || attemptedDuringCooldown) {
 								        logModelFallbackDecision({
 								          decision: "candidate_succeeded",
 								          runId: params.runId,
 								          requestedProvider: params.provider,
 								          requestedModel: params.model,
 								          candidate,
 								          attempt: i + 1,
 								          total: candidates.length,
 								          previousAttempts: attempts,
 								          isPrimary,
 								          requestedModelMatched: requestedModel,
 								          fallbackConfigured: hasFallbackCandidates,
 								        });
 								      }
-												fix(agents): warn clearly on unresolved model ids (#39215, thanks @ademczuk)

Co-authored-by: ademczuk <andrew.demczuk@gmail.com>

											
										
										
											2026-03-07 22:50:27 +00:00
+								      const notFoundAttempt =
 								        i > 0 ? attempts.find((a) => a.reason === "model_not_found") : undefined;
 								      if (notFoundAttempt) {
 								        log.warn(
 								          `Model "${sanitizeForLog(notFoundAttempt.provider)}/${sanitizeForLog(notFoundAttempt.model)}" not found. Fell back to "${sanitizeForLog(candidate.provider)}/${sanitizeForLog(candidate.model)}".`,
 								        );
 								      }
-												refactor(agents): dedupe model and tool test helpers

											
										
										
											2026-03-02 21:30:12 +00:00
+								      return attemptRun.success;
 								    }
 								    const err = attemptRun.error;
 								    {
-												fix(agents): avoid duplicate same-provider cooldown probes in fallback runs (#41711)

Merged via squash.

Prepared head SHA: 8be8967bcb4be81f6abc5ff078644ec4efcfe7a0
Co-authored-by: cgdusek <38732970+cgdusek@users.noreply.github.com>
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Reviewed-by: @altaywtf
											
										
										
											2026-03-10 07:26:47 -05:00
+								      if (transientProbeProviderForAttempt) {
 								        const probeFailureReason = describeFailoverError(err).reason;
 								        const shouldPreserveTransientProbeSlot =
 								          probeFailureReason === "model_not_found" ||
 								          probeFailureReason === "format" ||
 								          probeFailureReason === "auth" ||
 								          probeFailureReason === "auth_permanent" ||
 								          probeFailureReason === "session_expired";
 								        if (!shouldPreserveTransientProbeSlot) {
 								          cooldownProbeUsedProviders.add(transientProbeProviderForAttempt);
 								        }
 								      }
-												Agents: add nested subagent orchestration controls and reduce subagent token waste (#14447)

* Agents: add subagent orchestration controls

* Agents: add subagent orchestration controls (WIP uncommitted changes)

* feat(subagents): add depth-based spawn gating for sub-sub-agents

* feat(subagents): tool policy, registry, and announce chain for nested agents

* feat(subagents): system prompt, docs, changelog for nested sub-agents

* fix(subagents): prevent model fallback override, show model during active runs, and block context overflow fallback

Bug 1: When a session has an explicit model override (e.g., gpt/openai-codex),
the fallback candidate logic in resolveFallbackCandidates silently appended the
global primary model (opus) as a backstop. On reinjection/steer with a transient
error, the session could fall back to opus which has a smaller context window
and crash. Fix: when storedModelOverride is set, pass fallbacksOverride ?? []
instead of undefined, preventing the implicit primary backstop.

Bug 2: Active subagents showed 'model n/a' in /subagents list because
resolveModelDisplay only read entry.model/modelProvider (populated after run
completes). Fix: fall back to modelOverride/providerOverride fields which are
populated at spawn time via sessions.patch.

Bug 3: Context overflow errors (prompt too long, context_length_exceeded) could
theoretically escape runEmbeddedPiAgent and be treated as failover candidates
in runWithModelFallback, causing a switch to a model with a smaller context
window. Fix: in runWithModelFallback, detect context overflow errors via
isLikelyContextOverflowError and rethrow them immediately instead of trying the
next model candidate.

* fix(subagents): track spawn depth in session store and fix announce routing for nested agents

* Fix compaction status tracking and dedupe overflow compaction triggers

* fix(subagents): enforce depth block via session store and implement cascade kill

* fix: inject group chat context into system prompt

* fix(subagents): always write model to session store at spawn time

* Preserve spawnDepth when agent handler rewrites session entry

* fix(subagents): suppress announce on steer-restart

* fix(subagents): fallback spawned session model to runtime default

* fix(subagents): enforce spawn depth when caller key resolves by sessionId

* feat(subagents): implement active-first ordering for numeric targets and enhance task display

- Added a test to verify that subagents with numeric targets follow an active-first list ordering.
- Updated `resolveSubagentTarget` to sort subagent runs based on active status and recent activity.
- Enhanced task display in command responses to prevent truncation of long task descriptions.
- Introduced new utility functions for compacting task text and managing subagent run states.

* fix(subagents): show model for active runs via run record fallback

When the spawned model matches the agent's default model, the session
store's override fields are intentionally cleared (isDefault: true).
The model/modelProvider fields are only populated after the run
completes. This left active subagents showing 'model n/a'.

Fix: store the resolved model on SubagentRunRecord at registration
time, and use it as a fallback in both display paths (subagents tool
and /subagents command) when the session store entry has no model info.

Changes:
- SubagentRunRecord: add optional model field
- registerSubagentRun: accept and persist model param
- sessions-spawn-tool: pass resolvedModel to registerSubagentRun
- subagents-tool: pass run record model as fallback to resolveModelDisplay
- commands-subagents: pass run record model as fallback to resolveModelDisplay

* feat(chat): implement session key resolution and reset on sidebar navigation

- Added functions to resolve the main session key and reset chat state when switching sessions from the sidebar.
- Updated the `renderTab` function to handle session key changes when navigating to the chat tab.
- Introduced a test to verify that the session resets to "main" when opening chat from the sidebar navigation.

* fix: subagent timeout=0 passthrough and fallback prompt duplication

Bug 1: runTimeoutSeconds=0 now means 'no timeout' instead of applying 600s default
- sessions-spawn-tool: default to undefined (not 0) when neither timeout param
  is provided; use != null check so explicit 0 passes through to gateway
- agent.ts: accept 0 as valid timeout (resolveAgentTimeoutMs already handles
  0 → MAX_SAFE_TIMEOUT_MS)

Bug 2: model fallback no longer re-injects the original prompt as a duplicate
- agent.ts: track fallback attempt index; on retries use a short continuation
  message instead of the full original prompt since the session file already
  contains it from the first attempt
- Also skip re-sending images on fallback retries (already in session)

* feat(subagents): truncate long task descriptions in subagents command output

- Introduced a new utility function to format task previews, limiting their length to improve readability.
- Updated the command handler to use the new formatting function, ensuring task descriptions are truncated appropriately.
- Adjusted related tests to verify that long task descriptions are now truncated in the output.

* refactor(subagents): update subagent registry path resolution and improve command output formatting

- Replaced direct import of STATE_DIR with a utility function to resolve the state directory dynamically.
- Enhanced the formatting of command output for active and recent subagents, adding separators for better readability.
- Updated related tests to reflect changes in command output structure.

* fix(subagent): default sessions_spawn to no timeout when runTimeoutSeconds omitted

The previous fix (75a791106) correctly handled the case where
runTimeoutSeconds was explicitly set to 0 ("no timeout"). However,
when models omit the parameter entirely (which is common since the
schema marks it as optional), runTimeoutSeconds resolved to undefined.

undefined flowed through the chain as:
  sessions_spawn → timeout: undefined (since undefined != null is false)
  → gateway agent handler → agentCommand opts.timeout: undefined
  → resolveAgentTimeoutMs({ overrideSeconds: undefined })
  → DEFAULT_AGENT_TIMEOUT_SECONDS (600s = 10 minutes)

This caused subagents to be killed at exactly 10 minutes even though
the user's intent (via TOOLS.md) was for subagents to run without a
timeout.

Fix: default runTimeoutSeconds to 0 (no timeout) when neither
runTimeoutSeconds nor timeoutSeconds is provided by the caller.
Subagent spawns are long-running by design and should not inherit the
600s agent-command default timeout.

* fix(subagent): accept timeout=0 in agent-via-gateway path (second 600s default)

* fix: thread timeout override through getReplyFromConfig dispatch path

getReplyFromConfig called resolveAgentTimeoutMs({ cfg }) with no override,
always falling back to the config default (600s). Add timeoutOverrideSeconds
to GetReplyOptions and pass it through as overrideSeconds so callers of the
dispatch chain can specify a custom timeout (0 = no timeout).

This complements the existing timeout threading in agentCommand and the
cron isolated-agent runner, which already pass overrideSeconds correctly.

* feat(model-fallback): normalize OpenAI Codex model references and enhance fallback handling

- Added normalization for OpenAI Codex model references, specifically converting "gpt-5.3-codex" to "openai-codex" before execution.
- Updated the `resolveFallbackCandidates` function to utilize the new normalization logic.
- Enhanced tests to verify the correct behavior of model normalization and fallback mechanisms.
- Introduced a new test case to ensure that the normalization process works as expected for various input formats.

* feat(tests): add unit tests for steer failure behavior in openclaw-tools

- Introduced a new test file to validate the behavior of subagents when steer replacement dispatch fails.
- Implemented tests to ensure that the announce behavior is restored correctly and that the suppression reason is cleared as expected.
- Enhanced the subagent registry with a new function to clear steer restart suppression.
- Updated related components to support the new test scenarios.

* fix(subagents): replace stop command with kill in slash commands and documentation

- Updated the `/subagents` command to replace `stop` with `kill` for consistency in controlling sub-agent runs.
- Modified related documentation to reflect the change in command usage.
- Removed legacy timeoutSeconds references from the sessions-spawn-tool schema and tests to streamline timeout handling.
- Enhanced tests to ensure correct behavior of the updated commands and their interactions.

* feat(tests): add unit tests for readLatestAssistantReply function

- Introduced a new test file for the `readLatestAssistantReply` function to validate its behavior with various message scenarios.
- Implemented tests to ensure the function correctly retrieves the latest assistant message and handles cases where the latest message has no text.
- Mocked the gateway call to simulate different message histories for comprehensive testing.

* feat(tests): enhance subagent kill-all cascade tests and announce formatting

- Added a new test to verify that the `kill-all` command cascades through ended parents to active descendants in subagents.
- Updated the subagent announce formatting tests to reflect changes in message structure, including the replacement of "Findings:" with "Result:" and the addition of new expectations for message content.
- Improved the handling of long findings and stats in the announce formatting logic to ensure concise output.
- Refactored related functions to enhance clarity and maintainability in the subagent registry and tools.

* refactor(subagent): update announce formatting and remove unused constants

- Modified the subagent announce formatting to replace "Findings:" with "Result:" and adjusted related expectations in tests.
- Removed constants for maximum announce findings characters and summary words, simplifying the announcement logic.
- Updated the handling of findings to retain full content instead of truncating, ensuring more informative outputs.
- Cleaned up unused imports in the commands-subagents file to enhance code clarity.

* feat(tests): enhance billing error handling in user-facing text

- Added tests to ensure that normal text mentioning billing plans is not rewritten, preserving user context.
- Updated the `isBillingErrorMessage` and `sanitizeUserFacingText` functions to improve handling of billing-related messages.
- Introduced new test cases for various scenarios involving billing messages to ensure accurate processing and output.
- Enhanced the subagent announce flow to correctly manage active descendant runs, preventing premature announcements.

* feat(subagent): enhance workflow guidance and auto-announcement clarity

- Added a new guideline in the subagent system prompt to emphasize trust in push-based completion, discouraging busy polling for status updates.
- Updated documentation to clarify that sub-agents will automatically announce their results, improving user understanding of the workflow.
- Enhanced tests to verify the new guidance on avoiding polling loops and to ensure the accuracy of the updated prompts.

* fix(cron): avoid announcing interim subagent spawn acks

* chore: clean post-rebase imports

* fix(cron): fall back to child replies when parent stays interim

* fix(subagents): make active-run guidance advisory

* fix(subagents): update announce flow to handle active descendants and enhance test coverage

- Modified the announce flow to defer announcements when active descendant runs are present, ensuring accurate status reporting.
- Updated tests to verify the new behavior, including scenarios where no fallback requester is available and ensuring proper handling of finished subagents.
- Enhanced the announce formatting to include an `expectFinal` flag for better clarity in the announcement process.

* fix(subagents): enhance announce flow and formatting for user updates

- Updated the announce flow to provide clearer instructions for user updates based on active subagent runs and requester context.
- Refactored the announcement logic to improve clarity and ensure internal context remains private.
- Enhanced tests to verify the new message expectations and formatting, including updated prompts for user-facing updates.
- Introduced a new function to build reply instructions based on session context, improving the overall announcement process.

* fix: resolve prep blockers and changelog placement (#14447) (thanks @tyler6204)

* fix: restore cron delivery-plan import after rebase (#14447) (thanks @tyler6204)

* fix: resolve test failures from rebase conflicts (#14447) (thanks @tyler6204)

* fix: apply formatting after rebase (#14447) (thanks @tyler6204)
											
										
										
											2026-02-14 22:03:45 -08:00
+								      // Context overflow errors should be handled by the inner runner's
 								      // compaction/retry logic, not by model fallback.  If one escapes as a
 								      // throw, rethrow it immediately rather than trying a different model
 								      // that may have a smaller context window and fail worse.
 								      const errMessage = err instanceof Error ? err.message : String(err);
 								      if (isLikelyContextOverflowError(errMessage)) {
 								        throw err;
 								      }
-												refactor(agents): centralize failover normalization

											
										
										
											2026-01-09 22:15:03 +01:00
+								      const normalized =
 								        coerceToFailoverError(err, {
 								          provider: candidate.provider,
 								          model: candidate.model,
 								        }) ?? err;
-												fix(agents): continue fallback loop for unrecognized provider errors (#26106)

* fix(agents): continue fallback loop for unrecognized provider errors

When a provider returns an error that coerceToFailoverError cannot
classify (e.g., custom error messages without standard HTTP status
codes), the fallback loop threw immediately instead of trying the
next candidate. This caused fallback to stop after 2 models even
when 17 were configured.

Only rethrow unrecognized errors when they occur on the last
candidate. For intermediate candidates, record the error as an
attempt and continue to the next model.

Closes #25926

Co-authored-by: Cursor <cursoragent@cursor.com>

* test: cover unknown-error fallback telemetry and land #26106 (thanks @Sid-Qin)

---------

Co-authored-by: Cursor <cursoragent@cursor.com>
Co-authored-by: Peter Steinberger <steipete@gmail.com>
											
										
										
											2026-02-25 12:53:26 +08:00
 								      // Even unrecognized errors should not abort the fallback loop when
 								      // there are remaining candidates.  Only abort/context-overflow errors
 								      // (handled above) are truly non-retryable.
 								      const isKnownFailover = isFailoverError(normalized);
 								      if (!isKnownFailover && i === candidates.length - 1) {
-												chore: Enable "curly" rule to avoid single-statement if confusion/errors.

											
										
										
											2026-01-31 16:19:20 +09:00
+								        throw err;
 								      }
-												refactor(agents): centralize failover normalization

											
										
										
											2026-01-09 22:15:03 +01:00
-												fix(agents): continue fallback loop for unrecognized provider errors (#26106)

* fix(agents): continue fallback loop for unrecognized provider errors

When a provider returns an error that coerceToFailoverError cannot
classify (e.g., custom error messages without standard HTTP status
codes), the fallback loop threw immediately instead of trying the
next candidate. This caused fallback to stop after 2 models even
when 17 were configured.

Only rethrow unrecognized errors when they occur on the last
candidate. For intermediate candidates, record the error as an
attempt and continue to the next model.

Closes #25926

Co-authored-by: Cursor <cursoragent@cursor.com>

* test: cover unknown-error fallback telemetry and land #26106 (thanks @Sid-Qin)

---------

Co-authored-by: Cursor <cursoragent@cursor.com>
Co-authored-by: Peter Steinberger <steipete@gmail.com>
											
										
										
											2026-02-25 12:53:26 +08:00
+								      lastError = isKnownFailover ? normalized : err;
-												refactor(agents): centralize failover normalization

											
										
										
											2026-01-09 22:15:03 +01:00
+								      const described = describeFailoverError(normalized);
-												feat: add models scan and fallbacks

											
										
										
											2026-01-04 17:50:55 +01:00
+								      attempts.push({
 								        provider: candidate.provider,
 								        model: candidate.model,
-												fix(auth): billing backoff + cooldown UX

											
										
										
											2026-01-09 21:57:52 +01:00
+								        error: described.message,
-												fix(agents): continue fallback loop for unrecognized provider errors (#26106)

* fix(agents): continue fallback loop for unrecognized provider errors

When a provider returns an error that coerceToFailoverError cannot
classify (e.g., custom error messages without standard HTTP status
codes), the fallback loop threw immediately instead of trying the
next candidate. This caused fallback to stop after 2 models even
when 17 were configured.

Only rethrow unrecognized errors when they occur on the last
candidate. For intermediate candidates, record the error as an
attempt and continue to the next model.

Closes #25926

Co-authored-by: Cursor <cursoragent@cursor.com>

* test: cover unknown-error fallback telemetry and land #26106 (thanks @Sid-Qin)

---------

Co-authored-by: Cursor <cursoragent@cursor.com>
Co-authored-by: Peter Steinberger <steipete@gmail.com>
											
										
										
											2026-02-25 12:53:26 +08:00
+								        reason: described.reason ?? "unknown",
-												fix(auth): billing backoff + cooldown UX

											
										
										
											2026-01-09 21:57:52 +01:00
+								        status: described.status,
 								        code: described.code,
-												feat: add models scan and fallbacks

											
										
										
											2026-01-04 17:50:55 +01:00
+								      });
-												Agents: add fallback error observations (#41337)

Merged via squash.

Prepared head SHA: 852469c82ff28fb0e1be7f1019f5283e712c4283
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Reviewed-by: @altaywtf
											
										
										
											2026-03-10 01:12:10 +03:00
+								      logModelFallbackDecision({
 								        decision: "candidate_failed",
 								        runId: params.runId,
 								        requestedProvider: params.provider,
 								        requestedModel: params.model,
 								        candidate,
 								        attempt: i + 1,
 								        total: candidates.length,
 								        reason: described.reason,
 								        status: described.status,
 								        code: described.code,
 								        error: described.message,
 								        nextCandidate: candidates[i + 1],
 								        isPrimary,
 								        requestedModelMatched: requestedModel,
 								        fallbackConfigured: hasFallbackCandidates,
 								      });
-												feat: add models scan and fallbacks

											
										
										
											2026-01-04 17:50:55 +01:00
+								      await params.onError?.({
 								        provider: candidate.provider,
 								        model: candidate.model,
-												fix(agents): continue fallback loop for unrecognized provider errors (#26106)

* fix(agents): continue fallback loop for unrecognized provider errors

When a provider returns an error that coerceToFailoverError cannot
classify (e.g., custom error messages without standard HTTP status
codes), the fallback loop threw immediately instead of trying the
next candidate. This caused fallback to stop after 2 models even
when 17 were configured.

Only rethrow unrecognized errors when they occur on the last
candidate. For intermediate candidates, record the error as an
attempt and continue to the next model.

Closes #25926

Co-authored-by: Cursor <cursoragent@cursor.com>

* test: cover unknown-error fallback telemetry and land #26106 (thanks @Sid-Qin)

---------

Co-authored-by: Cursor <cursoragent@cursor.com>
Co-authored-by: Peter Steinberger <steipete@gmail.com>
											
										
										
											2026-02-25 12:53:26 +08:00
+								        error: isKnownFailover ? normalized : err,
-												feat: add models scan and fallbacks

											
										
										
											2026-01-04 17:50:55 +01:00
+								        attempt: i + 1,
 								        total: candidates.length,
 								      });
 								    }
 								  }
-												refactor(agents): share fallback failure summary builder

											
										
										
											2026-02-19 00:10:08 +00:00
+								  throwFallbackFailureSummary({
 								    attempts,
 								    candidates,
 								    lastError,
 								    label: "models",
 								    formatAttempt: (attempt) =>
 								      `${attempt.provider}/${attempt.model}: ${attempt.error}${
 								        attempt.reason ? ` (${attempt.reason})` : ""
 								      }`,
-												chore: migrate to oxlint and oxfmt

Co-authored-by: Christoph Nakazawa <christoph.pojer@gmail.com>

											
										
										
											2026-01-14 14:31:43 +00:00
+								  });
-												feat: add models scan and fallbacks

											
										
										
											2026-01-04 17:50:55 +01:00
+								}
-												feat: add image model config + tool

											
										
										
											2026-01-04 19:35:00 +01:00
 								/**
 								 * Tries each resolved image-model candidate in order and returns the first
 								 * successful attempt.
 								 *
 								 * Candidates come from `resolveImageFallbackCandidates`, fed with the config,
 								 * `DEFAULT_PROVIDER`, and the optional `modelOverride`. Each failed attempt is
 								 * recorded in a local `attempts` list and reported through `params.onError`
 								 * (when provided) before the next candidate is tried.
 								 *
 								 * @param params.cfg - Configuration used to resolve image-model candidates.
 								 * @param params.modelOverride - Optional model reference forwarded to candidate resolution.
 								 * @param params.run - Callback forwarded to `runFallbackAttempt` for each candidate.
 								 * @param params.onError - Optional handler awaited after every failed attempt.
 								 * @returns The `success` payload of the first attempt that reports one.
 								 * @throws Error when no image-model candidate could be resolved.
 								 * @throws Whatever `throwFallbackFailureSummary` raises once all candidates
 								 *   have failed (presumably an aggregated failure — confirm against that helper).
 								 */
 								export async function runWithImageModelFallback<T>(params: {
 								  cfg: OpenClawConfig | undefined;
 								  modelOverride?: string;
 								  run: (provider: string, model: string) => Promise<T>;
 								  onError?: ModelFallbackErrorHandler;
 								}): Promise<ModelFallbackRunResult<T>> {
 								  const { cfg, modelOverride, run } = params;
 								  const candidates = resolveImageFallbackCandidates({
 								    cfg,
 								    defaultProvider: DEFAULT_PROVIDER,
 								    modelOverride,
 								  });
 								  // Nothing to try: fail fast with a configuration hint.
 								  if (candidates.length === 0) {
 								    throw new Error(
 								      "No image model configured. Set agents.defaults.imageModel.primary or agents.defaults.imageModel.fallbacks.",
 								    );
 								  }
 								  const attempts: FallbackAttempt[] = [];
 								  let lastError: unknown;
 								  for (const [index, candidate] of candidates.entries()) {
 								    const outcome = await runFallbackAttempt({ run, ...candidate, attempts });
 								    if ("success" in outcome) {
 								      return outcome.success;
 								    }
 								    // Failed attempt: remember it, record a printable message, notify the caller.
 								    const failure = outcome.error;
 								    lastError = failure;
 								    const message = failure instanceof Error ? failure.message : String(failure);
 								    attempts.push({
 								      provider: candidate.provider,
 								      model: candidate.model,
 								      error: message,
 								    });
 								    await params.onError?.({
 								      provider: candidate.provider,
 								      model: candidate.model,
 								      error: failure,
 								      attempt: index + 1,
 								      total: candidates.length,
 								    });
 								  }
 								  // Every candidate failed; hand the collected attempts to the shared summary helper.
 								  throwFallbackFailureSummary({
 								    attempts,
 								    candidates,
 								    lastError,
 								    label: "image models",
 								    formatAttempt: (attempt) => `${attempt.provider}/${attempt.model}: ${attempt.error}`,
 								  });
 								}