2026-01-30 03:15:10 +01:00
import type { OpenClawConfig } from "../config/config.js" ;
2026-02-23 16:18:55 +08:00
import {
resolveAgentModelFallbackValues ,
resolveAgentModelPrimaryValue ,
} from "../config/model-input.js" ;
2026-03-07 22:50:27 +00:00
import { createSubsystemLogger } from "../logging/subsystem.js" ;
import { sanitizeForLog } from "../terminal/ansi.js" ;
2026-02-01 10:03:47 +09:00
import {
ensureAuthProfileStore ,
2026-02-16 10:03:35 -03:00
getSoonestCooldownExpiry ,
2026-02-01 10:03:47 +09:00
isProfileInCooldown ,
2026-02-22 16:10:24 -08:00
resolveProfilesUnavailableReason ,
2026-02-01 10:03:47 +09:00
resolveAuthProfileOrder ,
} from "./auth-profiles.js" ;
2026-01-04 17:50:55 +01:00
import { DEFAULT_MODEL , DEFAULT_PROVIDER } from "./defaults.js" ;
2026-01-18 07:52:19 +00:00
import {
coerceToFailoverError ,
describeFailoverError ,
isFailoverError ,
isTimeoutError ,
} from "./failover-error.js" ;
2026-03-10 01:12:10 +03:00
import { logModelFallbackDecision } from "./model-fallback-observation.js" ;
import type { FallbackAttempt , ModelCandidate } from "./model-fallback.types.js" ;
2026-01-04 17:50:55 +01:00
import {
2026-02-05 16:54:44 -05:00
buildConfiguredAllowlistKeys ,
2026-01-04 17:50:55 +01:00
buildModelAliasIndex ,
modelKey ,
Agents: add nested subagent orchestration controls and reduce subagent token waste (#14447)
* Agents: add subagent orchestration controls
* Agents: add subagent orchestration controls (WIP uncommitted changes)
* feat(subagents): add depth-based spawn gating for sub-sub-agents
* feat(subagents): tool policy, registry, and announce chain for nested agents
* feat(subagents): system prompt, docs, changelog for nested sub-agents
* fix(subagents): prevent model fallback override, show model during active runs, and block context overflow fallback
Bug 1: When a session has an explicit model override (e.g., gpt/openai-codex),
the fallback candidate logic in resolveFallbackCandidates silently appended the
global primary model (opus) as a backstop. On reinjection/steer with a transient
error, the session could fall back to opus which has a smaller context window
and crash. Fix: when storedModelOverride is set, pass fallbacksOverride ?? []
instead of undefined, preventing the implicit primary backstop.
Bug 2: Active subagents showed 'model n/a' in /subagents list because
resolveModelDisplay only read entry.model/modelProvider (populated after run
completes). Fix: fall back to modelOverride/providerOverride fields which are
populated at spawn time via sessions.patch.
Bug 3: Context overflow errors (prompt too long, context_length_exceeded) could
theoretically escape runEmbeddedPiAgent and be treated as failover candidates
in runWithModelFallback, causing a switch to a model with a smaller context
window. Fix: in runWithModelFallback, detect context overflow errors via
isLikelyContextOverflowError and rethrow them immediately instead of trying the
next model candidate.
* fix(subagents): track spawn depth in session store and fix announce routing for nested agents
* Fix compaction status tracking and dedupe overflow compaction triggers
* fix(subagents): enforce depth block via session store and implement cascade kill
* fix: inject group chat context into system prompt
* fix(subagents): always write model to session store at spawn time
* Preserve spawnDepth when agent handler rewrites session entry
* fix(subagents): suppress announce on steer-restart
* fix(subagents): fallback spawned session model to runtime default
* fix(subagents): enforce spawn depth when caller key resolves by sessionId
* feat(subagents): implement active-first ordering for numeric targets and enhance task display
- Added a test to verify that subagents with numeric targets follow an active-first list ordering.
- Updated `resolveSubagentTarget` to sort subagent runs based on active status and recent activity.
- Enhanced task display in command responses to prevent truncation of long task descriptions.
- Introduced new utility functions for compacting task text and managing subagent run states.
* fix(subagents): show model for active runs via run record fallback
When the spawned model matches the agent's default model, the session
store's override fields are intentionally cleared (isDefault: true).
The model/modelProvider fields are only populated after the run
completes. This left active subagents showing 'model n/a'.
Fix: store the resolved model on SubagentRunRecord at registration
time, and use it as a fallback in both display paths (subagents tool
and /subagents command) when the session store entry has no model info.
Changes:
- SubagentRunRecord: add optional model field
- registerSubagentRun: accept and persist model param
- sessions-spawn-tool: pass resolvedModel to registerSubagentRun
- subagents-tool: pass run record model as fallback to resolveModelDisplay
- commands-subagents: pass run record model as fallback to resolveModelDisplay
* feat(chat): implement session key resolution and reset on sidebar navigation
- Added functions to resolve the main session key and reset chat state when switching sessions from the sidebar.
- Updated the `renderTab` function to handle session key changes when navigating to the chat tab.
- Introduced a test to verify that the session resets to "main" when opening chat from the sidebar navigation.
* fix: subagent timeout=0 passthrough and fallback prompt duplication
Bug 1: runTimeoutSeconds=0 now means 'no timeout' instead of applying 600s default
- sessions-spawn-tool: default to undefined (not 0) when neither timeout param
is provided; use != null check so explicit 0 passes through to gateway
- agent.ts: accept 0 as valid timeout (resolveAgentTimeoutMs already handles
0 → MAX_SAFE_TIMEOUT_MS)
Bug 2: model fallback no longer re-injects the original prompt as a duplicate
- agent.ts: track fallback attempt index; on retries use a short continuation
message instead of the full original prompt since the session file already
contains it from the first attempt
- Also skip re-sending images on fallback retries (already in session)
* feat(subagents): truncate long task descriptions in subagents command output
- Introduced a new utility function to format task previews, limiting their length to improve readability.
- Updated the command handler to use the new formatting function, ensuring task descriptions are truncated appropriately.
- Adjusted related tests to verify that long task descriptions are now truncated in the output.
* refactor(subagents): update subagent registry path resolution and improve command output formatting
- Replaced direct import of STATE_DIR with a utility function to resolve the state directory dynamically.
- Enhanced the formatting of command output for active and recent subagents, adding separators for better readability.
- Updated related tests to reflect changes in command output structure.
* fix(subagent): default sessions_spawn to no timeout when runTimeoutSeconds omitted
The previous fix (75a791106) correctly handled the case where
runTimeoutSeconds was explicitly set to 0 ("no timeout"). However,
when models omit the parameter entirely (which is common since the
schema marks it as optional), runTimeoutSeconds resolved to undefined.
undefined flowed through the chain as:
sessions_spawn → timeout: undefined (since undefined != null is false)
→ gateway agent handler → agentCommand opts.timeout: undefined
→ resolveAgentTimeoutMs({ overrideSeconds: undefined })
→ DEFAULT_AGENT_TIMEOUT_SECONDS (600s = 10 minutes)
This caused subagents to be killed at exactly 10 minutes even though
the user's intent (via TOOLS.md) was for subagents to run without a
timeout.
Fix: default runTimeoutSeconds to 0 (no timeout) when neither
runTimeoutSeconds nor timeoutSeconds is provided by the caller.
Subagent spawns are long-running by design and should not inherit the
600s agent-command default timeout.
* fix(subagent): accept timeout=0 in agent-via-gateway path (second 600s default)
* fix: thread timeout override through getReplyFromConfig dispatch path
getReplyFromConfig called resolveAgentTimeoutMs({ cfg }) with no override,
always falling back to the config default (600s). Add timeoutOverrideSeconds
to GetReplyOptions and pass it through as overrideSeconds so callers of the
dispatch chain can specify a custom timeout (0 = no timeout).
This complements the existing timeout threading in agentCommand and the
cron isolated-agent runner, which already pass overrideSeconds correctly.
* feat(model-fallback): normalize OpenAI Codex model references and enhance fallback handling
- Added normalization for OpenAI Codex model references, specifically converting "gpt-5.3-codex" to "openai-codex" before execution.
- Updated the `resolveFallbackCandidates` function to utilize the new normalization logic.
- Enhanced tests to verify the correct behavior of model normalization and fallback mechanisms.
- Introduced a new test case to ensure that the normalization process works as expected for various input formats.
* feat(tests): add unit tests for steer failure behavior in openclaw-tools
- Introduced a new test file to validate the behavior of subagents when steer replacement dispatch fails.
- Implemented tests to ensure that the announce behavior is restored correctly and that the suppression reason is cleared as expected.
- Enhanced the subagent registry with a new function to clear steer restart suppression.
- Updated related components to support the new test scenarios.
* fix(subagents): replace stop command with kill in slash commands and documentation
- Updated the `/subagents` command to replace `stop` with `kill` for consistency in controlling sub-agent runs.
- Modified related documentation to reflect the change in command usage.
- Removed legacy timeoutSeconds references from the sessions-spawn-tool schema and tests to streamline timeout handling.
- Enhanced tests to ensure correct behavior of the updated commands and their interactions.
* feat(tests): add unit tests for readLatestAssistantReply function
- Introduced a new test file for the `readLatestAssistantReply` function to validate its behavior with various message scenarios.
- Implemented tests to ensure the function correctly retrieves the latest assistant message and handles cases where the latest message has no text.
- Mocked the gateway call to simulate different message histories for comprehensive testing.
* feat(tests): enhance subagent kill-all cascade tests and announce formatting
- Added a new test to verify that the `kill-all` command cascades through ended parents to active descendants in subagents.
- Updated the subagent announce formatting tests to reflect changes in message structure, including the replacement of "Findings:" with "Result:" and the addition of new expectations for message content.
- Improved the handling of long findings and stats in the announce formatting logic to ensure concise output.
- Refactored related functions to enhance clarity and maintainability in the subagent registry and tools.
* refactor(subagent): update announce formatting and remove unused constants
- Modified the subagent announce formatting to replace "Findings:" with "Result:" and adjusted related expectations in tests.
- Removed constants for maximum announce findings characters and summary words, simplifying the announcement logic.
- Updated the handling of findings to retain full content instead of truncating, ensuring more informative outputs.
- Cleaned up unused imports in the commands-subagents file to enhance code clarity.
* feat(tests): enhance billing error handling in user-facing text
- Added tests to ensure that normal text mentioning billing plans is not rewritten, preserving user context.
- Updated the `isBillingErrorMessage` and `sanitizeUserFacingText` functions to improve handling of billing-related messages.
- Introduced new test cases for various scenarios involving billing messages to ensure accurate processing and output.
- Enhanced the subagent announce flow to correctly manage active descendant runs, preventing premature announcements.
* feat(subagent): enhance workflow guidance and auto-announcement clarity
- Added a new guideline in the subagent system prompt to emphasize trust in push-based completion, discouraging busy polling for status updates.
- Updated documentation to clarify that sub-agents will automatically announce their results, improving user understanding of the workflow.
- Enhanced tests to verify the new guidance on avoiding polling loops and to ensure the accuracy of the updated prompts.
* fix(cron): avoid announcing interim subagent spawn acks
* chore: clean post-rebase imports
* fix(cron): fall back to child replies when parent stays interim
* fix(subagents): make active-run guidance advisory
* fix(subagents): update announce flow to handle active descendants and enhance test coverage
- Modified the announce flow to defer announcements when active descendant runs are present, ensuring accurate status reporting.
- Updated tests to verify the new behavior, including scenarios where no fallback requester is available and ensuring proper handling of finished subagents.
- Enhanced the announce formatting to include an `expectFinal` flag for better clarity in the announcement process.
* fix(subagents): enhance announce flow and formatting for user updates
- Updated the announce flow to provide clearer instructions for user updates based on active subagent runs and requester context.
- Refactored the announcement logic to improve clarity and ensure internal context remains private.
- Enhanced tests to verify the new message expectations and formatting, including updated prompts for user-facing updates.
- Introduced a new function to build reply instructions based on session context, improving the overall announcement process.
* fix: resolve prep blockers and changelog placement (#14447) (thanks @tyler6204)
* fix: restore cron delivery-plan import after rebase (#14447) (thanks @tyler6204)
* fix: resolve test failures from rebase conflicts (#14447) (thanks @tyler6204)
* fix: apply formatting after rebase (#14447) (thanks @tyler6204)
2026-02-14 22:03:45 -08:00
normalizeModelRef ,
2026-01-09 19:59:45 +01:00
resolveConfiguredModelRef ,
2026-01-04 17:50:55 +01:00
resolveModelRefFromString ,
} from "./model-selection.js" ;
2026-02-18 01:34:35 +00:00
import type { FailoverReason } from "./pi-embedded-helpers.js" ;
Agents: add nested subagent orchestration controls and reduce subagent token waste (#14447)
* Agents: add subagent orchestration controls
* Agents: add subagent orchestration controls (WIP uncommitted changes)
* feat(subagents): add depth-based spawn gating for sub-sub-agents
* feat(subagents): tool policy, registry, and announce chain for nested agents
* feat(subagents): system prompt, docs, changelog for nested sub-agents
* fix(subagents): prevent model fallback override, show model during active runs, and block context overflow fallback
Bug 1: When a session has an explicit model override (e.g., gpt/openai-codex),
the fallback candidate logic in resolveFallbackCandidates silently appended the
global primary model (opus) as a backstop. On reinjection/steer with a transient
error, the session could fall back to opus which has a smaller context window
and crash. Fix: when storedModelOverride is set, pass fallbacksOverride ?? []
instead of undefined, preventing the implicit primary backstop.
Bug 2: Active subagents showed 'model n/a' in /subagents list because
resolveModelDisplay only read entry.model/modelProvider (populated after run
completes). Fix: fall back to modelOverride/providerOverride fields which are
populated at spawn time via sessions.patch.
Bug 3: Context overflow errors (prompt too long, context_length_exceeded) could
theoretically escape runEmbeddedPiAgent and be treated as failover candidates
in runWithModelFallback, causing a switch to a model with a smaller context
window. Fix: in runWithModelFallback, detect context overflow errors via
isLikelyContextOverflowError and rethrow them immediately instead of trying the
next model candidate.
* fix(subagents): track spawn depth in session store and fix announce routing for nested agents
* Fix compaction status tracking and dedupe overflow compaction triggers
* fix(subagents): enforce depth block via session store and implement cascade kill
* fix: inject group chat context into system prompt
* fix(subagents): always write model to session store at spawn time
* Preserve spawnDepth when agent handler rewrites session entry
* fix(subagents): suppress announce on steer-restart
* fix(subagents): fallback spawned session model to runtime default
* fix(subagents): enforce spawn depth when caller key resolves by sessionId
* feat(subagents): implement active-first ordering for numeric targets and enhance task display
- Added a test to verify that subagents with numeric targets follow an active-first list ordering.
- Updated `resolveSubagentTarget` to sort subagent runs based on active status and recent activity.
- Enhanced task display in command responses to prevent truncation of long task descriptions.
- Introduced new utility functions for compacting task text and managing subagent run states.
* fix(subagents): show model for active runs via run record fallback
When the spawned model matches the agent's default model, the session
store's override fields are intentionally cleared (isDefault: true).
The model/modelProvider fields are only populated after the run
completes. This left active subagents showing 'model n/a'.
Fix: store the resolved model on SubagentRunRecord at registration
time, and use it as a fallback in both display paths (subagents tool
and /subagents command) when the session store entry has no model info.
Changes:
- SubagentRunRecord: add optional model field
- registerSubagentRun: accept and persist model param
- sessions-spawn-tool: pass resolvedModel to registerSubagentRun
- subagents-tool: pass run record model as fallback to resolveModelDisplay
- commands-subagents: pass run record model as fallback to resolveModelDisplay
* feat(chat): implement session key resolution and reset on sidebar navigation
- Added functions to resolve the main session key and reset chat state when switching sessions from the sidebar.
- Updated the `renderTab` function to handle session key changes when navigating to the chat tab.
- Introduced a test to verify that the session resets to "main" when opening chat from the sidebar navigation.
* fix: subagent timeout=0 passthrough and fallback prompt duplication
Bug 1: runTimeoutSeconds=0 now means 'no timeout' instead of applying 600s default
- sessions-spawn-tool: default to undefined (not 0) when neither timeout param
is provided; use != null check so explicit 0 passes through to gateway
- agent.ts: accept 0 as valid timeout (resolveAgentTimeoutMs already handles
0 → MAX_SAFE_TIMEOUT_MS)
Bug 2: model fallback no longer re-injects the original prompt as a duplicate
- agent.ts: track fallback attempt index; on retries use a short continuation
message instead of the full original prompt since the session file already
contains it from the first attempt
- Also skip re-sending images on fallback retries (already in session)
* feat(subagents): truncate long task descriptions in subagents command output
- Introduced a new utility function to format task previews, limiting their length to improve readability.
- Updated the command handler to use the new formatting function, ensuring task descriptions are truncated appropriately.
- Adjusted related tests to verify that long task descriptions are now truncated in the output.
* refactor(subagents): update subagent registry path resolution and improve command output formatting
- Replaced direct import of STATE_DIR with a utility function to resolve the state directory dynamically.
- Enhanced the formatting of command output for active and recent subagents, adding separators for better readability.
- Updated related tests to reflect changes in command output structure.
* fix(subagent): default sessions_spawn to no timeout when runTimeoutSeconds omitted
The previous fix (75a791106) correctly handled the case where
runTimeoutSeconds was explicitly set to 0 ("no timeout"). However,
when models omit the parameter entirely (which is common since the
schema marks it as optional), runTimeoutSeconds resolved to undefined.
undefined flowed through the chain as:
sessions_spawn → timeout: undefined (since undefined != null is false)
→ gateway agent handler → agentCommand opts.timeout: undefined
→ resolveAgentTimeoutMs({ overrideSeconds: undefined })
→ DEFAULT_AGENT_TIMEOUT_SECONDS (600s = 10 minutes)
This caused subagents to be killed at exactly 10 minutes even though
the user's intent (via TOOLS.md) was for subagents to run without a
timeout.
Fix: default runTimeoutSeconds to 0 (no timeout) when neither
runTimeoutSeconds nor timeoutSeconds is provided by the caller.
Subagent spawns are long-running by design and should not inherit the
600s agent-command default timeout.
* fix(subagent): accept timeout=0 in agent-via-gateway path (second 600s default)
* fix: thread timeout override through getReplyFromConfig dispatch path
getReplyFromConfig called resolveAgentTimeoutMs({ cfg }) with no override,
always falling back to the config default (600s). Add timeoutOverrideSeconds
to GetReplyOptions and pass it through as overrideSeconds so callers of the
dispatch chain can specify a custom timeout (0 = no timeout).
This complements the existing timeout threading in agentCommand and the
cron isolated-agent runner, which already pass overrideSeconds correctly.
* feat(model-fallback): normalize OpenAI Codex model references and enhance fallback handling
- Added normalization for OpenAI Codex model references, specifically converting "gpt-5.3-codex" to "openai-codex" before execution.
- Updated the `resolveFallbackCandidates` function to utilize the new normalization logic.
- Enhanced tests to verify the correct behavior of model normalization and fallback mechanisms.
- Introduced a new test case to ensure that the normalization process works as expected for various input formats.
* feat(tests): add unit tests for steer failure behavior in openclaw-tools
- Introduced a new test file to validate the behavior of subagents when steer replacement dispatch fails.
- Implemented tests to ensure that the announce behavior is restored correctly and that the suppression reason is cleared as expected.
- Enhanced the subagent registry with a new function to clear steer restart suppression.
- Updated related components to support the new test scenarios.
* fix(subagents): replace stop command with kill in slash commands and documentation
- Updated the `/subagents` command to replace `stop` with `kill` for consistency in controlling sub-agent runs.
- Modified related documentation to reflect the change in command usage.
- Removed legacy timeoutSeconds references from the sessions-spawn-tool schema and tests to streamline timeout handling.
- Enhanced tests to ensure correct behavior of the updated commands and their interactions.
* feat(tests): add unit tests for readLatestAssistantReply function
- Introduced a new test file for the `readLatestAssistantReply` function to validate its behavior with various message scenarios.
- Implemented tests to ensure the function correctly retrieves the latest assistant message and handles cases where the latest message has no text.
- Mocked the gateway call to simulate different message histories for comprehensive testing.
* feat(tests): enhance subagent kill-all cascade tests and announce formatting
- Added a new test to verify that the `kill-all` command cascades through ended parents to active descendants in subagents.
- Updated the subagent announce formatting tests to reflect changes in message structure, including the replacement of "Findings:" with "Result:" and the addition of new expectations for message content.
- Improved the handling of long findings and stats in the announce formatting logic to ensure concise output.
- Refactored related functions to enhance clarity and maintainability in the subagent registry and tools.
* refactor(subagent): update announce formatting and remove unused constants
- Modified the subagent announce formatting to replace "Findings:" with "Result:" and adjusted related expectations in tests.
- Removed constants for maximum announce findings characters and summary words, simplifying the announcement logic.
- Updated the handling of findings to retain full content instead of truncating, ensuring more informative outputs.
- Cleaned up unused imports in the commands-subagents file to enhance code clarity.
* feat(tests): enhance billing error handling in user-facing text
- Added tests to ensure that normal text mentioning billing plans is not rewritten, preserving user context.
- Updated the `isBillingErrorMessage` and `sanitizeUserFacingText` functions to improve handling of billing-related messages.
- Introduced new test cases for various scenarios involving billing messages to ensure accurate processing and output.
- Enhanced the subagent announce flow to correctly manage active descendant runs, preventing premature announcements.
* feat(subagent): enhance workflow guidance and auto-announcement clarity
- Added a new guideline in the subagent system prompt to emphasize trust in push-based completion, discouraging busy polling for status updates.
- Updated documentation to clarify that sub-agents will automatically announce their results, improving user understanding of the workflow.
- Enhanced tests to verify the new guidance on avoiding polling loops and to ensure the accuracy of the updated prompts.
* fix(cron): avoid announcing interim subagent spawn acks
* chore: clean post-rebase imports
* fix(cron): fall back to child replies when parent stays interim
* fix(subagents): make active-run guidance advisory
* fix(subagents): update announce flow to handle active descendants and enhance test coverage
- Modified the announce flow to defer announcements when active descendant runs are present, ensuring accurate status reporting.
- Updated tests to verify the new behavior, including scenarios where no fallback requester is available and ensuring proper handling of finished subagents.
- Enhanced the announce formatting to include an `expectFinal` flag for better clarity in the announcement process.
* fix(subagents): enhance announce flow and formatting for user updates
- Updated the announce flow to provide clearer instructions for user updates based on active subagent runs and requester context.
- Refactored the announcement logic to improve clarity and ensure internal context remains private.
- Enhanced tests to verify the new message expectations and formatting, including updated prompts for user-facing updates.
- Introduced a new function to build reply instructions based on session context, improving the overall announcement process.
* fix: resolve prep blockers and changelog placement (#14447) (thanks @tyler6204)
* fix: restore cron delivery-plan import after rebase (#14447) (thanks @tyler6204)
* fix: resolve test failures from rebase conflicts (#14447) (thanks @tyler6204)
* fix: apply formatting after rebase (#14447) (thanks @tyler6204)
2026-02-14 22:03:45 -08:00
import { isLikelyContextOverflowError } from "./pi-embedded-helpers.js" ;
2026-01-04 17:50:55 +01:00
2026-03-07 22:50:27 +00:00
// Logger for this module, created via createSubsystemLogger for the "model-fallback" subsystem.
const log = createSubsystemLogger ( "model-fallback" ) ;
2026-03-05 20:02:36 -08:00
/**
 * Per-attempt options forwarded to the run callback.
 */
export type ModelFallbackRunOptions = {
  // NOTE(review): presumably lets the callee probe a profile that is in a
  // transient cooldown — the semantics are not visible here; confirm with callers.
  allowTransientCooldownProbe?: boolean;
};

/**
 * Callback that executes a single attempt against one provider/model pair.
 *
 * `options` may be omitted by the caller.
 */
type ModelFallbackRunFn<T> = (
  provider: string,
  model: string,
  options?: ModelFallbackRunOptions,
) => Promise<T>;
2026-02-09 18:56:58 -08:00
/**
 * Fallback abort check. Only treats explicit AbortError names as user aborts.
 * Message-based checks (e.g., "aborted") can mask timeouts and skip fallback.
 *
 * @param err - Value caught from a failed attempt.
 * @returns `true` only when `err` is an object that is not a failover error
 *   and whose `name` stringifies to exactly "AbortError".
 */
function isFallbackAbortError(err: unknown): boolean {
  if (!err || typeof err !== "object") {
    return false;
  }
  // A failover error is never reported as an abort, whatever its name is.
  if (isFailoverError(err)) {
    return false;
  }
  const name = "name" in err ? String(err.name) : "";
  return name === "AbortError";
}
2026-01-18 07:52:19 +00:00
/**
 * Decides whether a caught error should be rethrown as an abort.
 *
 * @param err - Value caught from a failed attempt.
 * @returns `true` when `isFallbackAbortError` accepts the error and
 *   `isTimeoutError` does not.
 */
function shouldRethrowAbort(err: unknown): boolean {
  return isFallbackAbortError(err) && !isTimeoutError(err);
}
2026-02-15 06:07:01 +00:00
/**
 * Creates an ordered, de-duplicating collector of model candidates.
 *
 * Candidates are keyed with `modelKey(provider, model)`; a key is accepted at
 * most once, and candidates with an empty provider or model are ignored.
 *
 * @param allowlist - Optional set of allowed model keys. When it is `null` or
 *   `undefined`, no allowlist filtering happens at all.
 * @returns The live `candidates` array plus two adders:
 *   `addExplicitCandidate` never consults the allowlist, while
 *   `addAllowlistedCandidate` drops candidates whose key is not in it.
 */
function createModelCandidateCollector(allowlist: Set<string> | null | undefined): {
  candidates: ModelCandidate[];
  addExplicitCandidate: (candidate: ModelCandidate) => void;
  addAllowlistedCandidate: (candidate: ModelCandidate) => void;
} {
  const seen = new Set<string>();
  const candidates: ModelCandidate[] = [];
  const addCandidate = (candidate: ModelCandidate, enforceAllowlist: boolean) => {
    if (!candidate.provider || !candidate.model) {
      return;
    }
    const key = modelKey(candidate.provider, candidate.model);
    if (seen.has(key)) {
      return;
    }
    // Rejected keys are not recorded as seen, so a later explicit add of the
    // same candidate still succeeds.
    if (enforceAllowlist && allowlist && !allowlist.has(key)) {
      return;
    }
    seen.add(key);
    candidates.push(candidate);
  };
  const addExplicitCandidate = (candidate: ModelCandidate) => {
    addCandidate(candidate, false);
  };
  const addAllowlistedCandidate = (candidate: ModelCandidate) => {
    addCandidate(candidate, true);
  };
  return { candidates, addExplicitCandidate, addAllowlistedCandidate };
}
/**
 * Callback notified about a failed attempt; may be synchronous or asynchronous.
 *
 * NOTE(review): `attempt` / `total` look like the position of the failed
 * attempt and the number of candidates — base and exact meaning are not
 * visible in this chunk; confirm at the call site.
 */
type ModelFallbackErrorHandler = (attempt: {
  provider: string;
  model: string;
  error: unknown;
  attempt: number;
  total: number;
}) => void | Promise<void>;
/**
 * Shape returned for a successful fallback run (see buildFallbackSuccess).
 */
type ModelFallbackRunResult < T > = {
// Value produced by the run callback.
result : T ;
// Provider/model pair of the attempt that produced `result`.
provider : string ;
model : string ;
// Attempt records supplied by the caller.
// NOTE(review): presumably the failed attempts that preceded the success — confirm.
attempts : FallbackAttempt [ ] ;
} ;
2026-03-02 21:30:12 +00:00
/**
 * Packs the outcome of a successful attempt into a `ModelFallbackRunResult`.
 *
 * Only the four known fields are copied into a fresh object; the `attempts`
 * array is passed through by reference, not cloned.
 */
function buildFallbackSuccess<T>(params: {
  result: T;
  provider: string;
  model: string;
  attempts: FallbackAttempt[];
}): ModelFallbackRunResult<T> {
  const { result, provider, model, attempts } = params;
  return { result, provider, model, attempts };
}
/**
 * Runs one candidate and converts its outcome into a tagged result.
 *
 * @param params.run - Callback that performs the attempt.
 * @param params.provider - Provider passed to `run`.
 * @param params.model - Model passed to `run`.
 * @param params.options - Forwarded to `run` as a third argument only when
 *   truthy; otherwise `run` is called with two arguments.
 * @returns `{ ok: true, result }` on success, or `{ ok: false, error }` where
 *   `error` is the coerced failover error when coercion succeeds, else the
 *   original thrown value.
 * @throws The original error when `shouldRethrowAbort` accepts it and it
 *   could not be coerced into a failover error.
 */
async function runFallbackCandidate<T>(params: {
  run: ModelFallbackRunFn<T>;
  provider: string;
  model: string;
  options?: ModelFallbackRunOptions;
}): Promise<{ ok: true; result: T } | { ok: false; error: unknown }> {
  try {
    const result = params.options
      ? await params.run(params.provider, params.model, params.options)
      : await params.run(params.provider, params.model);
    return {
      ok: true,
      result,
    };
  } catch (err) {
    // Normalize abort-wrapped rate-limit errors (e.g. Google Vertex RESOURCE_EXHAUSTED)
    // so they become FailoverErrors and continue the fallback loop instead of aborting.
    const normalizedFailover = coerceToFailoverError(err, {
      provider: params.provider,
      model: params.model,
    });
    if (shouldRethrowAbort(err) && !normalizedFailover) {
      throw err;
    }
    return { ok: false, error: normalizedFailover ?? err };
  }
}
/**
 * Runs one candidate via `runFallbackCandidate` and shapes the outcome for
 * the fallback loop.
 *
 * @param params.attempts - Attempt records embedded unchanged in the success
 *   payload.
 * @param params.options - Forwarded to `runFallbackCandidate`.
 * @returns `{ success }` holding a `ModelFallbackRunResult` when the candidate
 *   succeeded, otherwise `{ error }` with the error it reported.
 * @throws Whatever `runFallbackCandidate` rethrows.
 */
async function runFallbackAttempt<T>(params: {
  run: ModelFallbackRunFn<T>;
  provider: string;
  model: string;
  attempts: FallbackAttempt[];
  options?: ModelFallbackRunOptions;
}): Promise<{ success: ModelFallbackRunResult<T> } | { error: unknown }> {
  const runResult = await runFallbackCandidate({
    run: params.run,
    provider: params.provider,
    model: params.model,
    options: params.options,
  });
  if (runResult.ok) {
    return {
      success: buildFallbackSuccess({
        result: runResult.result,
        provider: params.provider,
        model: params.model,
        attempts: params.attempts,
      }),
    };
  }
  return { error: runResult.error };
}
2026-02-19 23:16:26 -04:00
/** Returns true when both candidates have strictly equal `provider` and `model`. */
function sameModelCandidate(a: ModelCandidate, b: ModelCandidate): boolean {
  if (a.provider !== b.provider) {
    return false;
  }
  return a.model === b.model;
}
2026-02-19 00:10:08 +00:00
/**
 * Terminates a fallback run that produced no success. Never returns.
 *
 * When at most one attempt was recorded and a truthy `lastError` exists, that
 * error is rethrown untouched. Otherwise a new `Error` is thrown whose message
 * lists every attempt, joined with " | ".
 *
 * @param params.attempts - Recorded attempts, formatted into the summary.
 * @param params.candidates - Candidate list; its length is reported only when
 *   no attempts were recorded.
 * @param params.lastError - Most recent failure; attached as `cause` when it
 *   is an `Error` instance.
 * @param params.label - Noun inserted after "All" in the summary message.
 * @param params.formatAttempt - Renders a single attempt for the summary.
 * @throws Always.
 */
function throwFallbackFailureSummary(params: {
  attempts: FallbackAttempt[];
  candidates: ModelCandidate[];
  lastError: unknown;
  label: string;
  formatAttempt: (attempt: FallbackAttempt) => string;
}): never {
  const { attempts, candidates, lastError, label } = params;

  // A single failure is more useful verbatim than wrapped in a summary.
  if (attempts.length <= 1 && lastError) {
    throw lastError;
  }

  let summary = "unknown";
  if (attempts.length > 0) {
    summary = attempts.map(params.formatAttempt).join(" | ");
  }
  const failedCount = attempts.length || candidates.length;
  const cause = lastError instanceof Error ? lastError : undefined;
  throw new Error(`All ${label} failed (${failedCount}): ${summary}`, { cause });
}
2026-02-15 06:07:01 +00:00
/**
 * Builds the ordered candidate list for image-model runs.
 *
 * The first entry is the explicit `modelOverride` when it is non-blank,
 * otherwise the configured `agents.defaults.imageModel` primary (if any).
 * Configured image fallbacks follow in order. Refs that fail to resolve are
 * dropped silently.
 *
 * @param params.cfg - Loaded config, or `undefined` when none is available.
 * @param params.defaultProvider - Provider passed to ref resolution as the default.
 * @param params.modelOverride - Optional explicit model ref that replaces the
 *   configured primary.
 * @returns The candidate array owned by the collector.
 */
function resolveImageFallbackCandidates(params: {
  cfg: OpenClawConfig | undefined;
  defaultProvider: string;
  modelOverride?: string;
}): ModelCandidate[] {
  const aliasIndex = buildModelAliasIndex({
    cfg: params.cfg ?? {},
    defaultProvider: params.defaultProvider,
  });
  const allowlist = buildConfiguredAllowlistKeys({
    cfg: params.cfg,
    defaultProvider: params.defaultProvider,
  });
  const { candidates, addExplicitCandidate } = createModelCandidateCollector(allowlist);

  // Every caller below adds an explicitly configured ref, so the former
  // allowlist-filtered variant of this helper was unreachable and is gone.
  const addRaw = (raw: string): void => {
    const resolved = resolveModelRefFromString({
      raw: String(raw ?? ""),
      defaultProvider: params.defaultProvider,
      aliasIndex,
    });
    if (!resolved) {
      return;
    }
    addExplicitCandidate(resolved.ref);
  };

  if (params.modelOverride?.trim()) {
    addRaw(params.modelOverride);
  } else {
    const primary = resolveAgentModelPrimaryValue(params.cfg?.agents?.defaults?.imageModel);
    if (primary?.trim()) {
      addRaw(primary);
    }
  }

  const imageFallbacks = resolveAgentModelFallbackValues(params.cfg?.agents?.defaults?.imageModel);
  for (const raw of imageFallbacks) {
    // Explicitly configured image fallbacks should remain reachable even when a
    // model allowlist is present.
    addRaw(raw);
  }

  return candidates;
}
2026-01-04 17:50:55 +01:00
/**
 * Builds the ordered list of model candidates for a text-model run.
 *
 * Order of the returned list:
 *  1. the requested provider/model (blank values fall back to the configured
 *     primary, then to `DEFAULT_PROVIDER` / `DEFAULT_MODEL`), normalized;
 *  2. the fallback chain, either `fallbacksOverride` verbatim or the configured
 *     `agents.defaults.model` fallbacks;
 *  3. the configured primary, appended only when no override was supplied.
 *
 * When the requested provider differs from the configured primary's provider,
 * the configured fallbacks are used only if the requested model is itself one
 * of them; otherwise the chain is empty. Fallback refs that fail to resolve
 * are skipped.
 *
 * @param params.cfg - Loaded config, or `undefined` when none is available.
 * @param params.provider - Requested provider (may be blank).
 * @param params.model - Requested model (may be blank).
 * @returns The candidate array owned by the collector.
 */
function resolveFallbackCandidates(params: {
  cfg: OpenClawConfig | undefined;
  provider: string;
  model: string;
  /** Optional explicit fallbacks list; when provided (even empty), replaces agents.defaults.model.fallbacks. */
  fallbacksOverride?: string[];
}): ModelCandidate[] {
  const primary = params.cfg
    ? resolveConfiguredModelRef({
        cfg: params.cfg,
        defaultProvider: DEFAULT_PROVIDER,
        defaultModel: DEFAULT_MODEL,
      })
    : null;
  const defaultProvider = primary?.provider ?? DEFAULT_PROVIDER;
  const defaultModel = primary?.model ?? DEFAULT_MODEL;

  const providerRaw = String(params.provider ?? "").trim() || defaultProvider;
  const modelRaw = String(params.model ?? "").trim() || defaultModel;
  const normalizedPrimary = normalizeModelRef(providerRaw, modelRaw);
  const configuredPrimary = normalizeModelRef(defaultProvider, defaultModel);

  const aliasIndex = buildModelAliasIndex({ cfg: params.cfg ?? {}, defaultProvider });
  const allowlist = buildConfiguredAllowlistKeys({ cfg: params.cfg, defaultProvider });
  const { candidates, addExplicitCandidate } = createModelCandidateCollector(allowlist);

  // Shared resolution for every raw fallback string handled below.
  const resolveRaw = (raw: string) =>
    resolveModelRefFromString({
      raw: String(raw ?? ""),
      defaultProvider,
      aliasIndex,
    });

  addExplicitCandidate(normalizedPrimary);

  const hasOverride = params.fallbacksOverride !== undefined;
  let modelFallbacks: string[];
  if (params.fallbacksOverride !== undefined) {
    modelFallbacks = params.fallbacksOverride;
  } else {
    const configuredFallbacks = resolveAgentModelFallbackValues(params.cfg?.agents?.defaults?.model);
    if (normalizedPrimary.provider === configuredPrimary.provider) {
      // Same provider: always use full fallback chain (model version differences within provider).
      modelFallbacks = configuredFallbacks;
    } else {
      // When user runs a different provider than config, only use configured fallbacks
      // if the current model is already in that chain (e.g. session on first fallback).
      const isConfiguredFallback = configuredFallbacks.some((raw) => {
        const resolved = resolveRaw(raw);
        return resolved ? sameModelCandidate(resolved.ref, normalizedPrimary) : false;
      });
      modelFallbacks = isConfiguredFallback ? configuredFallbacks : [];
    }
  }

  for (const raw of modelFallbacks) {
    const resolved = resolveRaw(raw);
    if (!resolved) {
      continue;
    }
    // Fallbacks are explicit user intent; do not silently filter them by the
    // model allowlist.
    addExplicitCandidate(resolved.ref);
  }

  // Without an explicit override, the configured primary is the last resort.
  if (!hasOverride && primary?.provider && primary.model) {
    addExplicitCandidate({ provider: primary.provider, model: primary.model });
  }

  return candidates;
}
2026-02-16 10:03:35 -03:00
/** Timestamp of the most recent probe per throttle key, as recorded by `markProbeAttempt`. */
const lastProbeAttempt = new Map<string, number>();
/** Minimum spacing between two probes that share a throttle key (30 seconds). */
const MIN_PROBE_INTERVAL_MS = 30 * 1000;
/** Probing is allowed this long before the soonest cooldown expiry (2 minutes). */
const PROBE_MARGIN_MS = 2 * 60_000;
/** Separator placed between the agent-dir scope and the provider in a throttle key. */
const PROBE_SCOPE_DELIMITER = "::";
/** Throttle entries older than this are pruned (24 hours). */
const PROBE_STATE_TTL_MS = 24 * 60 * 60_000;
/** Upper bound on the number of throttle keys kept in `lastProbeAttempt`. */
const MAX_PROBE_KEYS = 256;
2026-02-16 08:07:14 -05:00
/**
 * Derives the key under which probe attempts for a provider are throttled.
 *
 * @param provider - Provider identifier.
 * @param agentDir - Optional agent directory; when non-blank after trimming it
 *   prefixes the key so each agent directory is throttled separately.
 * @returns `provider` alone, or `<agentDir>::<provider>` when scoped.
 */
function resolveProbeThrottleKey(provider: string, agentDir?: string): string {
  const scope = String(agentDir ?? "").trim();
  if (!scope) {
    return provider;
  }
  return scope + PROBE_SCOPE_DELIMITER + provider;
}
2026-03-10 00:58:51 +03:00
/**
 * Drops throttle entries that are invalid or expired.
 *
 * An entry is removed when its timestamp is not a finite number, is zero or
 * negative, or lies more than `PROBE_STATE_TTL_MS` before `now`.
 *
 * @param now - Reference time, in the same unit as the stored timestamps.
 */
function pruneProbeState(now: number): void {
  const shouldDrop = (ts: number): boolean =>
    !Number.isFinite(ts) || ts <= 0 || now - ts > PROBE_STATE_TTL_MS;

  lastProbeAttempt.forEach((ts, key) => {
    if (shouldDrop(ts)) {
      lastProbeAttempt.delete(key);
    }
  });
}
/**
 * Evicts entries from `lastProbeAttempt` until at most `MAX_PROBE_KEYS` remain.
 *
 * Each round removes the entry with the smallest timestamp; ties go to the
 * entry that comes first in the map's iteration order. The scan is seeded with
 * the first entry, so an eviction target always exists while the map is
 * non-empty. This covers an empty-string key (which a truthiness check would
 * mistake for "nothing found") and entries whose timestamp never compares
 * below `+Infinity`.
 */
function enforceProbeStateCap(): void {
  while (lastProbeAttempt.size > MAX_PROBE_KEYS) {
    let oldestKey: string | undefined;
    let oldestTs = Number.POSITIVE_INFINITY;
    for (const [key, ts] of lastProbeAttempt) {
      if (oldestKey === undefined || ts < oldestTs) {
        oldestKey = key;
        oldestTs = ts;
      }
    }
    if (oldestKey === undefined) {
      // Map is empty (only reachable with a negative cap); nothing to evict.
      break;
    }
    lastProbeAttempt.delete(oldestKey);
  }
}
/**
 * Reports whether enough time has passed since the last probe for this key.
 *
 * Prunes stale throttle state first. A key with no recorded probe is treated
 * as last probed at time 0.
 *
 * @param now - Current time, in the same unit as the stored timestamps.
 * @param throttleKey - Key produced by `resolveProbeThrottleKey`.
 * @returns True when at least `MIN_PROBE_INTERVAL_MS` has elapsed.
 */
function isProbeThrottleOpen(now: number, throttleKey: string): boolean {
  pruneProbeState(now);
  const previous = lastProbeAttempt.get(throttleKey);
  const elapsed = now - (previous ?? 0);
  return elapsed >= MIN_PROBE_INTERVAL_MS;
}
/**
 * Records that a probe for `throttleKey` happened at `now`.
 *
 * Stale entries are pruned before the write and the key cap is enforced after
 * it, in that order.
 *
 * @param now - Timestamp to store for the key.
 * @param throttleKey - Key produced by `resolveProbeThrottleKey`.
 */
function markProbeAttempt(now: number, throttleKey: string): void {
  // 1. Drop expired/invalid entries relative to the new timestamp.
  pruneProbeState(now);
  // 2. Record (or refresh) this key.
  lastProbeAttempt.set(throttleKey, now);
  // 3. Keep the map bounded.
  enforceProbeStateCap();
}
2026-02-16 08:07:14 -05:00
/**
 * Decides whether the primary candidate may be probed while every auth
 * profile of its provider is in cooldown.
 *
 * Probing requires all of: the candidate is the primary, at least one
 * fallback candidate exists, and the per-key probe throttle is open. Given
 * those, the probe is allowed when no finite cooldown expiry is known, or when
 * `now` is within `PROBE_MARGIN_MS` of (or past) the soonest expiry.
 *
 * @param params.isPrimary - Whether the candidate is the first in the chain.
 * @param params.hasFallbackCandidates - Whether other candidates exist.
 * @param params.now - Current time.
 * @param params.throttleKey - Key used for the probe throttle lookup.
 * @param params.authStore - Auth profile store queried for cooldown expiry.
 * @param params.profileIds - Profiles considered for the expiry lookup.
 */
function shouldProbePrimaryDuringCooldown(params: {
  isPrimary: boolean;
  hasFallbackCandidates: boolean;
  now: number;
  throttleKey: string;
  authStore: ReturnType<typeof ensureAuthProfileStore>;
  profileIds: string[];
}): boolean {
  const { now } = params;
  const eligible = params.isPrimary && params.hasFallbackCandidates;
  // The throttle is only consulted for eligible candidates.
  if (!eligible || !isProbeThrottleOpen(now, params.throttleKey)) {
    return false;
  }

  const soonest = getSoonestCooldownExpiry(params.authStore, params.profileIds);
  if (soonest !== null && Number.isFinite(soonest)) {
    // Probe when cooldown already expired or within the configured margin.
    return now >= soonest - PROBE_MARGIN_MS;
  }
  return true;
}
2026-02-16 10:03:35 -03:00
/**
 * Probe-throttle state, constants and helpers bundled for unit tests.
 *
 * `lastProbeAttempt` is the live module-level map (same reference, not a
 * copy), so mutating it through this object changes the throttle state seen
 * by the helpers listed here.
 *
 * @internal – exposed for unit tests only
 */
export const _probeThrottleInternals = {
lastProbeAttempt ,
MIN_PROBE_INTERVAL_MS ,
2026-02-16 08:07:14 -05:00
PROBE_MARGIN_MS ,
2026-03-10 00:58:51 +03:00
PROBE_STATE_TTL_MS ,
MAX_PROBE_KEYS ,
2026-02-16 08:07:14 -05:00
resolveProbeThrottleKey ,
2026-03-10 00:58:51 +03:00
isProbeThrottleOpen ,
pruneProbeState ,
markProbeAttempt ,
2026-02-16 10:03:35 -03:00
} as const ;
2026-02-25 19:35:40 -06:00
/**
 * Outcome of `resolveCooldownDecision` for a candidate whose provider has
 * every auth profile in cooldown. Discriminated on `type`.
 */
type CooldownDecision =
// Do not run this candidate; `error` is the message recorded for the attempt.
| {
type : "skip" ;
reason : FailoverReason ;
error : string ;
}
// Run this candidate despite the cooldown; `markProbe` tells the caller to
// record the attempt in the probe throttle before running.
| {
type : "attempt" ;
reason : FailoverReason ;
markProbe : boolean ;
} ;
/**
 * Decides whether a candidate should be attempted or skipped when every auth
 * profile of its provider is in cooldown.
 *
 * Rules, evaluated in order against the inferred unavailability reason
 * ("unknown" when none can be inferred):
 *  - `auth` / `auth_permanent`: always skip.
 *  - `billing`: attempt (and mark a probe) only for the primary, when either
 *    the regular primary probe is allowed or there are no fallback candidates
 *    and the probe throttle is open; otherwise skip.
 *  - anything else: the primary is attempted when `requestedModel` is false or
 *    the primary probe is allowed; a non-primary candidate is attempted only
 *    for `rate_limit`, `overloaded` or `unknown`.
 *
 * @param params.candidate - Candidate under consideration (used in messages).
 * @param params.isPrimary - Whether the candidate is first in the chain.
 * @param params.requestedModel - Whether the candidate equals the provider/model
 *   pair the caller asked for.
 * @param params.hasFallbackCandidates - Whether other candidates exist.
 * @param params.now - Current time.
 * @param params.probeThrottleKey - Throttle key for probe bookkeeping.
 * @param params.authStore - Auth profile store.
 * @param params.profileIds - Profiles of the candidate's provider.
 * @returns A skip decision with an error message, or an attempt decision whose
 *   `markProbe` flag says whether the probe throttle should be updated.
 */
function resolveCooldownDecision(params: {
  candidate: ModelCandidate;
  isPrimary: boolean;
  requestedModel: boolean;
  hasFallbackCandidates: boolean;
  now: number;
  probeThrottleKey: string;
  authStore: ReturnType<typeof ensureAuthProfileStore>;
  profileIds: string[];
}): CooldownDecision {
  const { candidate, isPrimary, hasFallbackCandidates, now, probeThrottleKey, authStore, profileIds } =
    params;

  const shouldProbe = shouldProbePrimaryDuringCooldown({
    isPrimary,
    hasFallbackCandidates,
    now,
    throttleKey: probeThrottleKey,
    authStore,
    profileIds,
  });
  const inferredReason =
    resolveProfilesUnavailableReason({ store: authStore, profileIds, now }) ?? "unknown";

  // Shared skip result for provider-wide problems (auth and billing).
  const skipForProviderIssue = (): CooldownDecision => ({
    type: "skip",
    reason: inferredReason,
    error: `Provider ${candidate.provider} has ${inferredReason} issue (skipping all models)`,
  });

  if (inferredReason === "auth" || inferredReason === "auth_permanent") {
    return skipForProviderIssue();
  }

  if (inferredReason === "billing") {
    // Billing is semi-persistent: the user may fix their balance, or a transient
    // 402 might have been misclassified. Probe single-provider setups on the
    // standard throttle so they can recover without a restart; when fallbacks
    // exist, only probe near cooldown expiry so the fallback chain stays preferred.
    const singleProviderProbe =
      isPrimary && !hasFallbackCandidates && isProbeThrottleOpen(now, probeThrottleKey);
    if (isPrimary && (shouldProbe || singleProviderProbe)) {
      return { type: "attempt", reason: inferredReason, markProbe: true };
    }
    return skipForProviderIssue();
  }

  // Primary: attempt when it is not the literally requested model, or when the
  // probe gate allows it.
  // Non-primary (same-provider fallbacks): only relax cooldown on transient
  // provider limits, which are often model-scoped and can recover on a sibling model.
  const isTransientReason =
    inferredReason === "rate_limit" ||
    inferredReason === "overloaded" ||
    inferredReason === "unknown";
  const attemptDespiteCooldown = isPrimary
    ? !params.requestedModel || shouldProbe
    : isTransientReason;

  if (attemptDespiteCooldown) {
    return {
      type: "attempt",
      reason: inferredReason,
      markProbe: isPrimary && shouldProbe,
    };
  }
  return {
    type: "skip",
    reason: inferredReason,
    error: `Provider ${candidate.provider} is in cooldown (all profiles unavailable)`,
  };
}
2026-01-04 17:50:55 +01:00
export async function runWithModelFallback < T > ( params : {
2026-01-30 03:15:10 +01:00
cfg : OpenClawConfig | undefined ;
2026-01-04 17:50:55 +01:00
provider : string ;
model : string ;
2026-03-10 01:12:10 +03:00
runId? : string ;
2026-01-26 22:05:31 -05:00
agentDir? : string ;
2026-01-09 14:59:02 +01:00
/** Optional explicit fallbacks list; when provided (even empty), replaces agents.defaults.model.fallbacks. */
fallbacksOverride? : string [ ] ;
2026-03-05 20:02:36 -08:00
run : ModelFallbackRunFn < T > ;
2026-02-15 06:07:01 +00:00
onError? : ModelFallbackErrorHandler ;
} ) : Promise < ModelFallbackRunResult < T > > {
2026-01-09 14:59:02 +01:00
const candidates = resolveFallbackCandidates ( {
cfg : params.cfg ,
provider : params.provider ,
model : params.model ,
fallbacksOverride : params.fallbacksOverride ,
} ) ;
2026-01-26 22:05:31 -05:00
const authStore = params . cfg
? ensureAuthProfileStore ( params . agentDir , { allowKeychainPrompt : false } )
: null ;
2026-01-04 17:50:55 +01:00
const attempts : FallbackAttempt [ ] = [ ] ;
let lastError : unknown ;
2026-03-10 07:26:47 -05:00
const cooldownProbeUsedProviders = new Set < string > ( ) ;
2026-01-04 17:50:55 +01:00
2026-02-16 10:03:35 -03:00
const hasFallbackCandidates = candidates . length > 1 ;
2026-01-04 17:50:55 +01:00
for ( let i = 0 ; i < candidates . length ; i += 1 ) {
2026-01-31 16:03:28 +09:00
const candidate = candidates [ i ] ;
2026-03-10 01:12:10 +03:00
const isPrimary = i === 0 ;
const requestedModel =
params . provider === candidate . provider && params . model === candidate . model ;
2026-03-05 20:02:36 -08:00
let runOptions : ModelFallbackRunOptions | undefined ;
2026-03-10 01:12:10 +03:00
let attemptedDuringCooldown = false ;
2026-03-10 07:26:47 -05:00
let transientProbeProviderForAttempt : string | null = null ;
2026-01-26 21:59:38 -05:00
if ( authStore ) {
const profileIds = resolveAuthProfileOrder ( {
cfg : params.cfg ,
store : authStore ,
provider : candidate.provider ,
} ) ;
const isAnyProfileAvailable = profileIds . some ( ( id ) = > ! isProfileInCooldown ( authStore , id ) ) ;
if ( profileIds . length > 0 && ! isAnyProfileAvailable ) {
2026-02-16 10:03:35 -03:00
// All profiles for this provider are in cooldown.
2026-02-16 08:07:14 -05:00
const now = Date . now ( ) ;
const probeThrottleKey = resolveProbeThrottleKey ( candidate . provider , params . agentDir ) ;
2026-02-25 19:35:40 -06:00
const decision = resolveCooldownDecision ( {
candidate ,
isPrimary ,
requestedModel ,
2026-02-16 08:07:14 -05:00
hasFallbackCandidates ,
now ,
2026-02-25 19:35:40 -06:00
probeThrottleKey ,
2026-02-16 08:07:14 -05:00
authStore ,
profileIds ,
} ) ;
2026-02-25 19:35:40 -06:00
if ( decision . type === "skip" ) {
2026-02-16 10:03:35 -03:00
attempts . push ( {
provider : candidate.provider ,
model : candidate.model ,
2026-02-25 19:35:40 -06:00
error : decision.error ,
reason : decision.reason ,
2026-02-16 10:03:35 -03:00
} ) ;
2026-03-10 01:12:10 +03:00
logModelFallbackDecision ( {
decision : "skip_candidate" ,
runId : params.runId ,
requestedProvider : params.provider ,
requestedModel : params.model ,
candidate ,
attempt : i + 1 ,
total : candidates.length ,
reason : decision.reason ,
error : decision.error ,
nextCandidate : candidates [ i + 1 ] ,
isPrimary ,
requestedModelMatched : requestedModel ,
fallbackConfigured : hasFallbackCandidates ,
profileCount : profileIds.length ,
} ) ;
2026-02-16 10:03:35 -03:00
continue ;
}
2026-02-25 19:35:40 -06:00
if ( decision . markProbe ) {
2026-03-10 00:58:51 +03:00
markProbeAttempt ( now , probeThrottleKey ) ;
2026-02-25 19:35:40 -06:00
}
2026-03-08 01:27:01 -06:00
if (
decision . reason === "rate_limit" ||
decision . reason === "overloaded" ||
2026-03-12 00:04:14 +05:30
decision . reason === "billing" ||
decision . reason === "unknown"
2026-03-08 01:27:01 -06:00
) {
2026-03-10 07:26:47 -05:00
// Probe at most once per provider per fallback run when all profiles
// are cooldowned. Re-probing every same-provider candidate can stall
// cross-provider fallback on providers with long internal retries.
const isTransientCooldownReason =
2026-03-12 00:04:14 +05:30
decision . reason === "rate_limit" ||
decision . reason === "overloaded" ||
decision . reason === "unknown" ;
2026-03-10 07:26:47 -05:00
if ( isTransientCooldownReason && cooldownProbeUsedProviders . has ( candidate . provider ) ) {
const error = ` Provider ${ candidate . provider } is in cooldown (probe already attempted this run) ` ;
attempts . push ( {
provider : candidate.provider ,
model : candidate.model ,
error ,
reason : decision.reason ,
} ) ;
logModelFallbackDecision ( {
decision : "skip_candidate" ,
runId : params.runId ,
requestedProvider : params.provider ,
requestedModel : params.model ,
candidate ,
attempt : i + 1 ,
total : candidates.length ,
reason : decision.reason ,
error ,
nextCandidate : candidates [ i + 1 ] ,
isPrimary ,
requestedModelMatched : requestedModel ,
fallbackConfigured : hasFallbackCandidates ,
profileCount : profileIds.length ,
} ) ;
continue ;
}
2026-03-07 01:42:11 +03:00
runOptions = { allowTransientCooldownProbe : true } ;
2026-03-10 07:26:47 -05:00
if ( isTransientCooldownReason ) {
transientProbeProviderForAttempt = candidate . provider ;
}
2026-03-05 20:02:36 -08:00
}
2026-03-10 01:12:10 +03:00
attemptedDuringCooldown = true ;
logModelFallbackDecision ( {
decision : "probe_cooldown_candidate" ,
runId : params.runId ,
requestedProvider : params.provider ,
requestedModel : params.model ,
candidate ,
attempt : i + 1 ,
total : candidates.length ,
reason : decision.reason ,
nextCandidate : candidates [ i + 1 ] ,
isPrimary ,
requestedModelMatched : requestedModel ,
fallbackConfigured : hasFallbackCandidates ,
allowTransientCooldownProbe : runOptions?.allowTransientCooldownProbe ,
profileCount : profileIds.length ,
} ) ;
2026-01-26 21:59:38 -05:00
}
}
2026-02-25 19:35:40 -06:00
2026-03-05 20:02:36 -08:00
const attemptRun = await runFallbackAttempt ( {
run : params.run ,
. . . candidate ,
attempts ,
options : runOptions ,
} ) ;
2026-03-02 21:30:12 +00:00
if ( "success" in attemptRun ) {
2026-03-10 01:12:10 +03:00
if ( i > 0 || attempts . length > 0 || attemptedDuringCooldown ) {
logModelFallbackDecision ( {
decision : "candidate_succeeded" ,
runId : params.runId ,
requestedProvider : params.provider ,
requestedModel : params.model ,
candidate ,
attempt : i + 1 ,
total : candidates.length ,
previousAttempts : attempts ,
isPrimary ,
requestedModelMatched : requestedModel ,
fallbackConfigured : hasFallbackCandidates ,
} ) ;
}
2026-03-07 22:50:27 +00:00
const notFoundAttempt =
i > 0 ? attempts . find ( ( a ) = > a . reason === "model_not_found" ) : undefined ;
if ( notFoundAttempt ) {
log . warn (
` Model " ${ sanitizeForLog ( notFoundAttempt . provider ) } / ${ sanitizeForLog ( notFoundAttempt . model ) } " not found. Fell back to " ${ sanitizeForLog ( candidate . provider ) } / ${ sanitizeForLog ( candidate . model ) } ". ` ,
) ;
}
2026-03-02 21:30:12 +00:00
return attemptRun . success ;
}
const err = attemptRun . error ;
{
2026-03-10 07:26:47 -05:00
if ( transientProbeProviderForAttempt ) {
const probeFailureReason = describeFailoverError ( err ) . reason ;
const shouldPreserveTransientProbeSlot =
probeFailureReason === "model_not_found" ||
probeFailureReason === "format" ||
probeFailureReason === "auth" ||
probeFailureReason === "auth_permanent" ||
probeFailureReason === "session_expired" ;
if ( ! shouldPreserveTransientProbeSlot ) {
cooldownProbeUsedProviders . add ( transientProbeProviderForAttempt ) ;
}
}
Agents: add nested subagent orchestration controls and reduce subagent token waste (#14447)
* Agents: add subagent orchestration controls
* Agents: add subagent orchestration controls (WIP uncommitted changes)
* feat(subagents): add depth-based spawn gating for sub-sub-agents
* feat(subagents): tool policy, registry, and announce chain for nested agents
* feat(subagents): system prompt, docs, changelog for nested sub-agents
* fix(subagents): prevent model fallback override, show model during active runs, and block context overflow fallback
Bug 1: When a session has an explicit model override (e.g., gpt/openai-codex),
the fallback candidate logic in resolveFallbackCandidates silently appended the
global primary model (opus) as a backstop. On reinjection/steer with a transient
error, the session could fall back to opus which has a smaller context window
and crash. Fix: when storedModelOverride is set, pass fallbacksOverride ?? []
instead of undefined, preventing the implicit primary backstop.
Bug 2: Active subagents showed 'model n/a' in /subagents list because
resolveModelDisplay only read entry.model/modelProvider (populated after run
completes). Fix: fall back to modelOverride/providerOverride fields which are
populated at spawn time via sessions.patch.
Bug 3: Context overflow errors (prompt too long, context_length_exceeded) could
theoretically escape runEmbeddedPiAgent and be treated as failover candidates
in runWithModelFallback, causing a switch to a model with a smaller context
window. Fix: in runWithModelFallback, detect context overflow errors via
isLikelyContextOverflowError and rethrow them immediately instead of trying the
next model candidate.
* fix(subagents): track spawn depth in session store and fix announce routing for nested agents
* Fix compaction status tracking and dedupe overflow compaction triggers
* fix(subagents): enforce depth block via session store and implement cascade kill
* fix: inject group chat context into system prompt
* fix(subagents): always write model to session store at spawn time
* Preserve spawnDepth when agent handler rewrites session entry
* fix(subagents): suppress announce on steer-restart
* fix(subagents): fallback spawned session model to runtime default
* fix(subagents): enforce spawn depth when caller key resolves by sessionId
* feat(subagents): implement active-first ordering for numeric targets and enhance task display
- Added a test to verify that subagents with numeric targets follow an active-first list ordering.
- Updated `resolveSubagentTarget` to sort subagent runs based on active status and recent activity.
- Enhanced task display in command responses to prevent truncation of long task descriptions.
- Introduced new utility functions for compacting task text and managing subagent run states.
* fix(subagents): show model for active runs via run record fallback
When the spawned model matches the agent's default model, the session
store's override fields are intentionally cleared (isDefault: true).
The model/modelProvider fields are only populated after the run
completes. This left active subagents showing 'model n/a'.
Fix: store the resolved model on SubagentRunRecord at registration
time, and use it as a fallback in both display paths (subagents tool
and /subagents command) when the session store entry has no model info.
Changes:
- SubagentRunRecord: add optional model field
- registerSubagentRun: accept and persist model param
- sessions-spawn-tool: pass resolvedModel to registerSubagentRun
- subagents-tool: pass run record model as fallback to resolveModelDisplay
- commands-subagents: pass run record model as fallback to resolveModelDisplay
* feat(chat): implement session key resolution and reset on sidebar navigation
- Added functions to resolve the main session key and reset chat state when switching sessions from the sidebar.
- Updated the `renderTab` function to handle session key changes when navigating to the chat tab.
- Introduced a test to verify that the session resets to "main" when opening chat from the sidebar navigation.
* fix: subagent timeout=0 passthrough and fallback prompt duplication
Bug 1: runTimeoutSeconds=0 now means 'no timeout' instead of applying 600s default
- sessions-spawn-tool: default to undefined (not 0) when neither timeout param
is provided; use != null check so explicit 0 passes through to gateway
- agent.ts: accept 0 as valid timeout (resolveAgentTimeoutMs already handles
0 → MAX_SAFE_TIMEOUT_MS)
Bug 2: model fallback no longer re-injects the original prompt as a duplicate
- agent.ts: track fallback attempt index; on retries use a short continuation
message instead of the full original prompt since the session file already
contains it from the first attempt
- Also skip re-sending images on fallback retries (already in session)
* feat(subagents): truncate long task descriptions in subagents command output
- Introduced a new utility function to format task previews, limiting their length to improve readability.
- Updated the command handler to use the new formatting function, ensuring task descriptions are truncated appropriately.
- Adjusted related tests to verify that long task descriptions are now truncated in the output.
* refactor(subagents): update subagent registry path resolution and improve command output formatting
- Replaced direct import of STATE_DIR with a utility function to resolve the state directory dynamically.
- Enhanced the formatting of command output for active and recent subagents, adding separators for better readability.
- Updated related tests to reflect changes in command output structure.
* fix(subagent): default sessions_spawn to no timeout when runTimeoutSeconds omitted
The previous fix (75a791106) correctly handled the case where
runTimeoutSeconds was explicitly set to 0 ("no timeout"). However,
when models omit the parameter entirely (which is common since the
schema marks it as optional), runTimeoutSeconds resolved to undefined.
undefined flowed through the chain as:
sessions_spawn → timeout: undefined (since undefined != null is false)
→ gateway agent handler → agentCommand opts.timeout: undefined
→ resolveAgentTimeoutMs({ overrideSeconds: undefined })
→ DEFAULT_AGENT_TIMEOUT_SECONDS (600s = 10 minutes)
This caused subagents to be killed at exactly 10 minutes even though
the user's intent (via TOOLS.md) was for subagents to run without a
timeout.
Fix: default runTimeoutSeconds to 0 (no timeout) when neither
runTimeoutSeconds nor timeoutSeconds is provided by the caller.
Subagent spawns are long-running by design and should not inherit the
600s agent-command default timeout.
* fix(subagent): accept timeout=0 in agent-via-gateway path (second 600s default)
* fix: thread timeout override through getReplyFromConfig dispatch path
getReplyFromConfig called resolveAgentTimeoutMs({ cfg }) with no override,
always falling back to the config default (600s). Add timeoutOverrideSeconds
to GetReplyOptions and pass it through as overrideSeconds so callers of the
dispatch chain can specify a custom timeout (0 = no timeout).
This complements the existing timeout threading in agentCommand and the
cron isolated-agent runner, which already pass overrideSeconds correctly.
* feat(model-fallback): normalize OpenAI Codex model references and enhance fallback handling
- Added normalization for OpenAI Codex model references, specifically converting "gpt-5.3-codex" to "openai-codex" before execution.
- Updated the `resolveFallbackCandidates` function to utilize the new normalization logic.
- Enhanced tests to verify the correct behavior of model normalization and fallback mechanisms.
- Introduced a new test case to ensure that the normalization process works as expected for various input formats.
* feat(tests): add unit tests for steer failure behavior in openclaw-tools
- Introduced a new test file to validate the behavior of subagents when steer replacement dispatch fails.
- Implemented tests to ensure that the announce behavior is restored correctly and that the suppression reason is cleared as expected.
- Enhanced the subagent registry with a new function to clear steer restart suppression.
- Updated related components to support the new test scenarios.
* fix(subagents): replace stop command with kill in slash commands and documentation
- Updated the `/subagents` command to replace `stop` with `kill` for consistency in controlling sub-agent runs.
- Modified related documentation to reflect the change in command usage.
- Removed legacy timeoutSeconds references from the sessions-spawn-tool schema and tests to streamline timeout handling.
- Enhanced tests to ensure correct behavior of the updated commands and their interactions.
* feat(tests): add unit tests for readLatestAssistantReply function
- Introduced a new test file for the `readLatestAssistantReply` function to validate its behavior with various message scenarios.
- Implemented tests to ensure the function correctly retrieves the latest assistant message and handles cases where the latest message has no text.
- Mocked the gateway call to simulate different message histories for comprehensive testing.
* feat(tests): enhance subagent kill-all cascade tests and announce formatting
- Added a new test to verify that the `kill-all` command cascades through ended parents to active descendants in subagents.
- Updated the subagent announce formatting tests to reflect changes in message structure, including the replacement of "Findings:" with "Result:" and the addition of new expectations for message content.
- Improved the handling of long findings and stats in the announce formatting logic to ensure concise output.
- Refactored related functions to enhance clarity and maintainability in the subagent registry and tools.
* refactor(subagent): update announce formatting and remove unused constants
- Modified the subagent announce formatting to replace "Findings:" with "Result:" and adjusted related expectations in tests.
- Removed constants for maximum announce findings characters and summary words, simplifying the announcement logic.
- Updated the handling of findings to retain full content instead of truncating, ensuring more informative outputs.
- Cleaned up unused imports in the commands-subagents file to enhance code clarity.
* feat(tests): enhance billing error handling in user-facing text
- Added tests to ensure that normal text mentioning billing plans is not rewritten, preserving user context.
- Updated the `isBillingErrorMessage` and `sanitizeUserFacingText` functions to improve handling of billing-related messages.
- Introduced new test cases for various scenarios involving billing messages to ensure accurate processing and output.
- Enhanced the subagent announce flow to correctly manage active descendant runs, preventing premature announcements.
* feat(subagent): enhance workflow guidance and auto-announcement clarity
- Added a new guideline in the subagent system prompt to emphasize trust in push-based completion, discouraging busy polling for status updates.
- Updated documentation to clarify that sub-agents will automatically announce their results, improving user understanding of the workflow.
- Enhanced tests to verify the new guidance on avoiding polling loops and to ensure the accuracy of the updated prompts.
* fix(cron): avoid announcing interim subagent spawn acks
* chore: clean post-rebase imports
* fix(cron): fall back to child replies when parent stays interim
* fix(subagents): make active-run guidance advisory
* fix(subagents): update announce flow to handle active descendants and enhance test coverage
- Modified the announce flow to defer announcements when active descendant runs are present, ensuring accurate status reporting.
- Updated tests to verify the new behavior, including scenarios where no fallback requester is available and ensuring proper handling of finished subagents.
- Enhanced the announce formatting to include an `expectFinal` flag for better clarity in the announcement process.
* fix(subagents): enhance announce flow and formatting for user updates
- Updated the announce flow to provide clearer instructions for user updates based on active subagent runs and requester context.
- Refactored the announcement logic to improve clarity and ensure internal context remains private.
- Enhanced tests to verify the new message expectations and formatting, including updated prompts for user-facing updates.
- Introduced a new function to build reply instructions based on session context, improving the overall announcement process.
* fix: resolve prep blockers and changelog placement (#14447) (thanks @tyler6204)
* fix: restore cron delivery-plan import after rebase (#14447) (thanks @tyler6204)
* fix: resolve test failures from rebase conflicts (#14447) (thanks @tyler6204)
* fix: apply formatting after rebase (#14447) (thanks @tyler6204)
2026-02-14 22:03:45 -08:00
// Context overflow errors should be handled by the inner runner's
// compaction/retry logic, not by model fallback. If one escapes as a
// throw, rethrow it immediately rather than trying a different model
// that may have a smaller context window and fail worse.
const errMessage = err instanceof Error ? err.message : String ( err ) ;
if ( isLikelyContextOverflowError ( errMessage ) ) {
throw err ;
}
2026-01-09 22:15:03 +01:00
const normalized =
coerceToFailoverError ( err , {
provider : candidate.provider ,
model : candidate.model ,
} ) ? ? err ;
2026-02-25 12:53:26 +08:00
// Even unrecognized errors should not abort the fallback loop when
// there are remaining candidates. Only abort/context-overflow errors
// (handled above) are truly non-retryable.
const isKnownFailover = isFailoverError ( normalized ) ;
if ( ! isKnownFailover && i === candidates . length - 1 ) {
2026-01-31 16:19:20 +09:00
throw err ;
}
2026-01-09 22:15:03 +01:00
2026-02-25 12:53:26 +08:00
lastError = isKnownFailover ? normalized : err ;
2026-01-09 22:15:03 +01:00
const described = describeFailoverError ( normalized ) ;
2026-01-04 17:50:55 +01:00
attempts . push ( {
provider : candidate.provider ,
model : candidate.model ,
2026-01-09 21:57:52 +01:00
error : described.message ,
2026-02-25 12:53:26 +08:00
reason : described.reason ? ? "unknown" ,
2026-01-09 21:57:52 +01:00
status : described.status ,
code : described.code ,
2026-01-04 17:50:55 +01:00
} ) ;
2026-03-10 01:12:10 +03:00
logModelFallbackDecision ( {
decision : "candidate_failed" ,
runId : params.runId ,
requestedProvider : params.provider ,
requestedModel : params.model ,
candidate ,
attempt : i + 1 ,
total : candidates.length ,
reason : described.reason ,
status : described.status ,
code : described.code ,
error : described.message ,
nextCandidate : candidates [ i + 1 ] ,
isPrimary ,
requestedModelMatched : requestedModel ,
fallbackConfigured : hasFallbackCandidates ,
} ) ;
2026-01-04 17:50:55 +01:00
await params . onError ? . ( {
provider : candidate.provider ,
model : candidate.model ,
2026-02-25 12:53:26 +08:00
error : isKnownFailover ? normalized : err ,
2026-01-04 17:50:55 +01:00
attempt : i + 1 ,
total : candidates.length ,
} ) ;
}
}
2026-02-19 00:10:08 +00:00
throwFallbackFailureSummary ( {
attempts ,
candidates ,
lastError ,
label : "models" ,
formatAttempt : ( attempt ) = >
` ${ attempt . provider } / ${ attempt . model } : ${ attempt . error } ${
attempt . reason ? ` ( ${ attempt . reason } ) ` : ""
} ` ,
2026-01-14 14:31:43 +00:00
} ) ;
2026-01-04 17:50:55 +01:00
}
2026-01-04 19:35:00 +01:00
/**
 * Runs `params.run` against each resolved image-model candidate, in order, and
 * returns the payload of the first attempt that succeeds.
 *
 * Candidates are obtained from `resolveImageFallbackCandidates`, with
 * `DEFAULT_PROVIDER` as the default provider and `params.modelOverride`
 * forwarded unchanged. Each failed attempt is recorded (provider, model and
 * error message) and then reported to `params.onError`, which is awaited
 * before the next candidate is tried.
 *
 * @param params.cfg - Configuration passed to candidate resolution; may be `undefined`.
 * @param params.modelOverride - Optional model reference passed to candidate resolution.
 * @param params.run - Callback executed for a candidate via `runFallbackAttempt`.
 * @param params.onError - Optional hook invoked after every failed attempt with the
 *   failing provider/model, the raw error, the 1-based attempt number and the
 *   total number of candidates.
 * @returns The `success` value of the first successful attempt.
 * @throws Error if no image model candidate is resolved. When every candidate
 *   fails, control reaches `throwFallbackFailureSummary`, which is presumably
 *   `never`-returning (defined outside this block — confirm).
 */
export async function runWithImageModelFallback<T>(params: {
  cfg: OpenClawConfig | undefined;
  modelOverride?: string;
  run: (provider: string, model: string) => Promise<T>;
  onError?: ModelFallbackErrorHandler;
}): Promise<ModelFallbackRunResult<T>> {
  const candidates = resolveImageFallbackCandidates({
    cfg: params.cfg,
    defaultProvider: DEFAULT_PROVIDER,
    modelOverride: params.modelOverride,
  });
  if (candidates.length === 0) {
    throw new Error(
      "No image model configured. Set agents.defaults.imageModel.primary or agents.defaults.imageModel.fallbacks.",
    );
  }

  const attempts: FallbackAttempt[] = [];
  let lastError: unknown;

  for (let i = 0; i < candidates.length; i += 1) {
    const candidate = candidates[i];
    const attemptRun = await runFallbackAttempt({ run: params.run, ...candidate, attempts });
    if ("success" in attemptRun) {
      return attemptRun.success;
    }

    // Unlike the text-model path, image fallback keeps the raw error: it is
    // not coerced into a failover error, so every failure moves on to the
    // next candidate.
    const err = attemptRun.error;
    lastError = err;
    attempts.push({
      provider: candidate.provider,
      model: candidate.model,
      error: err instanceof Error ? err.message : String(err),
    });
    await params.onError?.({
      provider: candidate.provider,
      model: candidate.model,
      error: err,
      attempt: i + 1,
      total: candidates.length,
    });
  }

  // All candidates failed: hand the collected attempts to the shared summary helper.
  throwFallbackFailureSummary({
    attempts,
    candidates,
    lastError,
    label: "image models",
    formatAttempt: (attempt) => `${attempt.provider}/${attempt.model}: ${attempt.error}`,
  });
}