diff --git a/src/agents/subagent-announce.ts b/src/agents/subagent-announce.ts index ab2fbb1140e..f1b480d5f16 100644 --- a/src/agents/subagent-announce.ts +++ b/src/agents/subagent-announce.ts @@ -63,9 +63,70 @@ function loadSubagentRegistryRuntime() { return subagentRegistryRuntimePromise; } -const DIRECT_ANNOUNCE_TRANSIENT_RETRY_DELAYS_MS = FAST_TEST_MODE - ? ([8, 16, 32] as const) - : ([5_000, 10_000, 20_000] as const); +// Fallback retry settings for announce delivery, used when config omits a field or supplies an invalid value +const DEFAULT_ANNOUNCE_RETRY_ATTEMPTS = 3; +const DEFAULT_ANNOUNCE_RETRY_MIN_DELAY_MS = 5_000; +const DEFAULT_ANNOUNCE_RETRY_MAX_DELAY_MS = 30_000; +const DEFAULT_ANNOUNCE_RETRY_JITTER = 0.1; + +type AnnounceRetryConfig = { + attempts: number; + minDelayMs: number; + maxDelayMs: number; + jitter: number; +}; + +function resolveAnnounceRetryConfig(cfg: ReturnType): AnnounceRetryConfig { + const retry = cfg.agents?.defaults?.subagents?.announceRetry; + // minDelayMs: accept only a finite number greater than 0 (floored), else the default; then clamp to the timer-safe maximum + const rawMinDelayMs = + typeof retry?.minDelayMs === "number" && + Number.isFinite(retry.minDelayMs) && + retry.minDelayMs > 0 + ? Math.floor(retry.minDelayMs) + : DEFAULT_ANNOUNCE_RETRY_MIN_DELAY_MS; + const minDelayMs = Math.min(rawMinDelayMs, MAX_TIMER_SAFE_TIMEOUT_MS); + const rawMaxDelayMs = + typeof retry?.maxDelayMs === "number" && + Number.isFinite(retry.maxDelayMs) && + retry.maxDelayMs > 0 + ? Math.floor(retry.maxDelayMs) + : DEFAULT_ANNOUNCE_RETRY_MAX_DELAY_MS; + // Ensure maxDelayMs is at least minDelayMs and does not exceed timer-safe maximum + const maxDelayMs = Math.min(Math.max(minDelayMs, rawMaxDelayMs), MAX_TIMER_SAFE_TIMEOUT_MS); + return { + // attempts is the number of retries after the initial call (the retry loop uses it as maxRetries); floored and capped at 10 + attempts: + typeof retry?.attempts === "number" && Number.isFinite(retry.attempts) && retry.attempts > 0 + ? 
Math.min(Math.floor(retry.attempts), 10) + : DEFAULT_ANNOUNCE_RETRY_ATTEMPTS, + minDelayMs, + maxDelayMs, + jitter: + typeof retry?.jitter === "number" && + Number.isFinite(retry.jitter) && + retry.jitter >= 0 && + retry.jitter <= 1 + ? retry.jitter + : DEFAULT_ANNOUNCE_RETRY_JITTER, + }; +} + +function computeRetryDelayMs(retryIndex: number, config: AnnounceRetryConfig): number { + if (FAST_TEST_MODE) { + // Fast test mode: short fixed delays in ms; retry indexes past the table reuse 32 + const fastDelays = [8, 16, 32]; + return fastDelays[retryIndex] ?? 32; + } + // Exponential backoff: minDelayMs * 2^retryIndex capped at maxDelayMs, then a random offset of up to (capped delay * jitter) in either direction + const baseDelay = config.minDelayMs * Math.pow(2, retryIndex); + const cappedDelay = Math.min(baseDelay, config.maxDelayMs); + const jitterRange = cappedDelay * config.jitter; + const jitterOffset = (Math.random() * 2 - 1) * jitterRange; + const delayWithJitter = Math.floor(cappedDelay + jitterOffset); + // Ensure final delay respects both min and max bounds after jitter + return Math.max(config.minDelayMs, Math.min(config.maxDelayMs, delayWithJitter)); +} type ToolResultMessage = { role?: unknown; @@ -171,7 +232,17 @@ async function runAnnounceDeliveryWithRetry(params: { operation: string; signal?: AbortSignal; run: () => Promise; + retryConfig?: AnnounceRetryConfig; }): Promise { + const config = params.retryConfig ?? 
{ + attempts: DEFAULT_ANNOUNCE_RETRY_ATTEMPTS, + minDelayMs: DEFAULT_ANNOUNCE_RETRY_MIN_DELAY_MS, + maxDelayMs: DEFAULT_ANNOUNCE_RETRY_MAX_DELAY_MS, + jitter: DEFAULT_ANNOUNCE_RETRY_JITTER, + }; + // attempts represents the number of retries (not total attempts) + // This matches the previous behavior where 3 delays = 3 retries + const maxRetries = config.attempts; let retryIndex = 0; for (;;) { if (params.signal?.aborted) { @@ -180,14 +251,14 @@ async function runAnnounceDeliveryWithRetry(params: { try { return await params.run(); } catch (err) { - const delayMs = DIRECT_ANNOUNCE_TRANSIENT_RETRY_DELAYS_MS[retryIndex]; - if (delayMs == null || !isTransientAnnounceDeliveryError(err) || params.signal?.aborted) { + const hasNextRetry = retryIndex < maxRetries; + if (!hasNextRetry || !isTransientAnnounceDeliveryError(err) || params.signal?.aborted) { throw err; } - const nextAttempt = retryIndex + 2; - const maxAttempts = DIRECT_ANNOUNCE_TRANSIENT_RETRY_DELAYS_MS.length + 1; + const delayMs = computeRetryDelayMs(retryIndex, config); + const nextAttempt = retryIndex + 1; defaultRuntime.log( - `[warn] Subagent announce ${params.operation} transient failure, retrying ${nextAttempt}/${maxAttempts} in ${Math.round(delayMs / 1000)}s: ${summarizeDeliveryError(err)}`, + `[warn] Subagent announce ${params.operation} transient failure, retrying ${nextAttempt}/${maxRetries} in ${Math.round(delayMs / 1000)}s: ${summarizeDeliveryError(err)}`, ); retryIndex += 1; await waitForAnnounceRetryDelay(delayMs, params.signal); @@ -893,6 +964,7 @@ async function sendSubagentAnnounceDirectly(params: { ? 
"completion direct announce agent call" : "direct announce agent call", signal: params.signal, + retryConfig: resolveAnnounceRetryConfig(cfg), run: async () => await callGateway({ method: "agent", @@ -1201,6 +1273,7 @@ async function wakeSubagentRunAfterDescendants(params: { const wakeResponse = await runAnnounceDeliveryWithRetry<{ runId?: string }>({ operation: "descendant wake agent call", signal: params.signal, + retryConfig: resolveAnnounceRetryConfig(cfg), run: async () => await callGateway({ method: "agent", diff --git a/src/config/types.agent-defaults.ts b/src/config/types.agent-defaults.ts index ecaaecb69b9..67dc1a39c56 100644 --- a/src/config/types.agent-defaults.ts +++ b/src/config/types.agent-defaults.ts @@ -290,6 +290,17 @@ export type AgentDefaultsConfig = { runTimeoutSeconds?: number; /** Gateway timeout in ms for sub-agent announce delivery calls (default: 90000). */ announceTimeoutMs?: number; + /** Retry configuration for sub-agent announce delivery on transient failures. */ + announceRetry?: { + /** Number of retry attempts after initial failure (default: 3, max: 10). */ + attempts?: number; + /** Minimum retry delay in ms (default: 5000). */ + minDelayMs?: number; + /** Maximum retry delay cap in ms with exponential backoff (default: 30000). */ + maxDelayMs?: number; + /** Jitter factor (0-1) applied to delays to avoid thundering herd (default: 0.1). */ + jitter?: number; + }; }; /** Optional sandbox settings for non-main sessions. 
*/ sandbox?: AgentSandboxConfig; diff --git a/src/config/zod-schema.agent-defaults.ts b/src/config/zod-schema.agent-defaults.ts index 836a1fdae91..8caf73e0ee8 100644 --- a/src/config/zod-schema.agent-defaults.ts +++ b/src/config/zod-schema.agent-defaults.ts @@ -190,6 +190,48 @@ export const AgentDefaultsSchema = z thinking: z.string().optional(), runTimeoutSeconds: z.number().int().min(0).optional(), announceTimeoutMs: z.number().int().positive().optional(), + announceRetry: z + .object({ + attempts: z + .number() + .int() + .min(1) + .max(10) + .optional() + .describe("Number of retry attempts after initial failure (default: 3, max: 10)."), + minDelayMs: z + .number() + .int() + .positive() + .optional() + .describe("Minimum retry delay in ms (default: 5000)."), + maxDelayMs: z + .number() + .int() + .positive() + .optional() + .describe("Maximum retry delay cap in ms (default: 30000)."), + jitter: z + .number() + .min(0) + .max(1) + .optional() + .describe("Jitter factor (0-1) applied to delays (default: 0.1)."), + }) + .strict() + .refine( + (data) => { + if (data.minDelayMs != null && data.maxDelayMs != null) { + return data.maxDelayMs >= data.minDelayMs; + } + return true; + }, + { + message: "maxDelayMs must be greater than or equal to minDelayMs", + }, + ) + .optional() + .describe("Retry configuration for sub-agent announce delivery on transient failures."), }) .strict() .optional(),