From 7a5f2b128c777e28541c6570f8eb1042df4a2f7e Mon Sep 17 00:00:00 2001
From: Joey Krug
Date: Sun, 15 Mar 2026 21:25:21 -0400
Subject: [PATCH] fix: add retry with exponential backoff for orphan recovery
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses Codex review feedback — if recovery fails (e.g. gateway still
booting), retries up to 3 times with exponential backoff (5s → 10s → 20s)
before giving up. A warning is also logged when the retries run out while
sessions are still failing, so giving up is never silent.
---
 src/agents/subagent-orphan-recovery.ts | 59 +++++++++++++++++++++++---
 1 file changed, 52 insertions(+), 7 deletions(-)

diff --git a/src/agents/subagent-orphan-recovery.ts b/src/agents/subagent-orphan-recovery.ts
index f36096d95b2..02dfa1528be 100644
--- a/src/agents/subagent-orphan-recovery.ts
+++ b/src/agents/subagent-orphan-recovery.ts
@@ -174,18 +174,63 @@ export async function recoverOrphanedSubagentSessions(params: {
   return result;
 }
 
+/** Maximum number of retry attempts for orphan recovery. */
+const MAX_RECOVERY_RETRIES = 3;
+/** Backoff multiplier between retries (exponential). */
+const RETRY_BACKOFF_MULTIPLIER = 2;
+
 /**
- * Schedule orphan recovery after a delay.
+ * Schedule orphan recovery after a delay, with retry logic.
  * The delay gives the gateway time to fully bootstrap after restart.
+ * If recovery rejects or reports failed sessions (e.g. gateway not yet ready),
+ * it is retried with exponential backoff: every retry waits
+ * RETRY_BACKOFF_MULTIPLIER times longer than the attempt before it.
+ *
+ * @param params - `delayMs` is the wait before the first attempt (defaults to
+ *   DEFAULT_RECOVERY_DELAY_MS); `maxRetries` is the number of retries allowed
+ *   after that first attempt (defaults to MAX_RECOVERY_RETRIES).
  */
 export function scheduleOrphanRecovery(params: {
   getActiveRuns: () => Map;
   delayMs?: number;
+  maxRetries?: number;
 }): void {
-  const delay = params.delayMs ?? DEFAULT_RECOVERY_DELAY_MS;
-  setTimeout(() => {
-    void recoverOrphanedSubagentSessions(params).catch((err) => {
-      log.warn(`scheduled orphan recovery failed: ${String(err)}`);
-    });
-  }, delay).unref?.();
+  const initialDelay = params.delayMs ?? DEFAULT_RECOVERY_DELAY_MS;
+  const maxRetries = params.maxRetries ?? MAX_RECOVERY_RETRIES;
+
+  const attemptRecovery = (attempt: number, delay: number): void => {
+    setTimeout(() => {
+      const canRetry = attempt < maxRetries;
+      const nextDelay = delay * RETRY_BACKOFF_MULTIPLIER;
+      const retryNote = `retrying in ${nextDelay}ms (attempt ${attempt + 1}/${maxRetries})`;
+
+      void recoverOrphanedSubagentSessions(params)
+        .then((result) => {
+          if (result.failed > 0) {
+            if (canRetry) {
+              log.info(`orphan recovery had ${result.failed} failure(s); ${retryNote}`);
+              attemptRecovery(attempt + 1, nextDelay);
+            } else {
+              // Out of retries with sessions still failing: log it so the
+              // give-up is visible instead of silent.
+              log.warn(
+                `orphan recovery still had ${result.failed} failure(s) after ${maxRetries} retries; giving up`,
+              );
+            }
+          }
+        })
+        .catch((err: unknown) => {
+          if (canRetry) {
+            log.warn(`scheduled orphan recovery failed: ${String(err)}; ${retryNote}`);
+            attemptRecovery(attempt + 1, nextDelay);
+          } else {
+            log.warn(
+              `scheduled orphan recovery failed after ${maxRetries} retries: ${String(err)}`,
+            );
+          }
+        });
+    }, delay).unref?.();
+  };
+
+  attemptRecovery(0, initialDelay);
 }