diff --git a/src/agents/subagent-orphan-recovery.test.ts b/src/agents/subagent-orphan-recovery.test.ts index 7792140e3db..351e1079dc7 100644 --- a/src/agents/subagent-orphan-recovery.test.ts +++ b/src/agents/subagent-orphan-recovery.test.ts @@ -186,7 +186,7 @@ describe("subagent-orphan-recovery", () => { expect(gateway.callGateway).toHaveBeenCalledTimes(2); }); - it("handles callGateway failure gracefully", async () => { + it("handles callGateway failure gracefully and preserves abortedLastRun flag", async () => { const sessions = await import("../config/sessions.js"); const gateway = await import("../gateway/call.js"); @@ -211,6 +211,10 @@ describe("subagent-orphan-recovery", () => { expect(result.recovered).toBe(0); expect(result.failed).toBe(1); + + // abortedLastRun flag should NOT be cleared on failure, + // so the next restart can retry the recovery + expect(sessions.updateSessionStore).not.toHaveBeenCalled(); }); it("returns empty results when no active runs exist", async () => { @@ -246,8 +250,12 @@ describe("subagent-orphan-recovery", () => { expect(gateway.callGateway).not.toHaveBeenCalled(); }); - it("clears abortedLastRun flag before resuming", async () => { + it("clears abortedLastRun flag after successful resume", async () => { const sessions = await import("../config/sessions.js"); + const gateway = await import("../gateway/call.js"); + + // Ensure callGateway succeeds for this test + vi.mocked(gateway.callGateway).mockResolvedValue({ runId: "resumed-run" } as never); vi.mocked(sessions.loadSessionStore).mockReturnValue({ "agent:main:subagent:test-session-1": { @@ -266,7 +274,7 @@ describe("subagent-orphan-recovery", () => { getActiveRuns: () => activeRuns, }); - // updateSessionStore should have been called to clear the flag + // updateSessionStore should have been called AFTER successful resume to clear the flag expect(sessions.updateSessionStore).toHaveBeenCalledOnce(); const calls = vi.mocked(sessions.updateSessionStore).mock.calls; const [storePath, updater] = calls[0]; diff --git a/src/agents/subagent-orphan-recovery.ts b/src/agents/subagent-orphan-recovery.ts index fc30af26038..f36096d95b2 100644 --- a/src/agents/subagent-orphan-recovery.ts +++ b/src/agents/subagent-orphan-recovery.ts @@ -129,25 +129,31 @@ export async function recoverOrphanedSubagentSessions(params: { log.info(`found orphaned subagent session: ${childSessionKey} (run=${runId})`); - // Clear the aborted flag before resuming - await updateSessionStore(storePath, (currentStore) => { - const current = currentStore[childSessionKey]; - if (current) { - current.abortedLastRun = false; - current.updatedAt = Date.now(); - currentStore[childSessionKey] = current; - } - }); - - // Resume the session with the original task context + // Resume the session with the original task context. + // We intentionally do NOT clear abortedLastRun before attempting + // the resume — if callGateway fails (e.g. gateway still booting), + // the flag stays true so the next restart can retry. const resumed = await resumeOrphanedSession({ sessionKey: childSessionKey, task: runRecord.task, }); if (resumed) { + // Only clear the aborted flag after confirmed successful resume. + await updateSessionStore(storePath, (currentStore) => { + const current = currentStore[childSessionKey]; + if (current) { + current.abortedLastRun = false; + current.updatedAt = Date.now(); + currentStore[childSessionKey] = current; + } + }); result.recovered++; } else { + // Flag stays as abortedLastRun=true so next restart can retry + log.warn( + `resume failed for ${childSessionKey}; abortedLastRun flag preserved for retry on next restart`, + ); result.failed++; } } catch (err) { diff --git a/src/agents/subagent-registry.ts b/src/agents/subagent-registry.ts index 68fd499cbee..c1cab60dd82 100644 --- a/src/agents/subagent-registry.ts +++ b/src/agents/subagent-registry.ts @@ -30,7 +30,6 @@ import { SUBAGENT_ENDED_REASON_KILLED, type SubagentLifecycleEndedReason, } from "./subagent-lifecycle-events.js"; -import { scheduleOrphanRecovery } from "./subagent-orphan-recovery.js"; import { resolveCleanupCompletionReason, resolveDeferredCleanupDecision, @@ -688,10 +687,16 @@ function restoreSubagentRunsOnce() { // Schedule orphan recovery for subagent sessions that were aborted // by a SIGUSR1 reload. This runs after a short delay to let the - // gateway fully bootstrap first. (#47711) - scheduleOrphanRecovery({ - getActiveRuns: () => subagentRuns, - }); + // gateway fully bootstrap first. Dynamic import to avoid increasing + // startup memory footprint. (#47711) + void import("./subagent-orphan-recovery.js").then( + ({ scheduleOrphanRecovery }) => { + scheduleOrphanRecovery({ getActiveRuns: () => subagentRuns }); + }, + () => { + // Ignore import failures — orphan recovery is best-effort. + }, + ); } catch { // ignore restore failures }