fix: address Greptile review feedback

- Remove unrelated pnpm-lock.yaml changes
- Move abortedLastRun flag clearing to AFTER successful resume
  (prevents permanent session loss on transient gateway failures)
- Use dynamic import for orphan recovery module to avoid startup
  memory overhead
- Add test assertion that flag is preserved on resume failure
This commit is contained in:
Joey Krug 2026-03-15 21:16:24 -04:00 committed by Peter Steinberger
parent 7b098ea79f
commit f5e32be558
3 changed files with 38 additions and 19 deletions

View File

@ -186,7 +186,7 @@ describe("subagent-orphan-recovery", () => {
expect(gateway.callGateway).toHaveBeenCalledTimes(2);
});
it("handles callGateway failure gracefully", async () => {
it("handles callGateway failure gracefully and preserves abortedLastRun flag", async () => {
const sessions = await import("../config/sessions.js");
const gateway = await import("../gateway/call.js");
@ -211,6 +211,10 @@ describe("subagent-orphan-recovery", () => {
expect(result.recovered).toBe(0);
expect(result.failed).toBe(1);
// abortedLastRun flag should NOT be cleared on failure,
// so the next restart can retry the recovery
expect(sessions.updateSessionStore).not.toHaveBeenCalled();
});
it("returns empty results when no active runs exist", async () => {
@ -246,8 +250,12 @@ describe("subagent-orphan-recovery", () => {
expect(gateway.callGateway).not.toHaveBeenCalled();
});
it("clears abortedLastRun flag before resuming", async () => {
it("clears abortedLastRun flag after successful resume", async () => {
const sessions = await import("../config/sessions.js");
const gateway = await import("../gateway/call.js");
// Ensure callGateway succeeds for this test
vi.mocked(gateway.callGateway).mockResolvedValue({ runId: "resumed-run" } as never);
vi.mocked(sessions.loadSessionStore).mockReturnValue({
"agent:main:subagent:test-session-1": {
@ -266,7 +274,7 @@ describe("subagent-orphan-recovery", () => {
getActiveRuns: () => activeRuns,
});
// updateSessionStore should have been called to clear the flag
// updateSessionStore should have been called AFTER successful resume to clear the flag
expect(sessions.updateSessionStore).toHaveBeenCalledOnce();
const calls = vi.mocked(sessions.updateSessionStore).mock.calls;
const [storePath, updater] = calls[0];

View File

@ -129,25 +129,31 @@ export async function recoverOrphanedSubagentSessions(params: {
log.info(`found orphaned subagent session: ${childSessionKey} (run=${runId})`);
// Clear the aborted flag before resuming
await updateSessionStore(storePath, (currentStore) => {
const current = currentStore[childSessionKey];
if (current) {
current.abortedLastRun = false;
current.updatedAt = Date.now();
currentStore[childSessionKey] = current;
}
});
// Resume the session with the original task context
// Resume the session with the original task context.
// We intentionally do NOT clear abortedLastRun before attempting
// the resume — if callGateway fails (e.g. gateway still booting),
// the flag stays true so the next restart can retry.
const resumed = await resumeOrphanedSession({
sessionKey: childSessionKey,
task: runRecord.task,
});
if (resumed) {
// Only clear the aborted flag after confirmed successful resume.
await updateSessionStore(storePath, (currentStore) => {
const current = currentStore[childSessionKey];
if (current) {
current.abortedLastRun = false;
current.updatedAt = Date.now();
currentStore[childSessionKey] = current;
}
});
result.recovered++;
} else {
// Flag stays as abortedLastRun=true so next restart can retry
log.warn(
`resume failed for ${childSessionKey}; abortedLastRun flag preserved for retry on next restart`,
);
result.failed++;
}
} catch (err) {

View File

@ -30,7 +30,6 @@ import {
SUBAGENT_ENDED_REASON_KILLED,
type SubagentLifecycleEndedReason,
} from "./subagent-lifecycle-events.js";
import { scheduleOrphanRecovery } from "./subagent-orphan-recovery.js";
import {
resolveCleanupCompletionReason,
resolveDeferredCleanupDecision,
@ -688,10 +687,16 @@ function restoreSubagentRunsOnce() {
// Schedule orphan recovery for subagent sessions that were aborted
// by a SIGUSR1 reload. This runs after a short delay to let the
// gateway fully bootstrap first. (#47711)
scheduleOrphanRecovery({
getActiveRuns: () => subagentRuns,
});
// gateway fully bootstrap first. Dynamic import to avoid increasing
// startup memory footprint. (#47711)
void import("./subagent-orphan-recovery.js").then(
({ scheduleOrphanRecovery }) => {
scheduleOrphanRecovery({ getActiveRuns: () => subagentRuns });
},
() => {
// Ignore import failures — orphan recovery is best-effort.
},
);
} catch {
// ignore restore failures
}