fix: address Greptile review feedback
- Remove unrelated pnpm-lock.yaml changes - Move abortedLastRun flag clearing to AFTER successful resume (prevents permanent session loss on transient gateway failures) - Use dynamic import for orphan recovery module to avoid startup memory overhead - Add test assertion that flag is preserved on resume failure
This commit is contained in:
parent
7b098ea79f
commit
f5e32be558
@ -186,7 +186,7 @@ describe("subagent-orphan-recovery", () => {
|
||||
expect(gateway.callGateway).toHaveBeenCalledTimes(2);
|
||||
});
|
||||
|
||||
it("handles callGateway failure gracefully", async () => {
|
||||
it("handles callGateway failure gracefully and preserves abortedLastRun flag", async () => {
|
||||
const sessions = await import("../config/sessions.js");
|
||||
const gateway = await import("../gateway/call.js");
|
||||
|
||||
@ -211,6 +211,10 @@ describe("subagent-orphan-recovery", () => {
|
||||
|
||||
expect(result.recovered).toBe(0);
|
||||
expect(result.failed).toBe(1);
|
||||
|
||||
// abortedLastRun flag should NOT be cleared on failure,
|
||||
// so the next restart can retry the recovery
|
||||
expect(sessions.updateSessionStore).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("returns empty results when no active runs exist", async () => {
|
||||
@ -246,8 +250,12 @@ describe("subagent-orphan-recovery", () => {
|
||||
expect(gateway.callGateway).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("clears abortedLastRun flag before resuming", async () => {
|
||||
it("clears abortedLastRun flag after successful resume", async () => {
|
||||
const sessions = await import("../config/sessions.js");
|
||||
const gateway = await import("../gateway/call.js");
|
||||
|
||||
// Ensure callGateway succeeds for this test
|
||||
vi.mocked(gateway.callGateway).mockResolvedValue({ runId: "resumed-run" } as never);
|
||||
|
||||
vi.mocked(sessions.loadSessionStore).mockReturnValue({
|
||||
"agent:main:subagent:test-session-1": {
|
||||
@ -266,7 +274,7 @@ describe("subagent-orphan-recovery", () => {
|
||||
getActiveRuns: () => activeRuns,
|
||||
});
|
||||
|
||||
// updateSessionStore should have been called to clear the flag
|
||||
// updateSessionStore should have been called AFTER successful resume to clear the flag
|
||||
expect(sessions.updateSessionStore).toHaveBeenCalledOnce();
|
||||
const calls = vi.mocked(sessions.updateSessionStore).mock.calls;
|
||||
const [storePath, updater] = calls[0];
|
||||
|
||||
@ -129,25 +129,31 @@ export async function recoverOrphanedSubagentSessions(params: {
|
||||
|
||||
log.info(`found orphaned subagent session: ${childSessionKey} (run=${runId})`);
|
||||
|
||||
// Clear the aborted flag before resuming
|
||||
await updateSessionStore(storePath, (currentStore) => {
|
||||
const current = currentStore[childSessionKey];
|
||||
if (current) {
|
||||
current.abortedLastRun = false;
|
||||
current.updatedAt = Date.now();
|
||||
currentStore[childSessionKey] = current;
|
||||
}
|
||||
});
|
||||
|
||||
// Resume the session with the original task context
|
||||
// Resume the session with the original task context.
|
||||
// We intentionally do NOT clear abortedLastRun before attempting
|
||||
// the resume — if callGateway fails (e.g. gateway still booting),
|
||||
// the flag stays true so the next restart can retry.
|
||||
const resumed = await resumeOrphanedSession({
|
||||
sessionKey: childSessionKey,
|
||||
task: runRecord.task,
|
||||
});
|
||||
|
||||
if (resumed) {
|
||||
// Only clear the aborted flag after confirmed successful resume.
|
||||
await updateSessionStore(storePath, (currentStore) => {
|
||||
const current = currentStore[childSessionKey];
|
||||
if (current) {
|
||||
current.abortedLastRun = false;
|
||||
current.updatedAt = Date.now();
|
||||
currentStore[childSessionKey] = current;
|
||||
}
|
||||
});
|
||||
result.recovered++;
|
||||
} else {
|
||||
// Flag stays as abortedLastRun=true so next restart can retry
|
||||
log.warn(
|
||||
`resume failed for ${childSessionKey}; abortedLastRun flag preserved for retry on next restart`,
|
||||
);
|
||||
result.failed++;
|
||||
}
|
||||
} catch (err) {
|
||||
|
||||
@ -30,7 +30,6 @@ import {
|
||||
SUBAGENT_ENDED_REASON_KILLED,
|
||||
type SubagentLifecycleEndedReason,
|
||||
} from "./subagent-lifecycle-events.js";
|
||||
import { scheduleOrphanRecovery } from "./subagent-orphan-recovery.js";
|
||||
import {
|
||||
resolveCleanupCompletionReason,
|
||||
resolveDeferredCleanupDecision,
|
||||
@ -688,10 +687,16 @@ function restoreSubagentRunsOnce() {
|
||||
|
||||
// Schedule orphan recovery for subagent sessions that were aborted
|
||||
// by a SIGUSR1 reload. This runs after a short delay to let the
|
||||
// gateway fully bootstrap first. (#47711)
|
||||
scheduleOrphanRecovery({
|
||||
getActiveRuns: () => subagentRuns,
|
||||
});
|
||||
// gateway fully bootstrap first. Dynamic import to avoid increasing
|
||||
// startup memory footprint. (#47711)
|
||||
void import("./subagent-orphan-recovery.js").then(
|
||||
({ scheduleOrphanRecovery }) => {
|
||||
scheduleOrphanRecovery({ getActiveRuns: () => subagentRuns });
|
||||
},
|
||||
() => {
|
||||
// Ignore import failures — orphan recovery is best-effort.
|
||||
},
|
||||
);
|
||||
} catch {
|
||||
// ignore restore failures
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user