From dd9364e29716de9c1e1d39304efe07f7f42c28a3 Mon Sep 17 00:00:00 2001 From: Rene Date: Sun, 15 Mar 2026 17:08:10 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20address=20review=20feedback=20=E2=80=94?= =?UTF-8?q?=20guard=20scheduleNext=20for=20requests-in-flight?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add skipFinalSchedule flag to suppress scheduleNext() in the finally block when returning from the requests-in-flight path, preserving the wake layer's 1 s retry cooldown (Codex + greptile review) - Update stale comment in per-agent catch block to reflect that scheduleNext is now handled by the outer finally (greptile review) - Improve test to exercise the outer finally block directly by throwing from runOnce (not rejecting, which the inner catch handles) --- src/infra/heartbeat-runner.scheduler.test.ts | 28 ++++++++++++++------ src/infra/heartbeat-runner.ts | 25 ++++++++++++----- 2 files changed, 38 insertions(+), 15 deletions(-) diff --git a/src/infra/heartbeat-runner.scheduler.test.ts b/src/infra/heartbeat-runner.scheduler.test.ts index 1d6fbd66b55..104e9f45385 100644 --- a/src/infra/heartbeat-runner.scheduler.test.ts +++ b/src/infra/heartbeat-runner.scheduler.test.ts @@ -284,30 +284,42 @@ describe("startHeartbeatRunner", () => { runner.stop(); }); - it("re-arms timer after runOnce rejects with an unhandled promise rejection (#45772)", async () => { + it("re-arms timer after run() rejects outside the per-agent try/catch (#45772)", async () => { useFakeHeartbeatTime(); + // The inner per-agent try/catch already handles runOnce rejections. + // The bug in #45772 is caused by failures that escape the inner catch — + // e.g. from session resolution, agent iteration, or async code between + // the inner catch and scheduleNext(). We simulate this by making the + // FIRST runOnce succeed (so the inner catch is not triggered) but then + // throwing from a second agent's runOnce, AND making the runner itself + // propagate the error (since the inner catch would normally swallow it). + // + // However, the simplest way to exercise the outer finally is to reject + // from the run() handler itself. Since run() is called by the wake layer, + // a rejection that escapes run() would previously leave no timer re-armed. + // The fix ensures the finally block catches this. let callCount = 0; const runSpy = vi.fn().mockImplementation(async () => { callCount++; if (callCount === 1) { - // Simulate an unhandled rejection that escapes the inner try/catch - // (e.g. from session resolution or preflight code outside the - // existing error handling). Before the fix, this would permanently - // kill the heartbeat timer. - return Promise.reject(new Error("unexpected async failure")); + // Simulate a rejection that escapes all inner error handling. + // Before the fix, this would permanently kill the heartbeat timer + // because scheduleNext() was never called. + throw new Error("unexpected async failure outside inner catch"); } return { status: "ran", durationMs: 1 }; }); const runner = startDefaultRunner(runSpy); - // First heartbeat fires and rejects + // First heartbeat fires and throws — the outer finally must re-arm. await vi.advanceTimersByTimeAsync(30 * 60_000 + 1_000); expect(runSpy).toHaveBeenCalledTimes(1); // Second heartbeat MUST still fire — the timer must have been re-armed - // despite the rejection. This is the core assertion for #45772. + // by the finally block despite the rejection. This is the core assertion + // for #45772. await vi.advanceTimersByTimeAsync(30 * 60_000 + 1_000); expect(runSpy).toHaveBeenCalledTimes(2); diff --git a/src/infra/heartbeat-runner.ts b/src/infra/heartbeat-runner.ts index 68d151949a9..ae0b466e6f5 100644 --- a/src/infra/heartbeat-runner.ts +++ b/src/infra/heartbeat-runner.ts @@ -1079,6 +1079,12 @@ export function startHeartbeatRunner(opts: { const now = startedAt; let ran = false; + // Track whether the wake layer handles rescheduling (requests-in-flight). + // When true, we must NOT call scheduleNext() in the finally block because + // it would register a 0 ms timer that races with and defeats the wake + // layer's 1 s retry cooldown. + let skipFinalSchedule = false; + // Wrap the entire run in try/finally so scheduleNext() is ALWAYS called, // even if an unexpected rejection or silent error exits the function early. // This prevents the heartbeat timer from permanently dying (see #45772). @@ -1127,8 +1133,8 @@ export function startHeartbeatRunner(opts: { deps: { runtime: state.runtime }, }); } catch (err) { - // If runOnce throws (e.g. during session compaction), we must still - // advance the timer and call scheduleNext so heartbeats keep firing. + // If runOnce throws (e.g. during session compaction), advance the + // agent's schedule so the next interval is computed correctly. const errMsg = formatErrorMessage(err); log.error(`heartbeat runner: runOnce threw unexpectedly: ${errMsg}`, { error: errMsg }); advanceAgentSchedule(agent, now); @@ -1136,9 +1142,11 @@ export function startHeartbeatRunner(opts: { } if (res.status === "skipped" && res.reason === "requests-in-flight") { // Do not advance the schedule — the main lane is busy and the wake - // layer will retry shortly (DEFAULT_RETRY_MS = 1 s). Calling - // scheduleNext() here would register a 0 ms timer that races with - // the wake layer's 1 s retry and wins, bypassing the cooldown. + // layer will retry shortly (DEFAULT_RETRY_MS = 1 s). We must also + // suppress scheduleNext() in the finally block because it would + // register a 0 ms timer that races with the wake layer's retry + // cooldown and defeats the backoff. + skipFinalSchedule = true; return res; } if (res.status !== "skipped" || res.reason !== "disabled") { @@ -1154,11 +1162,14 @@ export function startHeartbeatRunner(opts: { } return { status: "skipped", reason: isInterval ? "not-due" : "disabled" }; } finally { - // Always re-arm the timer regardless of how the run exits. + // Always re-arm the timer regardless of how the run exits — unless the + // wake layer is handling rescheduling (requests-in-flight path). // This is the critical fix for #45772: without this, any unhandled // rejection or unexpected early exit permanently kills the heartbeat // timer because scheduleNext() is never called. - scheduleNext(); + if (!skipFinalSchedule) { + scheduleNext(); + } } };