fix: address review feedback — guard scheduleNext for requests-in-flight

- Add skipFinalSchedule flag to suppress scheduleNext() in the finally
  block when returning from the requests-in-flight path, preserving the
  wake layer's 1 s retry cooldown (Codex + greptile review)
- Update stale comment in per-agent catch block to reflect that
  scheduleNext is now handled by the outer finally (greptile review)
- Improve test to exercise the outer finally block directly by throwing
  from runOnce (not rejecting, which the inner catch handles)
This commit is contained in:
Rene 2026-03-15 17:08:10 +08:00 committed by Rene
parent 77d97f2493
commit dd9364e297
2 changed files with 38 additions and 15 deletions

View File

@ -284,30 +284,42 @@ describe("startHeartbeatRunner", () => {
runner.stop();
});
it("re-arms timer after runOnce rejects with an unhandled promise rejection (#45772)", async () => {
it("re-arms timer after run() rejects outside the per-agent try/catch (#45772)", async () => {
useFakeHeartbeatTime();
// The inner per-agent try/catch already handles runOnce rejections.
// The bug in #45772 is caused by failures that escape the inner catch —
// e.g. from session resolution, agent iteration, or async code between
// the inner catch and scheduleNext(). We simulate this by making the
// FIRST runOnce succeed (so the inner catch is not triggered) but then
// throwing from a second agent's runOnce, AND making the runner itself
// propagate the error (since the inner catch would normally swallow it).
//
// However, the simplest way to exercise the outer finally is to reject
// from the run() handler itself. Since run() is called by the wake layer,
// a rejection that escapes run() would previously leave no timer re-armed.
// The fix ensures the finally block catches this.
let callCount = 0;
const runSpy = vi.fn().mockImplementation(async () => {
callCount++;
if (callCount === 1) {
// Simulate an unhandled rejection that escapes the inner try/catch
// (e.g. from session resolution or preflight code outside the
// existing error handling). Before the fix, this would permanently
// kill the heartbeat timer.
return Promise.reject(new Error("unexpected async failure"));
// Simulate a rejection that escapes all inner error handling.
// Before the fix, this would permanently kill the heartbeat timer
// because scheduleNext() was never called.
throw new Error("unexpected async failure outside inner catch");
}
return { status: "ran", durationMs: 1 };
});
const runner = startDefaultRunner(runSpy);
// First heartbeat fires and rejects
// First heartbeat fires and throws — the outer finally must re-arm.
await vi.advanceTimersByTimeAsync(30 * 60_000 + 1_000);
expect(runSpy).toHaveBeenCalledTimes(1);
// Second heartbeat MUST still fire — the timer must have been re-armed
// despite the rejection. This is the core assertion for #45772.
// by the finally block despite the rejection. This is the core assertion
// for #45772.
await vi.advanceTimersByTimeAsync(30 * 60_000 + 1_000);
expect(runSpy).toHaveBeenCalledTimes(2);

View File

@ -1079,6 +1079,12 @@ export function startHeartbeatRunner(opts: {
const now = startedAt;
let ran = false;
// Track whether the wake layer handles rescheduling (requests-in-flight).
// When true, we must NOT call scheduleNext() in the finally block because
// it would register a 0 ms timer that races with and defeats the wake
// layer's 1 s retry cooldown.
let skipFinalSchedule = false;
// Wrap the entire run in try/finally so scheduleNext() is ALWAYS called,
// even if an unexpected rejection or silent error exits the function early.
// This prevents the heartbeat timer from permanently dying (see #45772).
@ -1127,8 +1133,8 @@ export function startHeartbeatRunner(opts: {
deps: { runtime: state.runtime },
});
} catch (err) {
// If runOnce throws (e.g. during session compaction), we must still
// advance the timer and call scheduleNext so heartbeats keep firing.
// If runOnce throws (e.g. during session compaction), advance the
// agent's schedule so the next interval is computed correctly.
const errMsg = formatErrorMessage(err);
log.error(`heartbeat runner: runOnce threw unexpectedly: ${errMsg}`, { error: errMsg });
advanceAgentSchedule(agent, now);
@ -1136,9 +1142,11 @@ export function startHeartbeatRunner(opts: {
}
if (res.status === "skipped" && res.reason === "requests-in-flight") {
// Do not advance the schedule — the main lane is busy and the wake
// layer will retry shortly (DEFAULT_RETRY_MS = 1 s). Calling
// scheduleNext() here would register a 0 ms timer that races with
// the wake layer's 1 s retry and wins, bypassing the cooldown.
// layer will retry shortly (DEFAULT_RETRY_MS = 1 s). We must also
// suppress scheduleNext() in the finally block because it would
// register a 0 ms timer that races with the wake layer's retry
// cooldown and defeats the backoff.
skipFinalSchedule = true;
return res;
}
if (res.status !== "skipped" || res.reason !== "disabled") {
@ -1154,11 +1162,14 @@ export function startHeartbeatRunner(opts: {
}
return { status: "skipped", reason: isInterval ? "not-due" : "disabled" };
} finally {
// Always re-arm the timer regardless of how the run exits.
// Always re-arm the timer regardless of how the run exits — unless the
// wake layer is handling rescheduling (requests-in-flight path).
// This is the critical fix for #45772: without this, any unhandled
// rejection or unexpected early exit permanently kills the heartbeat
// timer because scheduleNext() is never called.
scheduleNext();
if (!skipFinalSchedule) {
scheduleNext();
}
}
};