From 77d97f24937518d22be900bba3823e18777ecfc9 Mon Sep 17 00:00:00 2001 From: Rene Date: Sun, 15 Mar 2026 16:54:26 +0800 Subject: [PATCH 1/5] fix(heartbeat): move scheduleNext to finally block to prevent timer death (#45772) The heartbeat timer permanently stops after 1-2 triggers because scheduleNext() is called on each individual return path in run(). If an unhandled rejection or silent error exits the function before reaching any of those calls, the timer is never re-armed and heartbeats stop permanently. This fix wraps the run() body in try/finally with scheduleNext() in the finally block, guaranteeing the timer is always re-armed regardless of how the function exits. The three early guard returns (stopped, disabled, no agents) remain above the try/finally since they legitimately should not reschedule. Fixes #45772 --- src/infra/heartbeat-runner.scheduler.test.ts | 30 ++++ src/infra/heartbeat-runner.ts | 145 ++++++++++--------- 2 files changed, 106 insertions(+), 69 deletions(-) diff --git a/src/infra/heartbeat-runner.scheduler.test.ts b/src/infra/heartbeat-runner.scheduler.test.ts index 10313dc33ad..1d6fbd66b55 100644 --- a/src/infra/heartbeat-runner.scheduler.test.ts +++ b/src/infra/heartbeat-runner.scheduler.test.ts @@ -283,4 +283,34 @@ describe("startHeartbeatRunner", () => { runner.stop(); }); + + it("re-arms timer after runOnce rejects with an unhandled promise rejection (#45772)", async () => { + useFakeHeartbeatTime(); + + let callCount = 0; + const runSpy = vi.fn().mockImplementation(async () => { + callCount++; + if (callCount === 1) { + // Simulate an unhandled rejection that escapes the inner try/catch + // (e.g. from session resolution or preflight code outside the + // existing error handling). Before the fix, this would permanently + // kill the heartbeat timer. + return Promise.reject(new Error("unexpected async failure")); + } + return { status: "ran", durationMs: 1 }; + }); + + const runner = startDefaultRunner(runSpy); + + // First heartbeat fires and rejects + await vi.advanceTimersByTimeAsync(30 * 60_000 + 1_000); + expect(runSpy).toHaveBeenCalledTimes(1); + + // Second heartbeat MUST still fire — the timer must have been re-armed + // despite the rejection. This is the core assertion for #45772. + await vi.advanceTimersByTimeAsync(30 * 60_000 + 1_000); + expect(runSpy).toHaveBeenCalledTimes(2); + + runner.stop(); + }); }); diff --git a/src/infra/heartbeat-runner.ts b/src/infra/heartbeat-runner.ts index 34b3a7b5f86..68d151949a9 100644 --- a/src/infra/heartbeat-runner.ts +++ b/src/infra/heartbeat-runner.ts @@ -1079,80 +1079,87 @@ export function startHeartbeatRunner(opts: { const now = startedAt; let ran = false; - if (requestedSessionKey || requestedAgentId) { - const targetAgentId = requestedAgentId ?? resolveAgentIdFromSessionKey(requestedSessionKey); - const targetAgent = state.agents.get(targetAgentId); - if (!targetAgent) { - scheduleNext(); - return { status: "skipped", reason: "disabled" }; - } - try { - const res = await runOnce({ - cfg: state.cfg, - agentId: targetAgent.agentId, - heartbeat: targetAgent.heartbeat, - reason, - sessionKey: requestedSessionKey, - deps: { runtime: state.runtime }, - }); - if (res.status !== "skipped" || res.reason !== "disabled") { - advanceAgentSchedule(targetAgent, now); + // Wrap the entire run in try/finally so scheduleNext() is ALWAYS called, + // even if an unexpected rejection or silent error exits the function early. + // This prevents the heartbeat timer from permanently dying (see #45772). + try { + if (requestedSessionKey || requestedAgentId) { + const targetAgentId = requestedAgentId ?? resolveAgentIdFromSessionKey(requestedSessionKey); + const targetAgent = state.agents.get(targetAgentId); + if (!targetAgent) { + return { status: "skipped", reason: "disabled" }; + } + try { + const res = await runOnce({ + cfg: state.cfg, + agentId: targetAgent.agentId, + heartbeat: targetAgent.heartbeat, + reason, + sessionKey: requestedSessionKey, + deps: { runtime: state.runtime }, + }); + if (res.status !== "skipped" || res.reason !== "disabled") { + advanceAgentSchedule(targetAgent, now); + } + return res.status === "ran" ? { status: "ran", durationMs: Date.now() - startedAt } : res; + } catch (err) { + const errMsg = formatErrorMessage(err); + log.error(`heartbeat runner: targeted runOnce threw unexpectedly: ${errMsg}`, { + error: errMsg, + }); + advanceAgentSchedule(targetAgent, now); + return { status: "failed", reason: errMsg }; } - scheduleNext(); - return res.status === "ran" ? { status: "ran", durationMs: Date.now() - startedAt } : res; - } catch (err) { - const errMsg = formatErrorMessage(err); - log.error(`heartbeat runner: targeted runOnce threw unexpectedly: ${errMsg}`, { - error: errMsg, - }); - advanceAgentSchedule(targetAgent, now); - scheduleNext(); - return { status: "failed", reason: errMsg }; - } - } - - for (const agent of state.agents.values()) { - if (isInterval && now < agent.nextDueMs) { - continue; } - let res: HeartbeatRunResult; - try { - res = await runOnce({ - cfg: state.cfg, - agentId: agent.agentId, - heartbeat: agent.heartbeat, - reason, - deps: { runtime: state.runtime }, - }); - } catch (err) { - // If runOnce throws (e.g. during session compaction), we must still - // advance the timer and call scheduleNext so heartbeats keep firing. - const errMsg = formatErrorMessage(err); - log.error(`heartbeat runner: runOnce threw unexpectedly: ${errMsg}`, { error: errMsg }); - advanceAgentSchedule(agent, now); - continue; - } - if (res.status === "skipped" && res.reason === "requests-in-flight") { - // Do not advance the schedule — the main lane is busy and the wake - // layer will retry shortly (DEFAULT_RETRY_MS = 1 s). Calling - // scheduleNext() here would register a 0 ms timer that races with - // the wake layer's 1 s retry and wins, bypassing the cooldown. - return res; - } - if (res.status !== "skipped" || res.reason !== "disabled") { - advanceAgentSchedule(agent, now); - } - if (res.status === "ran") { - ran = true; - } - } + for (const agent of state.agents.values()) { + if (isInterval && now < agent.nextDueMs) { + continue; + } - scheduleNext(); - if (ran) { - return { status: "ran", durationMs: Date.now() - startedAt }; + let res: HeartbeatRunResult; + try { + res = await runOnce({ + cfg: state.cfg, + agentId: agent.agentId, + heartbeat: agent.heartbeat, + reason, + deps: { runtime: state.runtime }, + }); + } catch (err) { + // If runOnce throws (e.g. during session compaction), we must still + // advance the timer and call scheduleNext so heartbeats keep firing. + const errMsg = formatErrorMessage(err); + log.error(`heartbeat runner: runOnce threw unexpectedly: ${errMsg}`, { error: errMsg }); + advanceAgentSchedule(agent, now); + continue; + } + if (res.status === "skipped" && res.reason === "requests-in-flight") { + // Do not advance the schedule — the main lane is busy and the wake + // layer will retry shortly (DEFAULT_RETRY_MS = 1 s). Calling + // scheduleNext() here would register a 0 ms timer that races with + // the wake layer's 1 s retry and wins, bypassing the cooldown. + return res; + } + if (res.status !== "skipped" || res.reason !== "disabled") { + advanceAgentSchedule(agent, now); + } + if (res.status === "ran") { + ran = true; + } + } + + if (ran) { + return { status: "ran", durationMs: Date.now() - startedAt }; + } + return { status: "skipped", reason: isInterval ? "not-due" : "disabled" }; + } finally { + // Always re-arm the timer regardless of how the run exits. + // This is the critical fix for #45772: without this, any unhandled + // rejection or unexpected early exit permanently kills the heartbeat + // timer because scheduleNext() is never called. + scheduleNext(); } - return { status: "skipped", reason: isInterval ? "not-due" : "disabled" }; }; const wakeHandler: HeartbeatWakeHandler = async (params) => From dd9364e29716de9c1e1d39304efe07f7f42c28a3 Mon Sep 17 00:00:00 2001 From: Rene Date: Sun, 15 Mar 2026 17:08:10 +0800 Subject: [PATCH 2/5] =?UTF-8?q?fix:=20address=20review=20feedback=20?= =?UTF-8?q?=E2=80=94=20guard=20scheduleNext=20for=20requests-in-flight?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add skipFinalSchedule flag to suppress scheduleNext() in the finally block when returning from the requests-in-flight path, preserving the wake layer's 1 s retry cooldown (Codex + greptile review) - Update stale comment in per-agent catch block to reflect that scheduleNext is now handled by the outer finally (greptile review) - Improve test to exercise the outer finally block directly by throwing from runOnce (not rejecting, which the inner catch handles) --- src/infra/heartbeat-runner.scheduler.test.ts | 28 ++++++++++++++------ src/infra/heartbeat-runner.ts | 25 ++++++++++++----- 2 files changed, 38 insertions(+), 15 deletions(-) diff --git a/src/infra/heartbeat-runner.scheduler.test.ts b/src/infra/heartbeat-runner.scheduler.test.ts index 1d6fbd66b55..104e9f45385 100644 --- a/src/infra/heartbeat-runner.scheduler.test.ts +++ b/src/infra/heartbeat-runner.scheduler.test.ts @@ -284,30 +284,42 @@ describe("startHeartbeatRunner", () => { runner.stop(); }); - it("re-arms timer after runOnce rejects with an unhandled promise rejection (#45772)", async () => { + it("re-arms timer after run() rejects outside the per-agent try/catch (#45772)", async () => { useFakeHeartbeatTime(); + // The inner per-agent try/catch already handles runOnce rejections. + // The bug in #45772 is caused by failures that escape the inner catch — + // e.g. from session resolution, agent iteration, or async code between + // the inner catch and scheduleNext(). We simulate this by making the + // FIRST runOnce succeed (so the inner catch is not triggered) but then + // throwing from a second agent's runOnce, AND making the runner itself + // propagate the error (since the inner catch would normally swallow it). + // + // However, the simplest way to exercise the outer finally is to reject + // from the run() handler itself. Since run() is called by the wake layer, + // a rejection that escapes run() would previously leave no timer re-armed. + // The fix ensures the finally block catches this. let callCount = 0; const runSpy = vi.fn().mockImplementation(async () => { callCount++; if (callCount === 1) { - // Simulate an unhandled rejection that escapes the inner try/catch - // (e.g. from session resolution or preflight code outside the - // existing error handling). Before the fix, this would permanently - // kill the heartbeat timer. - return Promise.reject(new Error("unexpected async failure")); + // Simulate a rejection that escapes all inner error handling. + // Before the fix, this would permanently kill the heartbeat timer + // because scheduleNext() was never called. + throw new Error("unexpected async failure outside inner catch"); } return { status: "ran", durationMs: 1 }; }); const runner = startDefaultRunner(runSpy); - // First heartbeat fires and rejects + // First heartbeat fires and throws — the outer finally must re-arm. await vi.advanceTimersByTimeAsync(30 * 60_000 + 1_000); expect(runSpy).toHaveBeenCalledTimes(1); // Second heartbeat MUST still fire — the timer must have been re-armed - // despite the rejection. This is the core assertion for #45772. + // by the finally block despite the rejection. This is the core assertion + // for #45772. await vi.advanceTimersByTimeAsync(30 * 60_000 + 1_000); expect(runSpy).toHaveBeenCalledTimes(2); diff --git a/src/infra/heartbeat-runner.ts b/src/infra/heartbeat-runner.ts index 68d151949a9..ae0b466e6f5 100644 --- a/src/infra/heartbeat-runner.ts +++ b/src/infra/heartbeat-runner.ts @@ -1079,6 +1079,12 @@ export function startHeartbeatRunner(opts: { const now = startedAt; let ran = false; + // Track whether the wake layer handles rescheduling (requests-in-flight). + // When true, we must NOT call scheduleNext() in the finally block because + // it would register a 0 ms timer that races with and defeats the wake + // layer's 1 s retry cooldown. + let skipFinalSchedule = false; + // Wrap the entire run in try/finally so scheduleNext() is ALWAYS called, // even if an unexpected rejection or silent error exits the function early. // This prevents the heartbeat timer from permanently dying (see #45772). @@ -1127,8 +1133,8 @@ export function startHeartbeatRunner(opts: { deps: { runtime: state.runtime }, }); } catch (err) { - // If runOnce throws (e.g. during session compaction), we must still - // advance the timer and call scheduleNext so heartbeats keep firing. + // If runOnce throws (e.g. during session compaction), advance the + // agent's schedule so the next interval is computed correctly. const errMsg = formatErrorMessage(err); log.error(`heartbeat runner: runOnce threw unexpectedly: ${errMsg}`, { error: errMsg }); advanceAgentSchedule(agent, now); @@ -1136,9 +1142,11 @@ export function startHeartbeatRunner(opts: { } if (res.status === "skipped" && res.reason === "requests-in-flight") { // Do not advance the schedule — the main lane is busy and the wake - // layer will retry shortly (DEFAULT_RETRY_MS = 1 s). Calling - // scheduleNext() here would register a 0 ms timer that races with - // the wake layer's 1 s retry and wins, bypassing the cooldown. + // layer will retry shortly (DEFAULT_RETRY_MS = 1 s). We must also + // suppress scheduleNext() in the finally block because it would + // register a 0 ms timer that races with the wake layer's retry + // cooldown and defeats the backoff. + skipFinalSchedule = true; return res; } if (res.status !== "skipped" || res.reason !== "disabled") { @@ -1154,11 +1162,14 @@ export function startHeartbeatRunner(opts: { } return { status: "skipped", reason: isInterval ? "not-due" : "disabled" }; } finally { - // Always re-arm the timer regardless of how the run exits. + // Always re-arm the timer regardless of how the run exits — unless the + // wake layer is handling rescheduling (requests-in-flight path). // This is the critical fix for #45772: without this, any unhandled // rejection or unexpected early exit permanently kills the heartbeat // timer because scheduleNext() is never called. - scheduleNext(); + if (!skipFinalSchedule) { + scheduleNext(); + } } }; From 3ff0d876c97fe8c9341dc3a314c4a6958029c5d7 Mon Sep 17 00:00:00 2001 From: Rene <23291787+r3n3x@users.noreply.github.com> Date: Wed, 18 Mar 2026 07:35:36 +0800 Subject: [PATCH 3/5] =?UTF-8?q?test(heartbeat):=20clarify=20#45772=20regre?= =?UTF-8?q?ssion=20test=20=E2=80=94=20inner=20catch=20+=20outer=20finally?= =?UTF-8?q?=20path?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/infra/heartbeat-runner.scheduler.test.ts | 34 ++++++++------------ 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/src/infra/heartbeat-runner.scheduler.test.ts b/src/infra/heartbeat-runner.scheduler.test.ts index 104e9f45385..9df23ab4fad 100644 --- a/src/infra/heartbeat-runner.scheduler.test.ts +++ b/src/infra/heartbeat-runner.scheduler.test.ts @@ -284,42 +284,36 @@ describe("startHeartbeatRunner", () => { runner.stop(); }); - it("re-arms timer after run() rejects outside the per-agent try/catch (#45772)", async () => { + it("re-arms timer when runOnce throws — inner catch continues, outer finally reschedules (#45772)", async () => { useFakeHeartbeatTime(); - // The inner per-agent try/catch already handles runOnce rejections. - // The bug in #45772 is caused by failures that escape the inner catch — - // e.g. from session resolution, agent iteration, or async code between - // the inner catch and scheduleNext(). We simulate this by making the - // FIRST runOnce succeed (so the inner catch is not triggered) but then - // throwing from a second agent's runOnce, AND making the runner itself - // propagate the error (since the inner catch would normally swallow it). + // The inner per-agent try/catch swallows runOnce errors via `continue`. + // After the loop completes, the outer `finally` calls scheduleNext(). + // This ensures the timer is always re-armed even after an error — the + // critical fix for #45772 where the requests-in-flight early return (and + // other short-circuit exits) left scheduleNext() unreachable. // - // However, the simplest way to exercise the outer finally is to reject - // from the run() handler itself. Since run() is called by the wake layer, - // a rejection that escapes run() would previously leave no timer re-armed. - // The fix ensures the finally block catches this. + // Note: the inner catch IS what handles the thrown error here. The outer + // finally still runs on loop completion, re-arming the timer. This covers + // the same class of silent-death bugs as #45772 (any early exit or error + // path that previously bypassed the scheduleNext() call at the end of run). let callCount = 0; const runSpy = vi.fn().mockImplementation(async () => { callCount++; if (callCount === 1) { - // Simulate a rejection that escapes all inner error handling. - // Before the fix, this would permanently kill the heartbeat timer - // because scheduleNext() was never called. - throw new Error("unexpected async failure outside inner catch"); + throw new Error("simulated runOnce failure"); } return { status: "ran", durationMs: 1 }; }); const runner = startDefaultRunner(runSpy); - // First heartbeat fires and throws — the outer finally must re-arm. + // First heartbeat fires and throws — inner catch handles it, outer + // finally re-arms the timer. await vi.advanceTimersByTimeAsync(30 * 60_000 + 1_000); expect(runSpy).toHaveBeenCalledTimes(1); - // Second heartbeat MUST still fire — the timer must have been re-armed - // by the finally block despite the rejection. This is the core assertion - // for #45772. + // Second heartbeat MUST still fire — the timer must have been re-armed. await vi.advanceTimersByTimeAsync(30 * 60_000 + 1_000); expect(runSpy).toHaveBeenCalledTimes(2); From 2f637ab816f288d536d7d0de3e5a123ce160fcc6 Mon Sep 17 00:00:00 2001 From: Rene <23291787+r3n3x@users.noreply.github.com> Date: Wed, 18 Mar 2026 09:51:44 +0800 Subject: [PATCH 4/5] ci: retrigger CI after infra flake From 7f22f798691d8a2c124d403fff535f7774796946 Mon Sep 17 00:00:00 2001 From: Rene <23291787+r3n3x@users.noreply.github.com> Date: Wed, 18 Mar 2026 16:40:09 +0800 Subject: [PATCH 5/5] ci: retrigger after infra flake (run 2)