From e316f68fd6e81c6e21e70c30a0432e662199b5bd Mon Sep 17 00:00:00 2001
From: Jason Wu
Date: Sat, 21 Mar 2026 10:47:50 +0800
Subject: [PATCH] fix(health-monitor): skip restart when account removed or monitor disabled post-drain

---
Review notes (ignored by `git am`, not part of the commit message):
- Context-line indentation was reconstructed from a whitespace-collapsed copy
  of this patch; use `git apply --ignore-whitespace` if it does not match.
- Follow-up: the `if (postDrainStatus)` wrapper that follows the new early
  `continue` is now always true and can be flattened in a later change.

 src/gateway/channel-health-monitor.test.ts | 75 +++++++++++++++++++++
 src/gateway/channel-health-monitor.ts      | 14 ++++
 2 files changed, 89 insertions(+)

diff --git a/src/gateway/channel-health-monitor.test.ts b/src/gateway/channel-health-monitor.test.ts
index 98739a10ada..c51e1da324f 100644
--- a/src/gateway/channel-health-monitor.test.ts
+++ b/src/gateway/channel-health-monitor.test.ts
@@ -671,6 +671,81 @@ describe("channel-health-monitor", () => {
     monitor.stop();
   });
 
+  it("skips restart when account is removed during drain", async () => {
+    const now = Date.now();
+    let callCount = 0;
+    const manager = createMockChannelManager({
+      getRuntimeSnapshot: vi.fn(() => {
+        callCount++;
+        // After the initial health check + first drain poll, simulate account removal.
+        const removed = callCount >= 3;
+        return snapshotWith(
+          removed
+            ? {}
+            : {
+                discord: {
+                  default: {
+                    running: true,
+                    connected: false,
+                    enabled: true,
+                    configured: true,
+                    lastStartAt: now - 300_000,
+                    // Runs stay in flight for as long as the account exists in the snapshot.
+                    activeRuns: 1,
+                    busy: true,
+                    lastRunActivityAt: now - 5_000,
+                  },
+                },
+              },
+        );
+      }),
+    });
+    const monitor = startDefaultMonitor(manager, { checkIntervalMs: DEFAULT_CHECK_INTERVAL_MS });
+    await vi.advanceTimersByTimeAsync(DEFAULT_CHECK_INTERVAL_MS + 1);
+    // Keep polling the drain until the snapshot no longer contains the account.
+    await vi.advanceTimersByTimeAsync(3_000);
+    expect(manager.stopChannel).not.toHaveBeenCalled();
+    expect(manager.startChannel).not.toHaveBeenCalled();
+    monitor.stop();
+  });
+
+  it("skips restart when health monitor is disabled during drain", async () => {
+    const now = Date.now();
+    let callCount = 0;
+    let monitorEnabled = true;
+    const manager = createMockChannelManager({
+      getRuntimeSnapshot: vi.fn(() => {
+        callCount++;
+        const drained = callCount >= 4;
+        return snapshotWith({
+          discord: {
+            default: {
+              running: true,
+              connected: false,
+              enabled: true,
+              configured: true,
+              lastStartAt: now - 300_000,
+              activeRuns: drained ? 0 : 1,
+              busy: !drained,
+              lastRunActivityAt: now - 5_000,
+            },
+          },
+        });
+      }),
+      // Reads the flag live so flipping it mid-drain simulates an operator hot-reload.
+      isHealthMonitorEnabled: vi.fn(() => monitorEnabled),
+    });
+    const monitor = startDefaultMonitor(manager, { checkIntervalMs: DEFAULT_CHECK_INTERVAL_MS });
+    await vi.advanceTimersByTimeAsync(DEFAULT_CHECK_INTERVAL_MS + 1);
+    // Disable health monitor while drain is in progress.
+    monitorEnabled = false;
+    // Drain finishes after a few polls.
+    await vi.advanceTimersByTimeAsync(3_000);
+    expect(manager.stopChannel).not.toHaveBeenCalled();
+    expect(manager.startChannel).not.toHaveBeenCalled();
+    monitor.stop();
+  });
+
   it("skips drain when channel has no active runs", async () => {
     const now = Date.now();
     const snapshotFn = vi.fn(() =>
diff --git a/src/gateway/channel-health-monitor.ts b/src/gateway/channel-health-monitor.ts
index 20eaf8c5dd6..65c95f2d473 100644
--- a/src/gateway/channel-health-monitor.ts
+++ b/src/gateway/channel-health-monitor.ts
@@ -253,6 +253,13 @@ export function startChannelHealthMonitor(deps: ChannelHealthMonitorDeps): Chann
           // or reconnected while we were waiting. Only proceed if still unhealthy.
           const postDrainSnap = channelManager.getRuntimeSnapshot();
           const postDrainStatus = postDrainSnap.channelAccounts[channelId]?.[accountId];
+          // Account was removed during drain (config hot-reload) — do not resurrect it.
+          if (!postDrainStatus) {
+            log.debug?.(
+              `[${channelId}:${accountId}] health-monitor: account removed during drain, skipping restart`,
+            );
+            continue;
+          }
           if (postDrainStatus) {
             const postDrainHealth = evaluateChannelHealth(postDrainStatus, {
               ...healthPolicy,
@@ -265,6 +272,13 @@ export function startChannelHealthMonitor(deps: ChannelHealthMonitorDeps): Chann
               continue;
             }
           }
+          // Re-check monitor enablement after drain — operator may have disabled it.
+          if (!channelManager.isHealthMonitorEnabled(channelId as ChannelId, accountId)) {
+            log.info?.(
+              `[${channelId}:${accountId}] health-monitor: monitor disabled during drain, skipping restart`,
+            );
+            continue;
+          }
           // Re-prune the hourly bucket with a fresh timestamp so that entries
           // which aged out during the drain window are not counted against the cap.
           pruneOldRestarts(record, Date.now());