fix(health-monitor): skip restart when account removed or monitor disabled post-drain

This commit is contained in:
Jason Wu 2026-03-21 10:47:50 +08:00
parent 8a9a9cbbe5
commit e316f68fd6
2 changed files with 93 additions and 0 deletions

View File

@ -671,6 +671,85 @@ describe("channel-health-monitor", () => {
monitor.stop();
});
// Verifies that a channel account which disappears from the runtime snapshot
// while the monitor is draining active runs is neither stopped nor restarted.
it("skips restart when account is removed during drain", async () => {
  const now = Date.now();
  let callCount = 0;
  const manager = createMockChannelManager({
    getRuntimeSnapshot: vi.fn(() => {
      callCount++;
      // From the third snapshot onwards the account no longer exists. The
      // intent is that this lands after the initial health check and the
      // first drain poll, simulating removal while the drain is in progress.
      if (callCount >= 3) {
        return snapshotWith({});
      }
      // While the account is still present it is running but disconnected,
      // with one in-flight run, so the monitor has work to drain first.
      return snapshotWith({
        discord: {
          default: {
            running: true,
            connected: false,
            enabled: true,
            configured: true,
            lastStartAt: now - 300_000,
            activeRuns: 1,
            busy: true,
            lastRunActivityAt: now - 5_000,
          },
        },
      });
    }),
  });
  const monitor = startDefaultMonitor(manager, { checkIntervalMs: DEFAULT_CHECK_INTERVAL_MS });
  // Trigger the first scheduled health check.
  await vi.advanceTimersByTimeAsync(DEFAULT_CHECK_INTERVAL_MS + 1);
  // Give the drain time to observe the account removal.
  await vi.advanceTimersByTimeAsync(3_000);
  // A removed account must not be stopped or resurrected.
  expect(manager.stopChannel).not.toHaveBeenCalled();
  expect(manager.startChannel).not.toHaveBeenCalled();
  monitor.stop();
});
// Verifies that turning the health monitor off while a drain is in progress
// prevents the pending stop/start cycle once the drain completes.
it("skips restart when health monitor is disabled during drain", async () => {
  const startedAt = Date.now();
  let snapshotCalls = 0;
  let monitorEnabled = true;

  // The account reports one in-flight run for the first three snapshots and
  // is idle from the fourth snapshot onwards; it stays disconnected throughout.
  const getRuntimeSnapshot = vi.fn(() => {
    snapshotCalls += 1;
    const stillBusy = snapshotCalls < 4;
    return snapshotWith({
      discord: {
        default: {
          running: true,
          connected: false,
          enabled: true,
          configured: true,
          lastStartAt: startedAt - 300_000,
          activeRuns: stillBusy ? 1 : 0,
          busy: stillBusy,
          lastRunActivityAt: startedAt - 5_000,
        },
      },
    });
  });

  // Mirrors the flag flipped below (simulating an operator hot-reload).
  const isHealthMonitorEnabled = vi.fn(() => monitorEnabled);

  const manager = createMockChannelManager({ getRuntimeSnapshot, isHealthMonitorEnabled });
  const monitor = startDefaultMonitor(manager, { checkIntervalMs: DEFAULT_CHECK_INTERVAL_MS });

  // Trigger the first scheduled health check.
  await vi.advanceTimersByTimeAsync(DEFAULT_CHECK_INTERVAL_MS + 1);

  // Switch the monitor off while the drain is still waiting on the active run.
  monitorEnabled = false;

  // Give the drain time to finish after a few polls.
  await vi.advanceTimersByTimeAsync(3_000);

  expect(manager.stopChannel).not.toHaveBeenCalled();
  expect(manager.startChannel).not.toHaveBeenCalled();
  monitor.stop();
});
it("skips drain when channel has no active runs", async () => {
const now = Date.now();
const snapshotFn = vi.fn(() =>

View File

@ -253,6 +253,13 @@ export function startChannelHealthMonitor(deps: ChannelHealthMonitorDeps): Chann
// or reconnected while we were waiting. Only proceed if still unhealthy.
const postDrainSnap = channelManager.getRuntimeSnapshot();
const postDrainStatus = postDrainSnap.channelAccounts[channelId]?.[accountId];
// Account was removed during drain (config hot-reload) — do not resurrect it.
// The post-drain snapshot has no entry for this channel/account pair, so the
// rest of this iteration (health re-evaluation and restart) is skipped.
// NOTE(review): because this guard `continue`s on a missing status, the
// `if (postDrainStatus)` check that follows is always true and could be
// flattened.
if (!postDrainStatus) {
log.debug?.(
`[${channelId}:${accountId}] health-monitor: account removed during drain, skipping restart`,
);
continue;
}
if (postDrainStatus) {
const postDrainHealth = evaluateChannelHealth(postDrainStatus, {
...healthPolicy,
@ -265,6 +272,13 @@ export function startChannelHealthMonitor(deps: ChannelHealthMonitorDeps): Chann
continue;
}
}
// Re-check monitor enablement after drain — operator may have disabled it.
// When the manager reports the monitor as disabled for this channel/account,
// the rest of this iteration is skipped, so the restart bookkeeping below
// (e.g. the hourly-bucket prune) does not run for it.
if (!channelManager.isHealthMonitorEnabled(channelId as ChannelId, accountId)) {
log.info?.(
`[${channelId}:${accountId}] health-monitor: monitor disabled during drain, skipping restart`,
);
continue;
}
// Re-prune the hourly bucket with a fresh timestamp so that entries
// which aged out during the drain window are not counted against the cap.
pruneOldRestarts(record, Date.now());