fix(telegram): polling stall recovery fails when grammY retries mask the stall

The polling watchdog tracks getUpdates call initiation, not successful
completion. When the watchdog triggers a restart but recovery fails,
grammY's internal retry mechanism continues making failed getUpdates
calls at intervals shorter than the 90s stall threshold. Each failed
attempt updates lastGetUpdatesAt, fooling the watchdog into thinking
polling is healthy.

This caused a 50-minute outage where the gateway process was alive
(health endpoint returning ok) but Telegram was completely deaf.

Three fixes:
1. Track getUpdates success, not initiation - await prev() before
   updating the timestamp so failed calls don't reset the clock
2. Reset restartAttempts on successful getUpdates - prevents permanent
   backoff growth after genuine recovery
3. Escalate on the 5th consecutive stall detection without an
   intervening successful getUpdates - instead of attempting another
   restart, process.exit(1) lets the process manager (systemd/launchd)
   do a clean restart

Fixes #44595
This commit is contained in:
Tom Alison 2026-03-12 20:24:56 -07:00
parent e81442ac80
commit d7a6bed2ed

View File

@ -16,6 +16,7 @@ const TELEGRAM_POLL_RESTART_POLICY = {
const POLL_STALL_THRESHOLD_MS = 90_000;
const POLL_WATCHDOG_INTERVAL_MS = 30_000;
const POLL_STOP_GRACE_MS = 15_000;
const MAX_CONSECUTIVE_POLL_RESTARTS = 5;
const waitForGracefulStop = async (stop: () => Promise<void>) => {
let timer: ReturnType<typeof setTimeout> | undefined;
@ -182,11 +183,15 @@ export class TelegramPollingSession {
await this.#confirmPersistedOffset(bot);
let lastGetUpdatesAt = Date.now();
bot.api.config.use((prev, method, payload, signal) => {
let consecutiveStallRestarts = 0;
bot.api.config.use(async (prev, method, payload, signal) => {
const result = await prev(method, payload, signal);
if (method === "getUpdates") {
lastGetUpdatesAt = Date.now();
this.#restartAttempts = 0;
consecutiveStallRestarts = 0;
}
return prev(method, payload, signal);
return result;
});
const runner = run(bot, this.opts.runnerOptions);
@ -227,9 +232,16 @@ export class TelegramPollingSession {
}
const elapsed = Date.now() - lastGetUpdatesAt;
if (elapsed > POLL_STALL_THRESHOLD_MS && runner.isRunning()) {
consecutiveStallRestarts += 1;
stalledRestart = true;
if (consecutiveStallRestarts >= MAX_CONSECUTIVE_POLL_RESTARTS) {
this.opts.log(
`[telegram] Polling recovery exhausted after ${consecutiveStallRestarts} consecutive stall restarts without successful getUpdates; escalating to process exit.`,
);
process.exit(1);
}
this.opts.log(
`[telegram] Polling stall detected (no getUpdates for ${formatDurationPrecise(elapsed)}); forcing restart.`,
`[telegram] Polling stall detected (no getUpdates for ${formatDurationPrecise(elapsed)}); forcing restart (attempt ${consecutiveStallRestarts}/${MAX_CONSECUTIVE_POLL_RESTARTS}).`,
);
void stopRunner();
void stopBot();