From d7a6bed2ed9554edb445a95e2dd3b23e348574a5 Mon Sep 17 00:00:00 2001 From: Tom Alison <6170+talison@users.noreply.github.com> Date: Thu, 12 Mar 2026 20:24:56 -0700 Subject: [PATCH 1/3] fix(telegram): polling stall recovery fails when grammY retries mask the stall The polling watchdog tracks getUpdates call initiation, not successful completion. When the watchdog triggers a restart but recovery fails, grammY's internal retry mechanism continues making failed getUpdates calls at intervals shorter than the 90s stall threshold. Each failed attempt updates lastGetUpdatesAt, fooling the watchdog into thinking polling is healthy. This caused a 50-minute outage where the gateway process was alive (health endpoint returning ok) but Telegram was completely deaf. Three fixes: 1. Track getUpdates success, not initiation - await prev() before updating the timestamp so failed calls don't reset the clock 2. Reset restartAttempts on successful getUpdates - prevents permanent backoff growth after genuine recovery 3. Escalate after 5 consecutive stall restarts - process.exit(1) lets the process manager (systemd/launchd) do a clean restart Fixes #44595 --- extensions/telegram/src/polling-session.ts | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/extensions/telegram/src/polling-session.ts b/extensions/telegram/src/polling-session.ts index 5506ce4e434..1f5c9c0dd97 100644 --- a/extensions/telegram/src/polling-session.ts +++ b/extensions/telegram/src/polling-session.ts @@ -16,6 +16,7 @@ const TELEGRAM_POLL_RESTART_POLICY = { const POLL_STALL_THRESHOLD_MS = 90_000; const POLL_WATCHDOG_INTERVAL_MS = 30_000; const POLL_STOP_GRACE_MS = 15_000; +const MAX_CONSECUTIVE_POLL_RESTARTS = 5; const waitForGracefulStop = async (stop: () => Promise) => { let timer: ReturnType | undefined; @@ -182,11 +183,15 @@ export class TelegramPollingSession { await this.#confirmPersistedOffset(bot); let lastGetUpdatesAt = Date.now(); - bot.api.config.use((prev, method, payload, signal) => { + let consecutiveStallRestarts = 0; + bot.api.config.use(async (prev, method, payload, signal) => { + const result = await prev(method, payload, signal); if (method === "getUpdates") { lastGetUpdatesAt = Date.now(); + this.#restartAttempts = 0; + consecutiveStallRestarts = 0; } - return prev(method, payload, signal); + return result; }); const runner = run(bot, this.opts.runnerOptions); @@ -227,9 +232,16 @@ export class TelegramPollingSession { } const elapsed = Date.now() - lastGetUpdatesAt; if (elapsed > POLL_STALL_THRESHOLD_MS && runner.isRunning()) { + consecutiveStallRestarts += 1; stalledRestart = true; + if (consecutiveStallRestarts >= MAX_CONSECUTIVE_POLL_RESTARTS) { + this.opts.log( + `[telegram] Polling recovery exhausted after ${consecutiveStallRestarts} consecutive stall restarts without successful getUpdates; escalating to process exit.`, + ); + process.exit(1); + } this.opts.log( - `[telegram] Polling stall detected (no getUpdates for ${formatDurationPrecise(elapsed)}); forcing restart.`, + `[telegram] Polling stall detected (no getUpdates for ${formatDurationPrecise(elapsed)}); forcing restart (attempt ${consecutiveStallRestarts}/${MAX_CONSECUTIVE_POLL_RESTARTS}).`, ); void stopRunner(); void stopBot(); From 687d0b4208ce965f865a1816de678341af4ecb36 Mon Sep 17 00:00:00 2001 From: Tom Alison <6170+talison@users.noreply.github.com> Date: Sat, 14 Mar 2026 14:07:49 -0700 Subject: [PATCH 2/3] fix: promote consecutiveStallRestarts to class field The counter was declared as a local variable inside #runPollingCycle(), resetting to 0 on every cycle restart. This made the escalation to process.exit(1) after MAX_CONSECUTIVE_POLL_RESTARTS dead code since the counter never accumulated across restarts. Promote to a private class field (#consecutiveStallRestarts) matching the pattern used by #restartAttempts, so stall restarts accumulate correctly and the process-exit escalation works as intended. --- extensions/telegram/src/polling-session.ts | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/extensions/telegram/src/polling-session.ts b/extensions/telegram/src/polling-session.ts index 1f5c9c0dd97..e8b059bbee5 100644 --- a/extensions/telegram/src/polling-session.ts +++ b/extensions/telegram/src/polling-session.ts @@ -52,6 +52,7 @@ type TelegramPollingSessionOpts = { export class TelegramPollingSession { #restartAttempts = 0; + #consecutiveStallRestarts = 0; #webhookCleared = false; #forceRestarted = false; #activeRunner: ReturnType | undefined; @@ -183,13 +184,12 @@ export class TelegramPollingSession { await this.#confirmPersistedOffset(bot); let lastGetUpdatesAt = Date.now(); - let consecutiveStallRestarts = 0; bot.api.config.use(async (prev, method, payload, signal) => { const result = await prev(method, payload, signal); if (method === "getUpdates") { lastGetUpdatesAt = Date.now(); this.#restartAttempts = 0; - consecutiveStallRestarts = 0; + this.#consecutiveStallRestarts = 0; } return result; }); @@ -232,16 +232,16 @@ export class TelegramPollingSession { } const elapsed = Date.now() - lastGetUpdatesAt; if (elapsed > POLL_STALL_THRESHOLD_MS && runner.isRunning()) { - consecutiveStallRestarts += 1; + this.#consecutiveStallRestarts += 1; stalledRestart = true; - if (consecutiveStallRestarts >= MAX_CONSECUTIVE_POLL_RESTARTS) { + if (this.#consecutiveStallRestarts >= MAX_CONSECUTIVE_POLL_RESTARTS) { this.opts.log( - `[telegram] Polling recovery exhausted after ${consecutiveStallRestarts} consecutive stall restarts without successful getUpdates; escalating to process exit.`, + `[telegram] Polling recovery exhausted after ${this.#consecutiveStallRestarts} consecutive stall restarts without successful getUpdates; escalating to process exit.`, ); process.exit(1); } this.opts.log( - `[telegram] Polling stall detected (no getUpdates for ${formatDurationPrecise(elapsed)}); forcing restart (attempt ${consecutiveStallRestarts}/${MAX_CONSECUTIVE_POLL_RESTARTS}).`, + `[telegram] Polling stall detected (no getUpdates for ${formatDurationPrecise(elapsed)}); forcing restart (attempt ${this.#consecutiveStallRestarts}/${MAX_CONSECUTIVE_POLL_RESTARTS}).`, ); void stopRunner(); void stopBot(); From afec11e21b846b1e5b7b5fb0eb637224173c0093 Mon Sep 17 00:00:00 2001 From: Tom Alison <6170+talison@users.noreply.github.com> Date: Sat, 14 Mar 2026 14:24:45 -0700 Subject: [PATCH 3/3] fix: off-by-one in stall restart escalation Change >= to > so the process exit fires after MAX_CONSECUTIVE_POLL_RESTARTS actual restart attempts rather than exiting on the Nth detection before the Nth restart has a chance to recover. A transient outage that would recover on the 5th retry no longer gets killed early. --- extensions/telegram/src/polling-session.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extensions/telegram/src/polling-session.ts b/extensions/telegram/src/polling-session.ts index e8b059bbee5..c8ebca6e4bc 100644 --- a/extensions/telegram/src/polling-session.ts +++ b/extensions/telegram/src/polling-session.ts @@ -234,7 +234,7 @@ export class TelegramPollingSession { if (elapsed > POLL_STALL_THRESHOLD_MS && runner.isRunning()) { this.#consecutiveStallRestarts += 1; stalledRestart = true; - if (this.#consecutiveStallRestarts >= MAX_CONSECUTIVE_POLL_RESTARTS) { + if (this.#consecutiveStallRestarts > MAX_CONSECUTIVE_POLL_RESTARTS) { this.opts.log( `[telegram] Polling recovery exhausted after ${this.#consecutiveStallRestarts} consecutive stall restarts without successful getUpdates; escalating to process exit.`, );