fix(discord): retry startup with exponential backoff on transient network outage

When the Discord plugin starts during a network outage, the initial
WebSocket handshake fails. The existing code attempts one forced
reconnect after a 15s ready-timeout, then throws if that also times
out — leaving the channel permanently dead until someone manually
restarts the service.

This commit replaces the immediate throw with an exponential-backoff
retry loop (up to 8 additional attempts, with the delay before each
attempt doubling 15s → 30s → 60s → … up to a 5min cap) that mirrors
the behaviour of the post-connected reconnect-stall watchdog. Startup
now recovers on its own once the network comes back.

Fixes #51370
This commit is contained in:
zhuowater 2026-03-21 10:09:00 +08:00
parent 5e417b44e1
commit 8229cde856

View File

@ -364,20 +364,89 @@ export async function runDiscordGatewayLifecycle(params: {
return;
}
if (reconnected === "timeout" && !lifecycleStopping) {
const error = new Error(
`discord gateway did not reach READY within ${DISCORD_GATEWAY_READY_TIMEOUT_MS}ms after a forced reconnect`,
);
const startupFailureAt = Date.now();
pushStatus({
connected: false,
lastEventAt: startupFailureAt,
lastDisconnect: {
at: startupFailureAt,
error: "startup-reconnect-timeout",
},
lastError: error.message,
});
throw error;
// The forced reconnect also timed out — likely a transient network
// outage during startup. Rather than giving up immediately (which
// leaves the channel permanently dead until the gateway restarts),
// retry with exponential backoff, mirroring the post-connected
// reconnect-stall-watchdog behaviour.
const STARTUP_MAX_BACKOFF_ATTEMPTS = 8;
const STARTUP_BACKOFF_BASE_MS = DISCORD_GATEWAY_READY_TIMEOUT_MS;
const STARTUP_BACKOFF_MAX_MS = RECONNECT_STALL_TIMEOUT_MS;
let startupRecovered = false;
for (
let backoffAttempt = 1;
backoffAttempt <= STARTUP_MAX_BACKOFF_ATTEMPTS && !lifecycleStopping;
backoffAttempt++
) {
const delayMs = Math.min(
STARTUP_BACKOFF_BASE_MS * (1 << Math.min(backoffAttempt - 1, 5)),
STARTUP_BACKOFF_MAX_MS,
);
const stuckAt = Date.now();
params.runtime.error?.(
danger(
`discord: gateway startup stalled; backing off ${Math.round(delayMs / 1000)}s before retry (attempt ${backoffAttempt}/${STARTUP_MAX_BACKOFF_ATTEMPTS})`,
),
);
pushStatus({
connected: false,
lastEventAt: stuckAt,
lastDisconnect: {
at: stuckAt,
error: "startup-reconnect-timeout",
},
});
await new Promise<void>((resolve) => {
const t = setTimeout(resolve, delayMs);
t.unref?.();
params.abortSignal?.addEventListener("abort", () => {
clearTimeout(t);
resolve();
}, { once: true });
});
if (lifecycleStopping || params.abortSignal?.aborted) {
return;
}
clearResumeState();
gateway?.disconnect();
gateway?.connect(false);
const backoffReady = await waitForDiscordGatewayReady({
gateway,
abortSignal: params.abortSignal,
timeoutMs: DISCORD_GATEWAY_READY_TIMEOUT_MS,
beforePoll: drainPendingGatewayErrors,
});
if (backoffReady === "stopped" || lifecycleStopping) {
return;
}
if (backoffReady !== "timeout") {
startupRecovered = true;
break;
}
}
if (!startupRecovered && !lifecycleStopping) {
const error = new Error(
`discord gateway did not reach READY after ${STARTUP_MAX_BACKOFF_ATTEMPTS + 2} startup attempts`,
);
const startupFailureAt = Date.now();
pushStatus({
connected: false,
lastEventAt: startupFailureAt,
lastDisconnect: {
at: startupFailureAt,
error: "startup-reconnect-timeout",
},
lastError: error.message,
});
throw error;
}
}
}
}