fix(discord): retry startup with exponential backoff on transient network outage
When the Discord plugin starts during a network outage, the initial WebSocket handshake fails. The existing code attempts one forced reconnect after a 15s ready-timeout, then throws if that also times out — leaving the channel permanently dead until someone manually restarts the service. This commit replaces the immediate throw with an exponential-backoff retry loop (up to 8 additional attempts, 15s → 30s → 60s → … → 5min per attempt) that mirrors the behaviour of the post-connected reconnect-stall watchdog. Recovery is self-healing once the network comes back. Fixes #51370
This commit is contained in:
parent
5e417b44e1
commit
8229cde856
@ -364,20 +364,89 @@ export async function runDiscordGatewayLifecycle(params: {
|
||||
return;
|
||||
}
|
||||
if (reconnected === "timeout" && !lifecycleStopping) {
|
||||
const error = new Error(
|
||||
`discord gateway did not reach READY within ${DISCORD_GATEWAY_READY_TIMEOUT_MS}ms after a forced reconnect`,
|
||||
);
|
||||
const startupFailureAt = Date.now();
|
||||
pushStatus({
|
||||
connected: false,
|
||||
lastEventAt: startupFailureAt,
|
||||
lastDisconnect: {
|
||||
at: startupFailureAt,
|
||||
error: "startup-reconnect-timeout",
|
||||
},
|
||||
lastError: error.message,
|
||||
});
|
||||
throw error;
|
||||
// The forced reconnect also timed out — likely a transient network
|
||||
// outage during startup. Rather than giving up immediately (which
|
||||
// leaves the channel permanently dead until the gateway restarts),
|
||||
// retry with exponential backoff, mirroring the post-connected
|
||||
// reconnect-stall-watchdog behaviour.
|
||||
const STARTUP_MAX_BACKOFF_ATTEMPTS = 8;
|
||||
const STARTUP_BACKOFF_BASE_MS = DISCORD_GATEWAY_READY_TIMEOUT_MS;
|
||||
const STARTUP_BACKOFF_MAX_MS = RECONNECT_STALL_TIMEOUT_MS;
|
||||
|
||||
let startupRecovered = false;
|
||||
for (
|
||||
let backoffAttempt = 1;
|
||||
backoffAttempt <= STARTUP_MAX_BACKOFF_ATTEMPTS && !lifecycleStopping;
|
||||
backoffAttempt++
|
||||
) {
|
||||
const delayMs = Math.min(
|
||||
STARTUP_BACKOFF_BASE_MS * (1 << Math.min(backoffAttempt - 1, 5)),
|
||||
STARTUP_BACKOFF_MAX_MS,
|
||||
);
|
||||
const stuckAt = Date.now();
|
||||
params.runtime.error?.(
|
||||
danger(
|
||||
`discord: gateway startup stalled; backing off ${Math.round(delayMs / 1000)}s before retry (attempt ${backoffAttempt}/${STARTUP_MAX_BACKOFF_ATTEMPTS})`,
|
||||
),
|
||||
);
|
||||
pushStatus({
|
||||
connected: false,
|
||||
lastEventAt: stuckAt,
|
||||
lastDisconnect: {
|
||||
at: stuckAt,
|
||||
error: "startup-reconnect-timeout",
|
||||
},
|
||||
});
|
||||
|
||||
await new Promise<void>((resolve) => {
|
||||
const t = setTimeout(resolve, delayMs);
|
||||
t.unref?.();
|
||||
params.abortSignal?.addEventListener("abort", () => {
|
||||
clearTimeout(t);
|
||||
resolve();
|
||||
}, { once: true });
|
||||
});
|
||||
|
||||
if (lifecycleStopping || params.abortSignal?.aborted) {
|
||||
return;
|
||||
}
|
||||
|
||||
clearResumeState();
|
||||
gateway?.disconnect();
|
||||
gateway?.connect(false);
|
||||
|
||||
const backoffReady = await waitForDiscordGatewayReady({
|
||||
gateway,
|
||||
abortSignal: params.abortSignal,
|
||||
timeoutMs: DISCORD_GATEWAY_READY_TIMEOUT_MS,
|
||||
beforePoll: drainPendingGatewayErrors,
|
||||
});
|
||||
|
||||
if (backoffReady === "stopped" || lifecycleStopping) {
|
||||
return;
|
||||
}
|
||||
if (backoffReady !== "timeout") {
|
||||
startupRecovered = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!startupRecovered && !lifecycleStopping) {
|
||||
const error = new Error(
|
||||
`discord gateway did not reach READY after ${STARTUP_MAX_BACKOFF_ATTEMPTS + 2} startup attempts`,
|
||||
);
|
||||
const startupFailureAt = Date.now();
|
||||
pushStatus({
|
||||
connected: false,
|
||||
lastEventAt: startupFailureAt,
|
||||
lastDisconnect: {
|
||||
at: startupFailureAt,
|
||||
error: "startup-reconnect-timeout",
|
||||
},
|
||||
lastError: error.message,
|
||||
});
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user