From 6eb7b2dc9d024e5cd798335d6d5b1000fd81e4d7 Mon Sep 17 00:00:00 2001 From: Protocol-zero-0 <257158451+Protocol-zero-0@users.noreply.github.com> Date: Fri, 20 Feb 2026 15:19:23 +0000 Subject: [PATCH] feat(gateway): crash-loop protection with escalating backoff (#16810) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Track gateway crash timestamps in a lightweight state file and apply an escalating backoff schedule when restarts pile up within a 15-minute sliding window: 1-3 crashes → immediate restart (unchanged behaviour) 4-6 crashes → 30 s delay 7-9 crashes → 5 min delay 10+ crashes → refuse to start (CrashLoopError) On a successful startup the crash history is cleared so subsequent restarts are not penalised. Co-authored-by: Cursor --- src/cli/gateway-cli/crash-loop-guard.test.ts | 165 +++++++++++++++++++ src/cli/gateway-cli/crash-loop-guard.ts | 143 ++++++++++++++++ src/cli/gateway-cli/run.ts | 42 ++++- 3 files changed, 344 insertions(+), 6 deletions(-) create mode 100644 src/cli/gateway-cli/crash-loop-guard.test.ts create mode 100644 src/cli/gateway-cli/crash-loop-guard.ts diff --git a/src/cli/gateway-cli/crash-loop-guard.test.ts b/src/cli/gateway-cli/crash-loop-guard.test.ts new file mode 100644 index 00000000000..338760d873b --- /dev/null +++ b/src/cli/gateway-cli/crash-loop-guard.test.ts @@ -0,0 +1,165 @@ +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { afterEach, describe, expect, it, vi } from "vitest"; +import { + applyCrashLoopGuard, + clearGatewayCrashHistory, + CrashLoopError, + recordGatewayCrash, + _TEST_ONLY, +} from "./crash-loop-guard.js"; + +const { CRASH_WINDOW_MS, BACKOFF_THRESHOLD, MAX_CRASHES } = _TEST_ONLY; + +function makeTempDir(): string { + return fs.mkdtempSync(path.join(os.tmpdir(), "crash-guard-")); +} + +function rmDir(dir: string): void { + fs.rmSync(dir, { recursive: true, force: true }); +} + +describe("crash-loop-guard", () => { + const 
dirs: string[] = []; + afterEach(() => { + for (const d of dirs) { + rmDir(d); + } + dirs.length = 0; + }); + + function tempDir(): string { + const d = makeTempDir(); + dirs.push(d); + return d; + } + + describe("recordGatewayCrash / clearGatewayCrashHistory", () => { + it("records crash timestamps and clears them", () => { + const dir = tempDir(); + recordGatewayCrash(dir, 1000); + recordGatewayCrash(dir, 2000); + + const raw = JSON.parse( + fs.readFileSync(path.join(dir, "gateway-crash-history.json"), "utf-8"), + ); + expect(raw.crashes).toEqual([1000, 2000]); + + clearGatewayCrashHistory(dir); + const cleared = JSON.parse( + fs.readFileSync(path.join(dir, "gateway-crash-history.json"), "utf-8"), + ); + expect(cleared.crashes).toEqual([]); + }); + + it("prunes timestamps outside the window", () => { + const dir = tempDir(); + const now = Date.now(); + const old = now - CRASH_WINDOW_MS - 1000; + recordGatewayCrash(dir, old); + recordGatewayCrash(dir, now); + + const raw = JSON.parse( + fs.readFileSync(path.join(dir, "gateway-crash-history.json"), "utf-8"), + ); + expect(raw.crashes).toEqual([now]); + }); + }); + + describe("applyCrashLoopGuard", () => { + it("does nothing when crash history is empty", async () => { + const dir = tempDir(); + const sleep = vi.fn(async () => {}); + await applyCrashLoopGuard({ + stateDir: dir, + sleep, + logger: { warn: vi.fn(), error: vi.fn() }, + }); + expect(sleep).not.toHaveBeenCalled(); + }); + + it("does nothing for fewer than BACKOFF_THRESHOLD crashes", async () => { + const dir = tempDir(); + const now = Date.now(); + for (let i = 0; i < BACKOFF_THRESHOLD - 1; i++) { + recordGatewayCrash(dir, now - i * 1000); + } + + const sleep = vi.fn(async () => {}); + await applyCrashLoopGuard({ + stateDir: dir, + now: () => now, + sleep, + logger: { warn: vi.fn(), error: vi.fn() }, + }); + expect(sleep).not.toHaveBeenCalled(); + }); + + it("applies 30s backoff for 4-6 crashes", async () => { + const dir = tempDir(); + const now = 
Date.now(); + for (let i = 0; i < 5; i++) { + recordGatewayCrash(dir, now - i * 1000); + } + + const sleep = vi.fn(async () => {}); + const warn = vi.fn(); + await applyCrashLoopGuard({ + stateDir: dir, + now: () => now, + sleep, + logger: { warn, error: vi.fn() }, + }); + expect(sleep).toHaveBeenCalledWith(30_000); + expect(warn).toHaveBeenCalled(); + }); + + it("applies 5-min backoff for 7+ crashes", async () => { + const dir = tempDir(); + const now = Date.now(); + for (let i = 0; i < 8; i++) { + recordGatewayCrash(dir, now - i * 1000); + } + + const sleep = vi.fn(async () => {}); + await applyCrashLoopGuard({ + stateDir: dir, + now: () => now, + sleep, + logger: { warn: vi.fn(), error: vi.fn() }, + }); + expect(sleep).toHaveBeenCalledWith(5 * 60 * 1000); + }); + + it("throws CrashLoopError after MAX_CRASHES", async () => { + const dir = tempDir(); + const now = Date.now(); + for (let i = 0; i < MAX_CRASHES; i++) { + recordGatewayCrash(dir, now - i * 1000); + } + + await expect( + applyCrashLoopGuard({ + stateDir: dir, + now: () => now, + sleep: vi.fn(async () => {}), + logger: { warn: vi.fn(), error: vi.fn() }, + }), + ).rejects.toThrow(CrashLoopError); + }); + + it("ignores corrupt history file gracefully", async () => { + const dir = tempDir(); + fs.writeFileSync(path.join(dir, "gateway-crash-history.json"), "NOT JSON", "utf-8"); + + const sleep = vi.fn(async () => {}); + await applyCrashLoopGuard({ + stateDir: dir, + sleep, + logger: { warn: vi.fn(), error: vi.fn() }, + }); + expect(sleep).not.toHaveBeenCalled(); + }); + }); +}); diff --git a/src/cli/gateway-cli/crash-loop-guard.ts b/src/cli/gateway-cli/crash-loop-guard.ts new file mode 100644 index 00000000000..b517e4d50ea --- /dev/null +++ b/src/cli/gateway-cli/crash-loop-guard.ts @@ -0,0 +1,143 @@ +import fs from "node:fs"; +import path from "node:path"; + +/** + * Crash-loop protection for the gateway process. 
+ *
+ * Tracks recent crash timestamps in a lightweight JSON file inside the state
+ * directory and applies an escalating backoff schedule when the gateway keeps
+ * crashing within a sliding window:
+ *
+ *   1-3 crashes  → immediate restart (current behaviour)
+ *   4-6 crashes  → 30 s delay
+ *   7-9 crashes  → 5 min delay
+ *   10+ crashes  → refuse to start (throws CrashLoopError)
+ *
+ * See issue #16810.
+ */
+
+const CRASH_HISTORY_FILENAME = "gateway-crash-history.json";
+const CRASH_WINDOW_MS = 15 * 60 * 1000; // 15-minute sliding window
+const BACKOFF_THRESHOLD = 3; // with fewer recent crashes the tier lookup is skipped entirely
+const MAX_CRASHES = 10; // at or above this many recent crashes startup is refused
+
+const BACKOFF_TIERS: ReadonlyArray<{ minCrashes: number; delayMs: number }> = [
+  { minCrashes: 7, delayMs: 5 * 60 * 1000 }, // keep sorted highest-first: first match wins
+  { minCrashes: 4, delayMs: 30 * 1000 },
+];
+
+export interface CrashHistory {
+  crashes: number[]; // crash timestamps in epoch milliseconds
+}
+
+export interface CrashLoopGuardDeps {
+  stateDir: string;
+  now?: () => number; // defaults to Date.now
+  sleep?: (ms: number) => Promise<void>; // defaults to a setTimeout-based wait
+  logger: { warn: (msg: string) => void; error: (msg: string) => void };
+}
+
+function crashHistoryPath(stateDir: string): string {
+  return path.join(stateDir, CRASH_HISTORY_FILENAME);
+}
+
+function readCrashHistory(filePath: string): CrashHistory {
+  try {
+    const raw = fs.readFileSync(filePath, "utf-8");
+    const parsed = JSON.parse(raw) as CrashHistory;
+    if (Array.isArray(parsed?.crashes)) {
+      return { crashes: parsed.crashes.filter((t) => Number.isFinite(t)) };
+    }
+  } catch {
+    // missing or corrupt — start fresh
+  }
+  return { crashes: [] };
+}
+
+function writeCrashHistory(filePath: string, history: CrashHistory): void {
+  try {
+    fs.mkdirSync(path.dirname(filePath), { recursive: true });
+    fs.writeFileSync(filePath, JSON.stringify(history, null, 2), "utf-8");
+  } catch {
+    // best-effort — don't let crash tracking itself break startup
+  }
+}
+
+function resolveBackoffMs(recentCount: number): number {
+  for (const tier of BACKOFF_TIERS) {
+    if (recentCount >= tier.minCrashes) {
+      return tier.delayMs;
+    }
+  }
+  return 0; // no tier matched: restart immediately
+}
+
+/**
+ * Record a crash timestamp. Safe to call from a synchronous
+ * `process.on('exit')` handler because it uses only sync I/O.
+ */
+export function recordGatewayCrash(stateDir: string, now: number = Date.now()): void {
+  const filePath = crashHistoryPath(stateDir);
+  const history = readCrashHistory(filePath);
+  history.crashes.push(now);
+  const cutoff = now - CRASH_WINDOW_MS;
+  history.crashes = history.crashes.filter((t) => t > cutoff); // drop entries outside the window
+  writeCrashHistory(filePath, history);
+}
+
+/**
+ * Clear the crash history (the run command calls this once the gateway server has started).
+ */
+export function clearGatewayCrashHistory(stateDir: string): void {
+  const filePath = crashHistoryPath(stateDir);
+  writeCrashHistory(filePath, { crashes: [] });
+}
+
+export class CrashLoopError extends Error {
+  public readonly recentCrashCount: number;
+  constructor(count: number, windowMinutes: number) {
+    super(
+      `CRASH LOOP DETECTED: ${count} crashes in the last ${windowMinutes} minutes. ` +
+        `The gateway will not restart automatically. ` +
+        `Fix the root cause, then run the gateway manually.`,
+    );
+    this.name = "CrashLoopError";
+    this.recentCrashCount = count;
+  }
+}
+
+/**
+ * Inspect crash history and apply backoff if needed. Throws
+ * `CrashLoopError` when the recent crash count reaches the hard limit.
+ */
+export async function applyCrashLoopGuard(deps: CrashLoopGuardDeps): Promise<void> {
+  const now = (deps.now ?? Date.now)();
+  const filePath = crashHistoryPath(deps.stateDir);
+  const history = readCrashHistory(filePath);
+  const cutoff = now - CRASH_WINDOW_MS;
+  const recentCrashes = history.crashes.filter((t) => t > cutoff);
+
+  if (recentCrashes.length >= MAX_CRASHES) {
+    throw new CrashLoopError(recentCrashes.length, CRASH_WINDOW_MS / 60_000);
+  }
+
+  if (recentCrashes.length >= BACKOFF_THRESHOLD) {
+    const delayMs = resolveBackoffMs(recentCrashes.length);
+    if (delayMs > 0) {
+      deps.logger.warn(
+        `Crash-loop backoff: ${recentCrashes.length} crashes in the last ${CRASH_WINDOW_MS / 60_000} min — ` +
+          `waiting ${delayMs / 1000}s before starting gateway.`,
+      );
+      const sleep = deps.sleep ?? ((ms: number) => new Promise<void>((r) => setTimeout(r, ms)));
+      await sleep(delayMs);
+    }
+  }
+}
+
+// Re-export constants for testing
+export const _TEST_ONLY = {
+  CRASH_WINDOW_MS,
+  BACKOFF_THRESHOLD,
+  MAX_CRASHES,
+  BACKOFF_TIERS,
+} as const;
diff --git a/src/cli/gateway-cli/run.ts b/src/cli/gateway-cli/run.ts
index 0aa0e8ff36e..6fd49112d30 100644
--- a/src/cli/gateway-cli/run.ts
+++ b/src/cli/gateway-cli/run.ts
@@ -1,8 +1,9 @@
-import fs from "node:fs";
-import path from "node:path";
 import type { Command } from "commander";
 import { readSecretFromFile } from "../../acp/secret-file.js";
+import fs from "node:fs";
+import path from "node:path";
 import type { GatewayAuthMode, GatewayTailscaleMode } from "../../config/config.js";
+import type { GatewayWsLogStyle } from "../../gateway/ws-logging.js";
 import {
   CONFIG_PATH,
   loadConfig,
@@ -13,7 +14,6 @@ import {
 import { hasConfiguredSecretInput } from "../../config/types.secrets.js";
 import { resolveGatewayAuth } from "../../gateway/auth.js";
 import { startGatewayServer } from "../../gateway/server.js";
-import type { GatewayWsLogStyle } from "../../gateway/ws-logging.js";
 import { setGatewayWsLogStyle } from "../../gateway/ws-logging.js";
 import { setVerbose } from "../../globals.js";
 import { GatewayLockError } from
"../../infra/gateway-lock.js"; @@ -25,6 +25,12 @@ import { defaultRuntime } from "../../runtime.js"; import { formatCliCommand } from "../command-format.js"; import { inheritOptionFromParent } from "../command-options.js"; import { forceFreePortAndWait, waitForPortBindable } from "../ports.js"; +import { + applyCrashLoopGuard, + CrashLoopError, + clearGatewayCrashHistory, + recordGatewayCrash, +} from "./crash-loop-guard.js"; import { ensureDevGatewayConfig } from "./dev.js"; import { runGatewayLoop } from "./run-loop.js"; import { @@ -194,6 +200,27 @@ async function runGatewayCommand(opts: GatewayRunOpts) { process.env.OPENCLAW_RAW_STREAM_PATH = rawStreamPath; } + const stateDir = resolveStateDir(process.env); + try { + await applyCrashLoopGuard({ + stateDir, + logger: gatewayLog, + }); + } catch (err) { + if (err instanceof CrashLoopError) { + defaultRuntime.error(err.message); + defaultRuntime.exit(1); + return; + } + throw err; + } + + process.on("exit", (code) => { + if (code !== 0) { + recordGatewayCrash(stateDir); + } + }); + if (devMode) { await ensureDevGatewayConfig({ reset: Boolean(opts.reset) }); } @@ -422,12 +449,15 @@ async function runGatewayCommand(opts: GatewayRunOpts) { await runGatewayLoop({ runtime: defaultRuntime, lockPort: port, - start: async () => - await startGatewayServer(port, { + start: async () => { + const server = await startGatewayServer(port, { bind, auth: authOverride, tailscale: tailscaleOverride, - }), + }); + clearGatewayCrashHistory(stateDir); + return server; + }, }); } catch (err) { if (