diff --git a/src/cli/gateway-cli/crash-loop-guard.test.ts b/src/cli/gateway-cli/crash-loop-guard.test.ts new file mode 100644 index 00000000000..338760d873b --- /dev/null +++ b/src/cli/gateway-cli/crash-loop-guard.test.ts @@ -0,0 +1,165 @@ +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { afterEach, describe, expect, it, vi } from "vitest"; +import { + applyCrashLoopGuard, + clearGatewayCrashHistory, + CrashLoopError, + recordGatewayCrash, + _TEST_ONLY, +} from "./crash-loop-guard.js"; + +const { CRASH_WINDOW_MS, BACKOFF_THRESHOLD, MAX_CRASHES } = _TEST_ONLY; + +function makeTempDir(): string { + return fs.mkdtempSync(path.join(os.tmpdir(), "crash-guard-")); +} + +function rmDir(dir: string): void { + fs.rmSync(dir, { recursive: true, force: true }); +} + +describe("crash-loop-guard", () => { + const dirs: string[] = []; + afterEach(() => { + for (const d of dirs) { + rmDir(d); + } + dirs.length = 0; + }); + + function tempDir(): string { + const d = makeTempDir(); + dirs.push(d); + return d; + } + + describe("recordGatewayCrash / clearGatewayCrashHistory", () => { + it("records crash timestamps and clears them", () => { + const dir = tempDir(); + recordGatewayCrash(dir, 1000); + recordGatewayCrash(dir, 2000); + + const raw = JSON.parse( + fs.readFileSync(path.join(dir, "gateway-crash-history.json"), "utf-8"), + ); + expect(raw.crashes).toEqual([1000, 2000]); + + clearGatewayCrashHistory(dir); + const cleared = JSON.parse( + fs.readFileSync(path.join(dir, "gateway-crash-history.json"), "utf-8"), + ); + expect(cleared.crashes).toEqual([]); + }); + + it("prunes timestamps outside the window", () => { + const dir = tempDir(); + const now = Date.now(); + const old = now - CRASH_WINDOW_MS - 1000; + recordGatewayCrash(dir, old); + recordGatewayCrash(dir, now); + + const raw = JSON.parse( + fs.readFileSync(path.join(dir, "gateway-crash-history.json"), "utf-8"), + ); + expect(raw.crashes).toEqual([now]); + }); + }); + + 
describe("applyCrashLoopGuard", () => { + it("does nothing when crash history is empty", async () => { + const dir = tempDir(); + const sleep = vi.fn(async () => {}); + await applyCrashLoopGuard({ + stateDir: dir, + sleep, + logger: { warn: vi.fn(), error: vi.fn() }, + }); + expect(sleep).not.toHaveBeenCalled(); + }); + + it("does nothing for fewer than BACKOFF_THRESHOLD crashes", async () => { + const dir = tempDir(); + const now = Date.now(); + for (let i = 0; i < BACKOFF_THRESHOLD - 1; i++) { + recordGatewayCrash(dir, now - i * 1000); + } + + const sleep = vi.fn(async () => {}); + await applyCrashLoopGuard({ + stateDir: dir, + now: () => now, + sleep, + logger: { warn: vi.fn(), error: vi.fn() }, + }); + expect(sleep).not.toHaveBeenCalled(); + }); + + it("applies 30s backoff for 4-6 crashes", async () => { + const dir = tempDir(); + const now = Date.now(); + for (let i = 0; i < 5; i++) { + recordGatewayCrash(dir, now - i * 1000); + } + + const sleep = vi.fn(async () => {}); + const warn = vi.fn(); + await applyCrashLoopGuard({ + stateDir: dir, + now: () => now, + sleep, + logger: { warn, error: vi.fn() }, + }); + expect(sleep).toHaveBeenCalledWith(30_000); + expect(warn).toHaveBeenCalled(); + }); + + it("applies 5-min backoff for 7+ crashes", async () => { + const dir = tempDir(); + const now = Date.now(); + for (let i = 0; i < 8; i++) { + recordGatewayCrash(dir, now - i * 1000); + } + + const sleep = vi.fn(async () => {}); + await applyCrashLoopGuard({ + stateDir: dir, + now: () => now, + sleep, + logger: { warn: vi.fn(), error: vi.fn() }, + }); + expect(sleep).toHaveBeenCalledWith(5 * 60 * 1000); + }); + + it("throws CrashLoopError after MAX_CRASHES", async () => { + const dir = tempDir(); + const now = Date.now(); + for (let i = 0; i < MAX_CRASHES; i++) { + recordGatewayCrash(dir, now - i * 1000); + } + + await expect( + applyCrashLoopGuard({ + stateDir: dir, + now: () => now, + sleep: vi.fn(async () => {}), + logger: { warn: vi.fn(), error: vi.fn() }, + }), + 
).rejects.toThrow(CrashLoopError); + }); + + it("ignores corrupt history file gracefully", async () => { + const dir = tempDir(); + fs.writeFileSync(path.join(dir, "gateway-crash-history.json"), "NOT JSON", "utf-8"); + + const sleep = vi.fn(async () => {}); + await applyCrashLoopGuard({ + stateDir: dir, + sleep, + logger: { warn: vi.fn(), error: vi.fn() }, + }); + expect(sleep).not.toHaveBeenCalled(); + }); + }); +}); diff --git a/src/cli/gateway-cli/crash-loop-guard.ts b/src/cli/gateway-cli/crash-loop-guard.ts new file mode 100644 index 00000000000..52660463dba --- /dev/null +++ b/src/cli/gateway-cli/crash-loop-guard.ts @@ -0,0 +1,166 @@ +import fs from "node:fs"; +import path from "node:path"; + +/** + * Crash-loop protection for the gateway process. + * + * Tracks recent crash timestamps in a lightweight JSON file inside the state + * directory and applies an escalating backoff schedule when the gateway keeps + * crashing within a sliding window: + * + * 1-3 crashes → immediate restart (current behaviour) + * 4-6 crashes → 30 s delay + * 7-9 crashes → 5 min delay + * 10+ crashes → refuse to start, write crash report + * + * See issue #16810. 
+ */
+
+const CRASH_HISTORY_FILENAME = "gateway-crash-history.json";
+const CRASH_WINDOW_MS = 15 * 60 * 1000; // 15-minute sliding window
+const BACKOFF_THRESHOLD = 3;
+const MAX_CRASHES = 10;
+
+const BACKOFF_TIERS: ReadonlyArray<{ minCrashes: number; delayMs: number }> = [
+  { minCrashes: 7, delayMs: 5 * 60 * 1000 },
+  { minCrashes: 4, delayMs: 30 * 1000 },
+];
+
+export interface CrashHistory {
+  crashes: number[];
+}
+
+export interface CrashLoopGuardDeps {
+  stateDir: string;
+  now?: () => number;
+  sleep?: (ms: number) => Promise<void>;
+  logger: { warn: (msg: string) => void; error: (msg: string) => void };
+}
+
+function crashHistoryPath(stateDir: string): string {
+  return path.join(stateDir, CRASH_HISTORY_FILENAME);
+}
+
+function readCrashHistory(filePath: string): CrashHistory {
+  try {
+    const raw = fs.readFileSync(filePath, "utf-8");
+    const parsed = JSON.parse(raw) as CrashHistory;
+    if (Array.isArray(parsed?.crashes)) {
+      return { crashes: parsed.crashes.filter((t) => typeof t === "number") };
+    }
+  } catch {
+    // missing or corrupt — start fresh
+  }
+  return { crashes: [] };
+}
+
+function writeCrashHistory(filePath: string, history: CrashHistory): void {
+  try {
+    fs.mkdirSync(path.dirname(filePath), { recursive: true });
+    fs.writeFileSync(filePath, JSON.stringify(history, null, 2), "utf-8");
+  } catch {
+    // best-effort — don't let crash tracking itself break startup
+  }
+}
+
+function resolveBackoffMs(recentCount: number): number {
+  for (const tier of BACKOFF_TIERS) {
+    if (recentCount >= tier.minCrashes) {
+      return tier.delayMs;
+    }
+  }
+  return 0;
+}
+
+/**
+ * Record a crash timestamp. Safe to call from a synchronous
+ * `process.on('exit')` handler because it uses only sync I/O.
+ */
+export function recordGatewayCrash(stateDir: string, now: number = Date.now()): void {
+  const filePath = crashHistoryPath(stateDir);
+  const history = readCrashHistory(filePath);
+  history.crashes.push(now);
+  const cutoff = now - CRASH_WINDOW_MS;
+  history.crashes = history.crashes.filter((t) => t > cutoff);
+  writeCrashHistory(filePath, history);
+}
+
+/**
+ * Clear the crash history (called once the gateway has started successfully).
+ */
+export function clearGatewayCrashHistory(stateDir: string): void {
+  const filePath = crashHistoryPath(stateDir);
+  writeCrashHistory(filePath, { crashes: [] });
+}
+
+export class CrashLoopError extends Error {
+  public readonly recentCrashCount: number;
+  constructor(count: number, windowMinutes: number) {
+    super(
+      `CRASH LOOP DETECTED: ${count} crashes in the last ${windowMinutes} minutes. ` +
+        `The gateway will not restart automatically. ` +
+        `Fix the root cause, then run the gateway manually. ` +
+        `To reset the crash counter immediately, delete the gateway-crash-history.json ` +
+        `file in the state directory, or wait ${windowMinutes} minutes for it to expire.`,
+    );
+    this.name = "CrashLoopError";
+    this.recentCrashCount = count;
+  }
+}
+
+/**
+ * Inspect crash history and apply backoff if needed. Throws
+ * `CrashLoopError` when the crash count exceeds the hard limit.
+ */
+export async function applyCrashLoopGuard(deps: CrashLoopGuardDeps): Promise<void> {
+  const now = (deps.now ??
Date.now)(); + const filePath = crashHistoryPath(deps.stateDir); + const history = readCrashHistory(filePath); + const cutoff = now - CRASH_WINDOW_MS; + const recentCrashes = history.crashes.filter((t) => t > cutoff); + + if (recentCrashes.length >= MAX_CRASHES) { + const reportPath = path.join(deps.stateDir, "crash-report.json"); + try { + fs.mkdirSync(path.dirname(reportPath), { recursive: true }); + fs.writeFileSync( + reportPath, + JSON.stringify( + { + detectedAt: new Date(now).toISOString(), + recentCrashCount: recentCrashes.length, + windowMinutes: CRASH_WINDOW_MS / 60_000, + crashTimestamps: recentCrashes.map((t) => new Date(t).toISOString()), + }, + null, + 2, + ), + "utf-8", + ); + deps.logger.error(`Crash report written to ${reportPath}`); + } catch { + // best-effort + } + throw new CrashLoopError(recentCrashes.length, CRASH_WINDOW_MS / 60_000); + } + + if (recentCrashes.length >= BACKOFF_THRESHOLD) { + const delayMs = resolveBackoffMs(recentCrashes.length); + if (delayMs > 0) { + deps.logger.warn( + `Crash-loop backoff: ${recentCrashes.length} crashes in the last ${CRASH_WINDOW_MS / 60_000} min — ` + + `waiting ${delayMs / 1000}s before starting gateway.`, + ); + const sleep = deps.sleep ?? 
((ms: number) => new Promise<void>((r) => setTimeout(r, ms)));
+      await sleep(delayMs);
+    }
+  }
+}
+
+// Re-export constants for testing
+export const _TEST_ONLY = {
+  CRASH_WINDOW_MS,
+  BACKOFF_THRESHOLD,
+  MAX_CRASHES,
+  BACKOFF_TIERS,
+} as const;
diff --git a/src/cli/gateway-cli/run.ts b/src/cli/gateway-cli/run.ts
index 0aa0e8ff36e..4dc9298ffa9 100644
--- a/src/cli/gateway-cli/run.ts
+++ b/src/cli/gateway-cli/run.ts
@@ -25,6 +25,12 @@ import { defaultRuntime } from "../../runtime.js";
 import { formatCliCommand } from "../command-format.js";
 import { inheritOptionFromParent } from "../command-options.js";
 import { forceFreePortAndWait, waitForPortBindable } from "../ports.js";
+import {
+  applyCrashLoopGuard,
+  CrashLoopError,
+  clearGatewayCrashHistory,
+  recordGatewayCrash,
+} from "./crash-loop-guard.js";
 import { ensureDevGatewayConfig } from "./dev.js";
 import { runGatewayLoop } from "./run-loop.js";
 import {
@@ -194,6 +200,21 @@ async function runGatewayCommand(opts: GatewayRunOpts) {
     process.env.OPENCLAW_RAW_STREAM_PATH = rawStreamPath;
   }
 
+  const stateDir = resolveStateDir(process.env);
+  try {
+    await applyCrashLoopGuard({
+      stateDir,
+      logger: gatewayLog,
+    });
+  } catch (err) {
+    if (err instanceof CrashLoopError) {
+      defaultRuntime.error(err.message);
+      defaultRuntime.exit(1);
+      return;
+    }
+    throw err;
+  }
+
   if (devMode) {
     await ensureDevGatewayConfig({ reset: Boolean(opts.reset) });
   }
@@ -418,16 +439,25 @@ async function runGatewayCommand(opts: GatewayRunOpts) {
     }
     : undefined;
 
+  process.on("exit", (code) => {
+    if (code !== 0) {
+      recordGatewayCrash(stateDir);
+    }
+  });
+
   try {
     await runGatewayLoop({
       runtime: defaultRuntime,
       lockPort: port,
-      start: async () =>
-        await startGatewayServer(port, {
+      start: async () => {
+        const server = await startGatewayServer(port, {
           bind,
           auth: authOverride,
           tailscale: tailscaleOverride,
-        }),
+        });
+        clearGatewayCrashHistory(stateDir);
+        return server;
+      },
     });
   } catch (err) {
     if (