From 6eb7b2dc9d024e5cd798335d6d5b1000fd81e4d7 Mon Sep 17 00:00:00 2001 From: Protocol-zero-0 <257158451+Protocol-zero-0@users.noreply.github.com> Date: Fri, 20 Feb 2026 15:19:23 +0000 Subject: [PATCH 1/4] feat(gateway): crash-loop protection with escalating backoff (#16810) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Track gateway crash timestamps in a lightweight state file and apply an escalating backoff schedule when restarts pile up within a 15-minute sliding window: 1-3 crashes → immediate restart (unchanged behaviour) 4-6 crashes → 30 s delay 7-9 crashes → 5 min delay 10+ crashes → refuse to start (CrashLoopError) On a successful startup the crash history is cleared so subsequent restarts are not penalised. Co-authored-by: Cursor --- src/cli/gateway-cli/crash-loop-guard.test.ts | 165 +++++++++++++++++++ src/cli/gateway-cli/crash-loop-guard.ts | 143 ++++++++++++++++ src/cli/gateway-cli/run.ts | 42 ++++- 3 files changed, 344 insertions(+), 6 deletions(-) create mode 100644 src/cli/gateway-cli/crash-loop-guard.test.ts create mode 100644 src/cli/gateway-cli/crash-loop-guard.ts diff --git a/src/cli/gateway-cli/crash-loop-guard.test.ts b/src/cli/gateway-cli/crash-loop-guard.test.ts new file mode 100644 index 00000000000..338760d873b --- /dev/null +++ b/src/cli/gateway-cli/crash-loop-guard.test.ts @@ -0,0 +1,165 @@ +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { afterEach, describe, expect, it, vi } from "vitest"; +import { + applyCrashLoopGuard, + clearGatewayCrashHistory, + CrashLoopError, + recordGatewayCrash, + _TEST_ONLY, +} from "./crash-loop-guard.js"; + +const { CRASH_WINDOW_MS, BACKOFF_THRESHOLD, MAX_CRASHES } = _TEST_ONLY; + +function makeTempDir(): string { + return fs.mkdtempSync(path.join(os.tmpdir(), "crash-guard-")); +} + +function rmDir(dir: string): void { + fs.rmSync(dir, { recursive: true, force: true }); +} + +describe("crash-loop-guard", () => { + const 
dirs: string[] = []; + afterEach(() => { + for (const d of dirs) { + rmDir(d); + } + dirs.length = 0; + }); + + function tempDir(): string { + const d = makeTempDir(); + dirs.push(d); + return d; + } + + describe("recordGatewayCrash / clearGatewayCrashHistory", () => { + it("records crash timestamps and clears them", () => { + const dir = tempDir(); + recordGatewayCrash(dir, 1000); + recordGatewayCrash(dir, 2000); + + const raw = JSON.parse( + fs.readFileSync(path.join(dir, "gateway-crash-history.json"), "utf-8"), + ); + expect(raw.crashes).toEqual([1000, 2000]); + + clearGatewayCrashHistory(dir); + const cleared = JSON.parse( + fs.readFileSync(path.join(dir, "gateway-crash-history.json"), "utf-8"), + ); + expect(cleared.crashes).toEqual([]); + }); + + it("prunes timestamps outside the window", () => { + const dir = tempDir(); + const now = Date.now(); + const old = now - CRASH_WINDOW_MS - 1000; + recordGatewayCrash(dir, old); + recordGatewayCrash(dir, now); + + const raw = JSON.parse( + fs.readFileSync(path.join(dir, "gateway-crash-history.json"), "utf-8"), + ); + expect(raw.crashes).toEqual([now]); + }); + }); + + describe("applyCrashLoopGuard", () => { + it("does nothing when crash history is empty", async () => { + const dir = tempDir(); + const sleep = vi.fn(async () => {}); + await applyCrashLoopGuard({ + stateDir: dir, + sleep, + logger: { warn: vi.fn(), error: vi.fn() }, + }); + expect(sleep).not.toHaveBeenCalled(); + }); + + it("does nothing for fewer than BACKOFF_THRESHOLD crashes", async () => { + const dir = tempDir(); + const now = Date.now(); + for (let i = 0; i < BACKOFF_THRESHOLD - 1; i++) { + recordGatewayCrash(dir, now - i * 1000); + } + + const sleep = vi.fn(async () => {}); + await applyCrashLoopGuard({ + stateDir: dir, + now: () => now, + sleep, + logger: { warn: vi.fn(), error: vi.fn() }, + }); + expect(sleep).not.toHaveBeenCalled(); + }); + + it("applies 30s backoff for 4-6 crashes", async () => { + const dir = tempDir(); + const now = 
Date.now(); + for (let i = 0; i < 5; i++) { + recordGatewayCrash(dir, now - i * 1000); + } + + const sleep = vi.fn(async () => {}); + const warn = vi.fn(); + await applyCrashLoopGuard({ + stateDir: dir, + now: () => now, + sleep, + logger: { warn, error: vi.fn() }, + }); + expect(sleep).toHaveBeenCalledWith(30_000); + expect(warn).toHaveBeenCalled(); + }); + + it("applies 5-min backoff for 7+ crashes", async () => { + const dir = tempDir(); + const now = Date.now(); + for (let i = 0; i < 8; i++) { + recordGatewayCrash(dir, now - i * 1000); + } + + const sleep = vi.fn(async () => {}); + await applyCrashLoopGuard({ + stateDir: dir, + now: () => now, + sleep, + logger: { warn: vi.fn(), error: vi.fn() }, + }); + expect(sleep).toHaveBeenCalledWith(5 * 60 * 1000); + }); + + it("throws CrashLoopError after MAX_CRASHES", async () => { + const dir = tempDir(); + const now = Date.now(); + for (let i = 0; i < MAX_CRASHES; i++) { + recordGatewayCrash(dir, now - i * 1000); + } + + await expect( + applyCrashLoopGuard({ + stateDir: dir, + now: () => now, + sleep: vi.fn(async () => {}), + logger: { warn: vi.fn(), error: vi.fn() }, + }), + ).rejects.toThrow(CrashLoopError); + }); + + it("ignores corrupt history file gracefully", async () => { + const dir = tempDir(); + fs.writeFileSync(path.join(dir, "gateway-crash-history.json"), "NOT JSON", "utf-8"); + + const sleep = vi.fn(async () => {}); + await applyCrashLoopGuard({ + stateDir: dir, + sleep, + logger: { warn: vi.fn(), error: vi.fn() }, + }); + expect(sleep).not.toHaveBeenCalled(); + }); + }); +}); diff --git a/src/cli/gateway-cli/crash-loop-guard.ts b/src/cli/gateway-cli/crash-loop-guard.ts new file mode 100644 index 00000000000..b517e4d50ea --- /dev/null +++ b/src/cli/gateway-cli/crash-loop-guard.ts @@ -0,0 +1,143 @@ +import fs from "node:fs"; +import path from "node:path"; + +/** + * Crash-loop protection for the gateway process. 
+ * + * Tracks recent crash timestamps in a lightweight JSON file inside the state + * directory and applies an escalating backoff schedule when the gateway keeps + * crashing within a sliding window: + * + * 1-3 crashes → immediate restart (current behaviour) + * 4-6 crashes → 30 s delay + * 7-9 crashes → 5 min delay + * 10+ crashes → refuse to start, write crash report + * + * See issue #16810. + */ + +const CRASH_HISTORY_FILENAME = "gateway-crash-history.json"; +const CRASH_WINDOW_MS = 15 * 60 * 1000; // 15-minute sliding window +const BACKOFF_THRESHOLD = 3; +const MAX_CRASHES = 10; + +const BACKOFF_TIERS: ReadonlyArray<{ minCrashes: number; delayMs: number }> = [ + { minCrashes: 7, delayMs: 5 * 60 * 1000 }, + { minCrashes: 4, delayMs: 30 * 1000 }, +]; + +export interface CrashHistory { + crashes: number[]; +} + +export interface CrashLoopGuardDeps { + stateDir: string; + now?: () => number; + sleep?: (ms: number) => Promise<void>; + logger: { warn: (msg: string) => void; error: (msg: string) => void }; +} + +function crashHistoryPath(stateDir: string): string { + return path.join(stateDir, CRASH_HISTORY_FILENAME); +} + +function readCrashHistory(filePath: string): CrashHistory { + try { + const raw = fs.readFileSync(filePath, "utf-8"); + const parsed = JSON.parse(raw) as CrashHistory; + if (Array.isArray(parsed?.crashes)) { + return { crashes: parsed.crashes.filter((t) => typeof t === "number") }; + } + } catch { + // missing or corrupt — start fresh + } + return { crashes: [] }; +} + +function writeCrashHistory(filePath: string, history: CrashHistory): void { + try { + fs.mkdirSync(path.dirname(filePath), { recursive: true }); + fs.writeFileSync(filePath, JSON.stringify(history, null, 2), "utf-8"); + } catch { + // best-effort — don't let crash tracking itself break startup + } +} + +function resolveBackoffMs(recentCount: number): number { + for (const tier of BACKOFF_TIERS) { + if (recentCount >= tier.minCrashes) { + return tier.delayMs; + } + } + return 0; +} + 
+/** + * Record a crash timestamp. Safe to call from a synchronous + * `process.on('exit')` handler because it uses only sync I/O. + */ +export function recordGatewayCrash(stateDir: string, now: number = Date.now()): void { + const filePath = crashHistoryPath(stateDir); + const history = readCrashHistory(filePath); + history.crashes.push(now); + const cutoff = now - CRASH_WINDOW_MS; + history.crashes = history.crashes.filter((t) => t > cutoff); + writeCrashHistory(filePath, history); +} + +/** + * Clear the crash history (called after a healthy startup period). + */ +export function clearGatewayCrashHistory(stateDir: string): void { + const filePath = crashHistoryPath(stateDir); + writeCrashHistory(filePath, { crashes: [] }); +} + +export class CrashLoopError extends Error { + public readonly recentCrashCount: number; + constructor(count: number, windowMinutes: number) { + super( + `CRASH LOOP DETECTED: ${count} crashes in the last ${windowMinutes} minutes. ` + + `The gateway will not restart automatically. ` + + `Fix the root cause, then run the gateway manually.`, + ); + this.name = "CrashLoopError"; + this.recentCrashCount = count; + } +} + +/** + * Inspect crash history and apply backoff if needed. Throws + * `CrashLoopError` when the crash count exceeds the hard limit. + */ +export async function applyCrashLoopGuard(deps: CrashLoopGuardDeps): Promise<void> { + const now = (deps.now ?? 
Date.now)(); + const filePath = crashHistoryPath(deps.stateDir); + const history = readCrashHistory(filePath); + const cutoff = now - CRASH_WINDOW_MS; + const recentCrashes = history.crashes.filter((t) => t > cutoff); + + if (recentCrashes.length >= MAX_CRASHES) { + throw new CrashLoopError(recentCrashes.length, CRASH_WINDOW_MS / 60_000); + } + + if (recentCrashes.length >= BACKOFF_THRESHOLD) { + const delayMs = resolveBackoffMs(recentCrashes.length); + if (delayMs > 0) { + deps.logger.warn( + `Crash-loop backoff: ${recentCrashes.length} crashes in the last ${CRASH_WINDOW_MS / 60_000} min — ` + + `waiting ${delayMs / 1000}s before starting gateway.`, + ); + const sleep = deps.sleep ?? ((ms: number) => new Promise((r) => setTimeout(r, ms))); + await sleep(delayMs); + } + } +} + +// Re-export constants for testing +export const _TEST_ONLY = { + CRASH_WINDOW_MS, + BACKOFF_THRESHOLD, + MAX_CRASHES, + BACKOFF_TIERS, +} as const; diff --git a/src/cli/gateway-cli/run.ts b/src/cli/gateway-cli/run.ts index 0aa0e8ff36e..6fd49112d30 100644 --- a/src/cli/gateway-cli/run.ts +++ b/src/cli/gateway-cli/run.ts @@ -1,8 +1,9 @@ -import fs from "node:fs"; -import path from "node:path"; import type { Command } from "commander"; import { readSecretFromFile } from "../../acp/secret-file.js"; +import fs from "node:fs"; +import path from "node:path"; import type { GatewayAuthMode, GatewayTailscaleMode } from "../../config/config.js"; +import type { GatewayWsLogStyle } from "../../gateway/ws-logging.js"; import { CONFIG_PATH, loadConfig, @@ -13,7 +14,6 @@ import { import { hasConfiguredSecretInput } from "../../config/types.secrets.js"; import { resolveGatewayAuth } from "../../gateway/auth.js"; import { startGatewayServer } from "../../gateway/server.js"; -import type { GatewayWsLogStyle } from "../../gateway/ws-logging.js"; import { setGatewayWsLogStyle } from "../../gateway/ws-logging.js"; import { setVerbose } from "../../globals.js"; import { GatewayLockError } from 
"../../infra/gateway-lock.js"; @@ -25,6 +25,12 @@ import { defaultRuntime } from "../../runtime.js"; import { formatCliCommand } from "../command-format.js"; import { inheritOptionFromParent } from "../command-options.js"; import { forceFreePortAndWait, waitForPortBindable } from "../ports.js"; +import { + applyCrashLoopGuard, + CrashLoopError, + clearGatewayCrashHistory, + recordGatewayCrash, +} from "./crash-loop-guard.js"; import { ensureDevGatewayConfig } from "./dev.js"; import { runGatewayLoop } from "./run-loop.js"; import { @@ -194,6 +200,27 @@ async function runGatewayCommand(opts: GatewayRunOpts) { process.env.OPENCLAW_RAW_STREAM_PATH = rawStreamPath; } + const stateDir = resolveStateDir(process.env); + try { + await applyCrashLoopGuard({ + stateDir, + logger: gatewayLog, + }); + } catch (err) { + if (err instanceof CrashLoopError) { + defaultRuntime.error(err.message); + defaultRuntime.exit(1); + return; + } + throw err; + } + + process.on("exit", (code) => { + if (code !== 0) { + recordGatewayCrash(stateDir); + } + }); + if (devMode) { await ensureDevGatewayConfig({ reset: Boolean(opts.reset) }); } @@ -422,12 +449,15 @@ async function runGatewayCommand(opts: GatewayRunOpts) { await runGatewayLoop({ runtime: defaultRuntime, lockPort: port, - start: async () => - await startGatewayServer(port, { + start: async () => { + const server = await startGatewayServer(port, { bind, auth: authOverride, tailscale: tailscaleOverride, - }), + }); + clearGatewayCrashHistory(stateDir); + return server; + }, }); } catch (err) { if ( From 430b320cd01b104cf4aa5a590589d9ab4a8e916b Mon Sep 17 00:00:00 2001 From: Protocol-zero-0 <257158451+Protocol-zero-0@users.noreply.github.com> Date: Fri, 20 Feb 2026 15:32:57 +0000 Subject: [PATCH 2/4] fix: register crash-recording exit handler after all validation passes Move the process.on('exit') handler that records gateway crashes to after all CLI validation checks (port, auth, bind, etc.) 
so that user configuration errors are not incorrectly counted as gateway crashes. Addresses review feedback from @greptile-apps. Co-authored-by: Cursor <cursoragent@cursor.com> --- src/cli/gateway-cli/run.ts | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/cli/gateway-cli/run.ts b/src/cli/gateway-cli/run.ts index 6fd49112d30..1e28050a2d3 100644 --- a/src/cli/gateway-cli/run.ts +++ b/src/cli/gateway-cli/run.ts @@ -215,12 +215,6 @@ async function runGatewayCommand(opts: GatewayRunOpts) { throw err; } - process.on("exit", (code) => { - if (code !== 0) { - recordGatewayCrash(stateDir); - } - }); - if (devMode) { await ensureDevGatewayConfig({ reset: Boolean(opts.reset) }); } @@ -445,6 +439,12 @@ async function runGatewayCommand(opts: GatewayRunOpts) { } : undefined; + process.on("exit", (code) => { + if (code !== 0) { + recordGatewayCrash(stateDir); + } + }); + try { await runGatewayLoop({ runtime: defaultRuntime, From b53c77ef664ee6ac695b9c6e5c0d0da488c03464 Mon Sep 17 00:00:00 2001 From: Protocol-zero-0 <257158451+Protocol-zero-0@users.noreply.github.com> Date: Fri, 20 Feb 2026 15:33:57 +0000 Subject: [PATCH 3/4] fix: write crash report on loop detection and add recovery hints Address review feedback: - Move exit handler registration to after all CLI validation passes so user config errors (bad port, auth, bind) are not counted as crashes. - Write a crash-report.json with timestamps when the hard crash limit is reached. - Add recovery instructions to the CrashLoopError message (delete the history file or wait for the window to expire). 
Co-authored-by: Cursor <cursoragent@cursor.com> --- src/cli/gateway-cli/crash-loop-guard.ts | 25 ++++++++++++++++++++++++- src/cli/gateway-cli/run.ts | 2 +- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/src/cli/gateway-cli/crash-loop-guard.ts b/src/cli/gateway-cli/crash-loop-guard.ts index b517e4d50ea..52660463dba 100644 --- a/src/cli/gateway-cli/crash-loop-guard.ts +++ b/src/cli/gateway-cli/crash-loop-guard.ts @@ -99,7 +99,9 @@ export class CrashLoopError extends Error { super( `CRASH LOOP DETECTED: ${count} crashes in the last ${windowMinutes} minutes. ` + `The gateway will not restart automatically. ` + - `Fix the root cause, then run the gateway manually.`, + `Fix the root cause, then run the gateway manually. ` + + `To reset the crash counter immediately, delete the gateway-crash-history.json ` + + `file in the state directory, or wait ${windowMinutes} minutes for it to expire.`, ); this.name = "CrashLoopError"; this.recentCrashCount = count; @@ -118,6 +120,27 @@ export async function applyCrashLoopGuard(deps: CrashLoopGuardDeps): Promise<voi const recentCrashes = history.crashes.filter((t) => t > cutoff); if (recentCrashes.length >= MAX_CRASHES) { + const reportPath = path.join(deps.stateDir, "crash-report.json"); + try { + fs.mkdirSync(path.dirname(reportPath), { recursive: true }); + fs.writeFileSync( + reportPath, + JSON.stringify( + { + detectedAt: new Date(now).toISOString(), + recentCrashCount: recentCrashes.length, + windowMinutes: CRASH_WINDOW_MS / 60_000, + crashTimestamps: recentCrashes.map((t) => new Date(t).toISOString()), + }, + null, + 2, + ), + "utf-8", + ); + deps.logger.error(`Crash report written to ${reportPath}`); + } catch { + // best-effort + } throw new CrashLoopError(recentCrashes.length, CRASH_WINDOW_MS / 60_000); } diff --git a/src/cli/gateway-cli/run.ts b/src/cli/gateway-cli/run.ts index 1e28050a2d3..da3893f3cba 100644 --- a/src/cli/gateway-cli/run.ts +++ b/src/cli/gateway-cli/run.ts @@ -3,7 +3,6 @@ import { readSecretFromFile } from "../../acp/secret-file.js"; import fs from "node:fs"; import 
path from "node:path"; import type { GatewayAuthMode, GatewayTailscaleMode } from "../../config/config.js"; -import type { GatewayWsLogStyle } from "../../gateway/ws-logging.js"; import { CONFIG_PATH, loadConfig, @@ -14,6 +13,7 @@ import { import { hasConfiguredSecretInput } from "../../config/types.secrets.js"; import { resolveGatewayAuth } from "../../gateway/auth.js"; import { startGatewayServer } from "../../gateway/server.js"; +import type { GatewayWsLogStyle } from "../../gateway/ws-logging.js"; import { setGatewayWsLogStyle } from "../../gateway/ws-logging.js"; import { setVerbose } from "../../globals.js"; import { GatewayLockError } from "../../infra/gateway-lock.js"; From 0151b3ebb6740d529052f268505b2a69f77a888b Mon Sep 17 00:00:00 2001 From: Protocol-zero-0 <257158451+Protocol-zero-0@users.noreply.github.com> Date: Sun, 8 Mar 2026 10:23:43 +0000 Subject: [PATCH 4/4] style: fix crash loop guard formatting after rebase Normalize import ordering in the gateway CLI runner so the crash-loop PR passes the current format check on top of latest main. Made-with: Cursor --- src/cli/gateway-cli/run.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cli/gateway-cli/run.ts b/src/cli/gateway-cli/run.ts index da3893f3cba..4dc9298ffa9 100644 --- a/src/cli/gateway-cli/run.ts +++ b/src/cli/gateway-cli/run.ts @@ -1,7 +1,7 @@ -import type { Command } from "commander"; -import { readSecretFromFile } from "../../acp/secret-file.js"; import fs from "node:fs"; import path from "node:path"; +import type { Command } from "commander"; +import { readSecretFromFile } from "../../acp/secret-file.js"; import type { GatewayAuthMode, GatewayTailscaleMode } from "../../config/config.js"; import { CONFIG_PATH,