feat(gateway): crash-loop protection with escalating backoff (#16810)
Track gateway crash timestamps in a lightweight state file and apply an escalating backoff schedule when restarts pile up within a 15-minute sliding window: 1-3 crashes → immediate restart (unchanged behaviour) 4-6 crashes → 30 s delay 7-9 crashes → 5 min delay 10+ crashes → refuse to start (CrashLoopError) On a successful startup the crash history is cleared so subsequent restarts are not penalised. Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
ec2c6d83b9
commit
6eb7b2dc9d
165
src/cli/gateway-cli/crash-loop-guard.test.ts
Normal file
165
src/cli/gateway-cli/crash-loop-guard.test.ts
Normal file
@@ -0,0 +1,165 @@
|
||||
import fs from "node:fs";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { afterEach, describe, expect, it, vi } from "vitest";
|
||||
import {
|
||||
applyCrashLoopGuard,
|
||||
clearGatewayCrashHistory,
|
||||
CrashLoopError,
|
||||
recordGatewayCrash,
|
||||
_TEST_ONLY,
|
||||
} from "./crash-loop-guard.js";
|
||||
|
||||
// Pull the tuning constants straight from the implementation so the tests
// stay in sync if the backoff schedule is ever re-tuned.
const { CRASH_WINDOW_MS, BACKOFF_THRESHOLD, MAX_CRASHES } = _TEST_ONLY;
|
||||
|
||||
function makeTempDir(): string {
|
||||
return fs.mkdtempSync(path.join(os.tmpdir(), "crash-guard-"));
|
||||
}
|
||||
|
||||
function rmDir(dir: string): void {
|
||||
fs.rmSync(dir, { recursive: true, force: true });
|
||||
}
|
||||
|
||||
// End-to-end tests for the crash-loop guard. Every test writes real files
// into a private temp directory; afterEach removes each directory created
// through tempDir() so runs never leak state into one another.
describe("crash-loop-guard", () => {
  const dirs: string[] = [];
  afterEach(() => {
    for (const d of dirs) {
      rmDir(d);
    }
    dirs.length = 0;
  });

  // Create a temp state dir and register it for cleanup in afterEach.
  function tempDir(): string {
    const d = makeTempDir();
    dirs.push(d);
    return d;
  }

  describe("recordGatewayCrash / clearGatewayCrashHistory", () => {
    it("records crash timestamps and clears them", () => {
      const dir = tempDir();
      recordGatewayCrash(dir, 1000);
      recordGatewayCrash(dir, 2000);

      // Inspect the raw on-disk JSON rather than going through the reader,
      // so the file format itself is pinned by the test.
      const raw = JSON.parse(
        fs.readFileSync(path.join(dir, "gateway-crash-history.json"), "utf-8"),
      );
      expect(raw.crashes).toEqual([1000, 2000]);

      clearGatewayCrashHistory(dir);
      const cleared = JSON.parse(
        fs.readFileSync(path.join(dir, "gateway-crash-history.json"), "utf-8"),
      );
      expect(cleared.crashes).toEqual([]);
    });

    it("prunes timestamps outside the window", () => {
      const dir = tempDir();
      const now = Date.now();
      // A timestamp just past the sliding window — the next record call
      // should drop it when pruning against the current time.
      const old = now - CRASH_WINDOW_MS - 1000;
      recordGatewayCrash(dir, old);
      recordGatewayCrash(dir, now);

      const raw = JSON.parse(
        fs.readFileSync(path.join(dir, "gateway-crash-history.json"), "utf-8"),
      );
      expect(raw.crashes).toEqual([now]);
    });
  });

  describe("applyCrashLoopGuard", () => {
    it("does nothing when crash history is empty", async () => {
      const dir = tempDir();
      const sleep = vi.fn(async () => {});
      await applyCrashLoopGuard({
        stateDir: dir,
        sleep,
        logger: { warn: vi.fn(), error: vi.fn() },
      });
      expect(sleep).not.toHaveBeenCalled();
    });

    it("does nothing for fewer than BACKOFF_THRESHOLD crashes", async () => {
      const dir = tempDir();
      const now = Date.now();
      // Seed BACKOFF_THRESHOLD - 1 crashes, all inside the window.
      for (let i = 0; i < BACKOFF_THRESHOLD - 1; i++) {
        recordGatewayCrash(dir, now - i * 1000);
      }

      const sleep = vi.fn(async () => {});
      await applyCrashLoopGuard({
        stateDir: dir,
        now: () => now,
        sleep,
        logger: { warn: vi.fn(), error: vi.fn() },
      });
      expect(sleep).not.toHaveBeenCalled();
    });

    it("applies 30s backoff for 4-6 crashes", async () => {
      const dir = tempDir();
      const now = Date.now();
      // 5 crashes lands in the middle of the 4-6 tier.
      for (let i = 0; i < 5; i++) {
        recordGatewayCrash(dir, now - i * 1000);
      }

      const sleep = vi.fn(async () => {});
      const warn = vi.fn();
      await applyCrashLoopGuard({
        stateDir: dir,
        now: () => now,
        sleep,
        logger: { warn, error: vi.fn() },
      });
      expect(sleep).toHaveBeenCalledWith(30_000);
      // The operator should be told why startup is being delayed.
      expect(warn).toHaveBeenCalled();
    });

    it("applies 5-min backoff for 7+ crashes", async () => {
      const dir = tempDir();
      const now = Date.now();
      for (let i = 0; i < 8; i++) {
        recordGatewayCrash(dir, now - i * 1000);
      }

      const sleep = vi.fn(async () => {});
      await applyCrashLoopGuard({
        stateDir: dir,
        now: () => now,
        sleep,
        logger: { warn: vi.fn(), error: vi.fn() },
      });
      expect(sleep).toHaveBeenCalledWith(5 * 60 * 1000);
    });

    it("throws CrashLoopError after MAX_CRASHES", async () => {
      const dir = tempDir();
      const now = Date.now();
      for (let i = 0; i < MAX_CRASHES; i++) {
        recordGatewayCrash(dir, now - i * 1000);
      }

      await expect(
        applyCrashLoopGuard({
          stateDir: dir,
          now: () => now,
          sleep: vi.fn(async () => {}),
          logger: { warn: vi.fn(), error: vi.fn() },
        }),
      ).rejects.toThrow(CrashLoopError);
    });

    it("ignores corrupt history file gracefully", async () => {
      const dir = tempDir();
      // A file that fails JSON.parse must be treated as empty history,
      // not crash the guard or trigger backoff.
      fs.writeFileSync(path.join(dir, "gateway-crash-history.json"), "NOT JSON", "utf-8");

      const sleep = vi.fn(async () => {});
      await applyCrashLoopGuard({
        stateDir: dir,
        sleep,
        logger: { warn: vi.fn(), error: vi.fn() },
      });
      expect(sleep).not.toHaveBeenCalled();
    });
  });
});
|
||||
143
src/cli/gateway-cli/crash-loop-guard.ts
Normal file
143
src/cli/gateway-cli/crash-loop-guard.ts
Normal file
@@ -0,0 +1,143 @@
|
||||
import fs from "node:fs";
|
||||
import path from "node:path";
|
||||
|
||||
/**
|
||||
* Crash-loop protection for the gateway process.
|
||||
*
|
||||
* Tracks recent crash timestamps in a lightweight JSON file inside the state
|
||||
* directory and applies an escalating backoff schedule when the gateway keeps
|
||||
* crashing within a sliding window:
|
||||
*
|
||||
* 1-3 crashes → immediate restart (current behaviour)
|
||||
* 4-6 crashes → 30 s delay
|
||||
* 7-9 crashes → 5 min delay
|
||||
* 10+ crashes → refuse to start, write crash report
|
||||
*
|
||||
* See issue #16810.
|
||||
*/
|
||||
|
||||
// Name of the JSON state file kept inside the gateway state directory.
const CRASH_HISTORY_FILENAME = "gateway-crash-history.json";
// Crashes older than this sliding window are ignored (and pruned on write).
const CRASH_WINDOW_MS = 15 * 60 * 1000; // 15-minute sliding window
// At or above this count the backoff tiers are consulted. The lowest tier
// only starts at 4 crashes, so exactly 3 still restarts immediately — this
// check is just a cheap gate before the tier lookup.
const BACKOFF_THRESHOLD = 3;
// Hard limit: at this many recent crashes the gateway refuses to start.
const MAX_CRASHES = 10;

// Backoff tiers ordered from highest minCrashes to lowest — resolveBackoffMs
// returns the first tier whose minimum is met, so this order matters.
const BACKOFF_TIERS: ReadonlyArray<{ minCrashes: number; delayMs: number }> = [
  { minCrashes: 7, delayMs: 5 * 60 * 1000 },
  { minCrashes: 4, delayMs: 30 * 1000 },
];
|
||||
|
||||
/** Shape of the on-disk crash-history file. */
export interface CrashHistory {
  /** Epoch-millisecond timestamps of recent gateway crashes. */
  crashes: number[];
}
|
||||
|
||||
/** Dependencies for `applyCrashLoopGuard`; `now` and `sleep` are injectable for tests. */
export interface CrashLoopGuardDeps {
  /** Gateway state directory that holds the crash-history file. */
  stateDir: string;
  /** Clock override; defaults to `Date.now`. */
  now?: () => number;
  /** Delay function override; defaults to a `setTimeout`-based sleep. */
  sleep?: (ms: number) => Promise<void>;
  /** Minimal logger surface used to announce backoff delays. */
  logger: { warn: (msg: string) => void; error: (msg: string) => void };
}
|
||||
|
||||
function crashHistoryPath(stateDir: string): string {
|
||||
return path.join(stateDir, CRASH_HISTORY_FILENAME);
|
||||
}
|
||||
|
||||
function readCrashHistory(filePath: string): CrashHistory {
|
||||
try {
|
||||
const raw = fs.readFileSync(filePath, "utf-8");
|
||||
const parsed = JSON.parse(raw) as CrashHistory;
|
||||
if (Array.isArray(parsed?.crashes)) {
|
||||
return { crashes: parsed.crashes.filter((t) => typeof t === "number") };
|
||||
}
|
||||
} catch {
|
||||
// missing or corrupt — start fresh
|
||||
}
|
||||
return { crashes: [] };
|
||||
}
|
||||
|
||||
function writeCrashHistory(filePath: string, history: CrashHistory): void {
|
||||
try {
|
||||
fs.mkdirSync(path.dirname(filePath), { recursive: true });
|
||||
fs.writeFileSync(filePath, JSON.stringify(history, null, 2), "utf-8");
|
||||
} catch {
|
||||
// best-effort — don't let crash tracking itself break startup
|
||||
}
|
||||
}
|
||||
|
||||
function resolveBackoffMs(recentCount: number): number {
|
||||
for (const tier of BACKOFF_TIERS) {
|
||||
if (recentCount >= tier.minCrashes) {
|
||||
return tier.delayMs;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Record a crash timestamp. Safe to call from a synchronous
|
||||
* `process.on('exit')` handler because it uses only sync I/O.
|
||||
*/
|
||||
export function recordGatewayCrash(stateDir: string, now: number = Date.now()): void {
|
||||
const filePath = crashHistoryPath(stateDir);
|
||||
const history = readCrashHistory(filePath);
|
||||
history.crashes.push(now);
|
||||
const cutoff = now - CRASH_WINDOW_MS;
|
||||
history.crashes = history.crashes.filter((t) => t > cutoff);
|
||||
writeCrashHistory(filePath, history);
|
||||
}
|
||||
|
||||
/**
|
||||
* Clear the crash history (called after a healthy startup period).
|
||||
*/
|
||||
export function clearGatewayCrashHistory(stateDir: string): void {
|
||||
const filePath = crashHistoryPath(stateDir);
|
||||
writeCrashHistory(filePath, { crashes: [] });
|
||||
}
|
||||
|
||||
export class CrashLoopError extends Error {
|
||||
public readonly recentCrashCount: number;
|
||||
constructor(count: number, windowMinutes: number) {
|
||||
super(
|
||||
`CRASH LOOP DETECTED: ${count} crashes in the last ${windowMinutes} minutes. ` +
|
||||
`The gateway will not restart automatically. ` +
|
||||
`Fix the root cause, then run the gateway manually.`,
|
||||
);
|
||||
this.name = "CrashLoopError";
|
||||
this.recentCrashCount = count;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Inspect crash history and apply backoff if needed. Throws
|
||||
* `CrashLoopError` when the crash count exceeds the hard limit.
|
||||
*/
|
||||
export async function applyCrashLoopGuard(deps: CrashLoopGuardDeps): Promise<void> {
|
||||
const now = (deps.now ?? Date.now)();
|
||||
const filePath = crashHistoryPath(deps.stateDir);
|
||||
const history = readCrashHistory(filePath);
|
||||
const cutoff = now - CRASH_WINDOW_MS;
|
||||
const recentCrashes = history.crashes.filter((t) => t > cutoff);
|
||||
|
||||
if (recentCrashes.length >= MAX_CRASHES) {
|
||||
throw new CrashLoopError(recentCrashes.length, CRASH_WINDOW_MS / 60_000);
|
||||
}
|
||||
|
||||
if (recentCrashes.length >= BACKOFF_THRESHOLD) {
|
||||
const delayMs = resolveBackoffMs(recentCrashes.length);
|
||||
if (delayMs > 0) {
|
||||
deps.logger.warn(
|
||||
`Crash-loop backoff: ${recentCrashes.length} crashes in the last ${CRASH_WINDOW_MS / 60_000} min — ` +
|
||||
`waiting ${delayMs / 1000}s before starting gateway.`,
|
||||
);
|
||||
const sleep = deps.sleep ?? ((ms: number) => new Promise((r) => setTimeout(r, ms)));
|
||||
await sleep(delayMs);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Internal tuning constants re-exported solely for unit tests.
// Not part of the public API — do not import from production code.
export const _TEST_ONLY = {
  CRASH_WINDOW_MS,
  BACKOFF_THRESHOLD,
  MAX_CRASHES,
  BACKOFF_TIERS,
} as const;
|
||||
@ -1,8 +1,9 @@
|
||||
import fs from "node:fs";
|
||||
import path from "node:path";
|
||||
import type { Command } from "commander";
|
||||
import { readSecretFromFile } from "../../acp/secret-file.js";
|
||||
import fs from "node:fs";
|
||||
import path from "node:path";
|
||||
import type { GatewayAuthMode, GatewayTailscaleMode } from "../../config/config.js";
|
||||
import type { GatewayWsLogStyle } from "../../gateway/ws-logging.js";
|
||||
import {
|
||||
CONFIG_PATH,
|
||||
loadConfig,
|
||||
@ -13,7 +14,6 @@ import {
|
||||
import { hasConfiguredSecretInput } from "../../config/types.secrets.js";
|
||||
import { resolveGatewayAuth } from "../../gateway/auth.js";
|
||||
import { startGatewayServer } from "../../gateway/server.js";
|
||||
import type { GatewayWsLogStyle } from "../../gateway/ws-logging.js";
|
||||
import { setGatewayWsLogStyle } from "../../gateway/ws-logging.js";
|
||||
import { setVerbose } from "../../globals.js";
|
||||
import { GatewayLockError } from "../../infra/gateway-lock.js";
|
||||
@ -25,6 +25,12 @@ import { defaultRuntime } from "../../runtime.js";
|
||||
import { formatCliCommand } from "../command-format.js";
|
||||
import { inheritOptionFromParent } from "../command-options.js";
|
||||
import { forceFreePortAndWait, waitForPortBindable } from "../ports.js";
|
||||
import {
|
||||
applyCrashLoopGuard,
|
||||
CrashLoopError,
|
||||
clearGatewayCrashHistory,
|
||||
recordGatewayCrash,
|
||||
} from "./crash-loop-guard.js";
|
||||
import { ensureDevGatewayConfig } from "./dev.js";
|
||||
import { runGatewayLoop } from "./run-loop.js";
|
||||
import {
|
||||
@ -194,6 +200,27 @@ async function runGatewayCommand(opts: GatewayRunOpts) {
|
||||
process.env.OPENCLAW_RAW_STREAM_PATH = rawStreamPath;
|
||||
}
|
||||
|
||||
const stateDir = resolveStateDir(process.env);
|
||||
try {
|
||||
await applyCrashLoopGuard({
|
||||
stateDir,
|
||||
logger: gatewayLog,
|
||||
});
|
||||
} catch (err) {
|
||||
if (err instanceof CrashLoopError) {
|
||||
defaultRuntime.error(err.message);
|
||||
defaultRuntime.exit(1);
|
||||
return;
|
||||
}
|
||||
throw err;
|
||||
}
|
||||
|
||||
process.on("exit", (code) => {
|
||||
if (code !== 0) {
|
||||
recordGatewayCrash(stateDir);
|
||||
}
|
||||
});
|
||||
|
||||
if (devMode) {
|
||||
await ensureDevGatewayConfig({ reset: Boolean(opts.reset) });
|
||||
}
|
||||
@ -422,12 +449,15 @@ async function runGatewayCommand(opts: GatewayRunOpts) {
|
||||
await runGatewayLoop({
|
||||
runtime: defaultRuntime,
|
||||
lockPort: port,
|
||||
start: async () =>
|
||||
await startGatewayServer(port, {
|
||||
start: async () => {
|
||||
const server = await startGatewayServer(port, {
|
||||
bind,
|
||||
auth: authOverride,
|
||||
tailscale: tailscaleOverride,
|
||||
}),
|
||||
});
|
||||
clearGatewayCrashHistory(stateDir);
|
||||
return server;
|
||||
},
|
||||
});
|
||||
} catch (err) {
|
||||
if (
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user