Merge 0151b3ebb6740d529052f268505b2a69f77a888b into 5e417b44e1540f528d2ae63e3e20229a902d1db2
This commit is contained in:
commit
19dc28af97
165
src/cli/gateway-cli/crash-loop-guard.test.ts
Normal file
165
src/cli/gateway-cli/crash-loop-guard.test.ts
Normal file
@ -0,0 +1,165 @@
|
||||
import fs from "node:fs";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { afterEach, describe, expect, it, vi } from "vitest";
|
||||
import {
|
||||
applyCrashLoopGuard,
|
||||
clearGatewayCrashHistory,
|
||||
CrashLoopError,
|
||||
recordGatewayCrash,
|
||||
_TEST_ONLY,
|
||||
} from "./crash-loop-guard.js";
|
||||
|
||||
const { CRASH_WINDOW_MS, BACKOFF_THRESHOLD, MAX_CRASHES } = _TEST_ONLY;
|
||||
|
||||
function makeTempDir(): string {
|
||||
return fs.mkdtempSync(path.join(os.tmpdir(), "crash-guard-"));
|
||||
}
|
||||
|
||||
function rmDir(dir: string): void {
|
||||
fs.rmSync(dir, { recursive: true, force: true });
|
||||
}
|
||||
|
||||
describe("crash-loop-guard", () => {
|
||||
const dirs: string[] = [];
|
||||
afterEach(() => {
|
||||
for (const d of dirs) {
|
||||
rmDir(d);
|
||||
}
|
||||
dirs.length = 0;
|
||||
});
|
||||
|
||||
function tempDir(): string {
|
||||
const d = makeTempDir();
|
||||
dirs.push(d);
|
||||
return d;
|
||||
}
|
||||
|
||||
describe("recordGatewayCrash / clearGatewayCrashHistory", () => {
|
||||
it("records crash timestamps and clears them", () => {
|
||||
const dir = tempDir();
|
||||
recordGatewayCrash(dir, 1000);
|
||||
recordGatewayCrash(dir, 2000);
|
||||
|
||||
const raw = JSON.parse(
|
||||
fs.readFileSync(path.join(dir, "gateway-crash-history.json"), "utf-8"),
|
||||
);
|
||||
expect(raw.crashes).toEqual([1000, 2000]);
|
||||
|
||||
clearGatewayCrashHistory(dir);
|
||||
const cleared = JSON.parse(
|
||||
fs.readFileSync(path.join(dir, "gateway-crash-history.json"), "utf-8"),
|
||||
);
|
||||
expect(cleared.crashes).toEqual([]);
|
||||
});
|
||||
|
||||
it("prunes timestamps outside the window", () => {
|
||||
const dir = tempDir();
|
||||
const now = Date.now();
|
||||
const old = now - CRASH_WINDOW_MS - 1000;
|
||||
recordGatewayCrash(dir, old);
|
||||
recordGatewayCrash(dir, now);
|
||||
|
||||
const raw = JSON.parse(
|
||||
fs.readFileSync(path.join(dir, "gateway-crash-history.json"), "utf-8"),
|
||||
);
|
||||
expect(raw.crashes).toEqual([now]);
|
||||
});
|
||||
});
|
||||
|
||||
describe("applyCrashLoopGuard", () => {
|
||||
it("does nothing when crash history is empty", async () => {
|
||||
const dir = tempDir();
|
||||
const sleep = vi.fn(async () => {});
|
||||
await applyCrashLoopGuard({
|
||||
stateDir: dir,
|
||||
sleep,
|
||||
logger: { warn: vi.fn(), error: vi.fn() },
|
||||
});
|
||||
expect(sleep).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("does nothing for fewer than BACKOFF_THRESHOLD crashes", async () => {
|
||||
const dir = tempDir();
|
||||
const now = Date.now();
|
||||
for (let i = 0; i < BACKOFF_THRESHOLD - 1; i++) {
|
||||
recordGatewayCrash(dir, now - i * 1000);
|
||||
}
|
||||
|
||||
const sleep = vi.fn(async () => {});
|
||||
await applyCrashLoopGuard({
|
||||
stateDir: dir,
|
||||
now: () => now,
|
||||
sleep,
|
||||
logger: { warn: vi.fn(), error: vi.fn() },
|
||||
});
|
||||
expect(sleep).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("applies 30s backoff for 4-6 crashes", async () => {
|
||||
const dir = tempDir();
|
||||
const now = Date.now();
|
||||
for (let i = 0; i < 5; i++) {
|
||||
recordGatewayCrash(dir, now - i * 1000);
|
||||
}
|
||||
|
||||
const sleep = vi.fn(async () => {});
|
||||
const warn = vi.fn();
|
||||
await applyCrashLoopGuard({
|
||||
stateDir: dir,
|
||||
now: () => now,
|
||||
sleep,
|
||||
logger: { warn, error: vi.fn() },
|
||||
});
|
||||
expect(sleep).toHaveBeenCalledWith(30_000);
|
||||
expect(warn).toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("applies 5-min backoff for 7+ crashes", async () => {
|
||||
const dir = tempDir();
|
||||
const now = Date.now();
|
||||
for (let i = 0; i < 8; i++) {
|
||||
recordGatewayCrash(dir, now - i * 1000);
|
||||
}
|
||||
|
||||
const sleep = vi.fn(async () => {});
|
||||
await applyCrashLoopGuard({
|
||||
stateDir: dir,
|
||||
now: () => now,
|
||||
sleep,
|
||||
logger: { warn: vi.fn(), error: vi.fn() },
|
||||
});
|
||||
expect(sleep).toHaveBeenCalledWith(5 * 60 * 1000);
|
||||
});
|
||||
|
||||
it("throws CrashLoopError after MAX_CRASHES", async () => {
|
||||
const dir = tempDir();
|
||||
const now = Date.now();
|
||||
for (let i = 0; i < MAX_CRASHES; i++) {
|
||||
recordGatewayCrash(dir, now - i * 1000);
|
||||
}
|
||||
|
||||
await expect(
|
||||
applyCrashLoopGuard({
|
||||
stateDir: dir,
|
||||
now: () => now,
|
||||
sleep: vi.fn(async () => {}),
|
||||
logger: { warn: vi.fn(), error: vi.fn() },
|
||||
}),
|
||||
).rejects.toThrow(CrashLoopError);
|
||||
});
|
||||
|
||||
it("ignores corrupt history file gracefully", async () => {
|
||||
const dir = tempDir();
|
||||
fs.writeFileSync(path.join(dir, "gateway-crash-history.json"), "NOT JSON", "utf-8");
|
||||
|
||||
const sleep = vi.fn(async () => {});
|
||||
await applyCrashLoopGuard({
|
||||
stateDir: dir,
|
||||
sleep,
|
||||
logger: { warn: vi.fn(), error: vi.fn() },
|
||||
});
|
||||
expect(sleep).not.toHaveBeenCalled();
|
||||
});
|
||||
});
|
||||
});
|
||||
166
src/cli/gateway-cli/crash-loop-guard.ts
Normal file
166
src/cli/gateway-cli/crash-loop-guard.ts
Normal file
@ -0,0 +1,166 @@
|
||||
import fs from "node:fs";
|
||||
import path from "node:path";
|
||||
|
||||
/**
|
||||
* Crash-loop protection for the gateway process.
|
||||
*
|
||||
* Tracks recent crash timestamps in a lightweight JSON file inside the state
|
||||
* directory and applies an escalating backoff schedule when the gateway keeps
|
||||
* crashing within a sliding window:
|
||||
*
|
||||
* 1-3 crashes → immediate restart (current behaviour)
|
||||
* 4-6 crashes → 30 s delay
|
||||
* 7-9 crashes → 5 min delay
|
||||
* 10+ crashes → refuse to start, write crash report
|
||||
*
|
||||
* See issue #16810.
|
||||
*/
|
||||
|
||||
const CRASH_HISTORY_FILENAME = "gateway-crash-history.json";
|
||||
const CRASH_WINDOW_MS = 15 * 60 * 1000; // 15-minute sliding window
|
||||
const BACKOFF_THRESHOLD = 3;
|
||||
const MAX_CRASHES = 10;
|
||||
|
||||
const BACKOFF_TIERS: ReadonlyArray<{ minCrashes: number; delayMs: number }> = [
|
||||
{ minCrashes: 7, delayMs: 5 * 60 * 1000 },
|
||||
{ minCrashes: 4, delayMs: 30 * 1000 },
|
||||
];
|
||||
|
||||
export interface CrashHistory {
|
||||
crashes: number[];
|
||||
}
|
||||
|
||||
export interface CrashLoopGuardDeps {
|
||||
stateDir: string;
|
||||
now?: () => number;
|
||||
sleep?: (ms: number) => Promise<void>;
|
||||
logger: { warn: (msg: string) => void; error: (msg: string) => void };
|
||||
}
|
||||
|
||||
function crashHistoryPath(stateDir: string): string {
|
||||
return path.join(stateDir, CRASH_HISTORY_FILENAME);
|
||||
}
|
||||
|
||||
function readCrashHistory(filePath: string): CrashHistory {
|
||||
try {
|
||||
const raw = fs.readFileSync(filePath, "utf-8");
|
||||
const parsed = JSON.parse(raw) as CrashHistory;
|
||||
if (Array.isArray(parsed?.crashes)) {
|
||||
return { crashes: parsed.crashes.filter((t) => typeof t === "number") };
|
||||
}
|
||||
} catch {
|
||||
// missing or corrupt — start fresh
|
||||
}
|
||||
return { crashes: [] };
|
||||
}
|
||||
|
||||
function writeCrashHistory(filePath: string, history: CrashHistory): void {
|
||||
try {
|
||||
fs.mkdirSync(path.dirname(filePath), { recursive: true });
|
||||
fs.writeFileSync(filePath, JSON.stringify(history, null, 2), "utf-8");
|
||||
} catch {
|
||||
// best-effort — don't let crash tracking itself break startup
|
||||
}
|
||||
}
|
||||
|
||||
function resolveBackoffMs(recentCount: number): number {
|
||||
for (const tier of BACKOFF_TIERS) {
|
||||
if (recentCount >= tier.minCrashes) {
|
||||
return tier.delayMs;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Record a crash timestamp. Safe to call from a synchronous
|
||||
* `process.on('exit')` handler because it uses only sync I/O.
|
||||
*/
|
||||
export function recordGatewayCrash(stateDir: string, now: number = Date.now()): void {
|
||||
const filePath = crashHistoryPath(stateDir);
|
||||
const history = readCrashHistory(filePath);
|
||||
history.crashes.push(now);
|
||||
const cutoff = now - CRASH_WINDOW_MS;
|
||||
history.crashes = history.crashes.filter((t) => t > cutoff);
|
||||
writeCrashHistory(filePath, history);
|
||||
}
|
||||
|
||||
/**
|
||||
* Clear the crash history (called after a healthy startup period).
|
||||
*/
|
||||
export function clearGatewayCrashHistory(stateDir: string): void {
|
||||
const filePath = crashHistoryPath(stateDir);
|
||||
writeCrashHistory(filePath, { crashes: [] });
|
||||
}
|
||||
|
||||
export class CrashLoopError extends Error {
|
||||
public readonly recentCrashCount: number;
|
||||
constructor(count: number, windowMinutes: number) {
|
||||
super(
|
||||
`CRASH LOOP DETECTED: ${count} crashes in the last ${windowMinutes} minutes. ` +
|
||||
`The gateway will not restart automatically. ` +
|
||||
`Fix the root cause, then run the gateway manually. ` +
|
||||
`To reset the crash counter immediately, delete the gateway-crash-history.json ` +
|
||||
`file in the state directory, or wait ${windowMinutes} minutes for it to expire.`,
|
||||
);
|
||||
this.name = "CrashLoopError";
|
||||
this.recentCrashCount = count;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Inspect crash history and apply backoff if needed. Throws
|
||||
* `CrashLoopError` when the crash count exceeds the hard limit.
|
||||
*/
|
||||
export async function applyCrashLoopGuard(deps: CrashLoopGuardDeps): Promise<void> {
|
||||
const now = (deps.now ?? Date.now)();
|
||||
const filePath = crashHistoryPath(deps.stateDir);
|
||||
const history = readCrashHistory(filePath);
|
||||
const cutoff = now - CRASH_WINDOW_MS;
|
||||
const recentCrashes = history.crashes.filter((t) => t > cutoff);
|
||||
|
||||
if (recentCrashes.length >= MAX_CRASHES) {
|
||||
const reportPath = path.join(deps.stateDir, "crash-report.json");
|
||||
try {
|
||||
fs.mkdirSync(path.dirname(reportPath), { recursive: true });
|
||||
fs.writeFileSync(
|
||||
reportPath,
|
||||
JSON.stringify(
|
||||
{
|
||||
detectedAt: new Date(now).toISOString(),
|
||||
recentCrashCount: recentCrashes.length,
|
||||
windowMinutes: CRASH_WINDOW_MS / 60_000,
|
||||
crashTimestamps: recentCrashes.map((t) => new Date(t).toISOString()),
|
||||
},
|
||||
null,
|
||||
2,
|
||||
),
|
||||
"utf-8",
|
||||
);
|
||||
deps.logger.error(`Crash report written to ${reportPath}`);
|
||||
} catch {
|
||||
// best-effort
|
||||
}
|
||||
throw new CrashLoopError(recentCrashes.length, CRASH_WINDOW_MS / 60_000);
|
||||
}
|
||||
|
||||
if (recentCrashes.length >= BACKOFF_THRESHOLD) {
|
||||
const delayMs = resolveBackoffMs(recentCrashes.length);
|
||||
if (delayMs > 0) {
|
||||
deps.logger.warn(
|
||||
`Crash-loop backoff: ${recentCrashes.length} crashes in the last ${CRASH_WINDOW_MS / 60_000} min — ` +
|
||||
`waiting ${delayMs / 1000}s before starting gateway.`,
|
||||
);
|
||||
const sleep = deps.sleep ?? ((ms: number) => new Promise((r) => setTimeout(r, ms)));
|
||||
await sleep(delayMs);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Re-export constants for testing
|
||||
export const _TEST_ONLY = {
|
||||
CRASH_WINDOW_MS,
|
||||
BACKOFF_THRESHOLD,
|
||||
MAX_CRASHES,
|
||||
BACKOFF_TIERS,
|
||||
} as const;
|
||||
@ -25,6 +25,12 @@ import { defaultRuntime } from "../../runtime.js";
|
||||
import { formatCliCommand } from "../command-format.js";
|
||||
import { inheritOptionFromParent } from "../command-options.js";
|
||||
import { forceFreePortAndWait, waitForPortBindable } from "../ports.js";
|
||||
import {
|
||||
applyCrashLoopGuard,
|
||||
CrashLoopError,
|
||||
clearGatewayCrashHistory,
|
||||
recordGatewayCrash,
|
||||
} from "./crash-loop-guard.js";
|
||||
import { ensureDevGatewayConfig } from "./dev.js";
|
||||
import { runGatewayLoop } from "./run-loop.js";
|
||||
import {
|
||||
@ -194,6 +200,21 @@ async function runGatewayCommand(opts: GatewayRunOpts) {
|
||||
process.env.OPENCLAW_RAW_STREAM_PATH = rawStreamPath;
|
||||
}
|
||||
|
||||
const stateDir = resolveStateDir(process.env);
|
||||
try {
|
||||
await applyCrashLoopGuard({
|
||||
stateDir,
|
||||
logger: gatewayLog,
|
||||
});
|
||||
} catch (err) {
|
||||
if (err instanceof CrashLoopError) {
|
||||
defaultRuntime.error(err.message);
|
||||
defaultRuntime.exit(1);
|
||||
return;
|
||||
}
|
||||
throw err;
|
||||
}
|
||||
|
||||
if (devMode) {
|
||||
await ensureDevGatewayConfig({ reset: Boolean(opts.reset) });
|
||||
}
|
||||
@ -418,16 +439,25 @@ async function runGatewayCommand(opts: GatewayRunOpts) {
|
||||
}
|
||||
: undefined;
|
||||
|
||||
process.on("exit", (code) => {
|
||||
if (code !== 0) {
|
||||
recordGatewayCrash(stateDir);
|
||||
}
|
||||
});
|
||||
|
||||
try {
|
||||
await runGatewayLoop({
|
||||
runtime: defaultRuntime,
|
||||
lockPort: port,
|
||||
start: async () =>
|
||||
await startGatewayServer(port, {
|
||||
start: async () => {
|
||||
const server = await startGatewayServer(port, {
|
||||
bind,
|
||||
auth: authOverride,
|
||||
tailscale: tailscaleOverride,
|
||||
}),
|
||||
});
|
||||
clearGatewayCrashHistory(stateDir);
|
||||
return server;
|
||||
},
|
||||
});
|
||||
} catch (err) {
|
||||
if (
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user