Merge 0151b3ebb6740d529052f268505b2a69f77a888b into 5e417b44e1540f528d2ae63e3e20229a902d1db2

This commit is contained in:
Protocol Zero 2026-03-21 05:00:31 +03:00 committed by GitHub
commit 19dc28af97
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 364 additions and 3 deletions

View File

@ -0,0 +1,165 @@
import fs from "node:fs";
import os from "node:os";
import path from "node:path";
import { afterEach, describe, expect, it, vi } from "vitest";
import {
applyCrashLoopGuard,
clearGatewayCrashHistory,
CrashLoopError,
recordGatewayCrash,
_TEST_ONLY,
} from "./crash-loop-guard.js";
const { CRASH_WINDOW_MS, BACKOFF_THRESHOLD, MAX_CRASHES } = _TEST_ONLY;
function makeTempDir(): string {
return fs.mkdtempSync(path.join(os.tmpdir(), "crash-guard-"));
}
function rmDir(dir: string): void {
fs.rmSync(dir, { recursive: true, force: true });
}
describe("crash-loop-guard", () => {
const dirs: string[] = [];
afterEach(() => {
for (const d of dirs) {
rmDir(d);
}
dirs.length = 0;
});
function tempDir(): string {
const d = makeTempDir();
dirs.push(d);
return d;
}
describe("recordGatewayCrash / clearGatewayCrashHistory", () => {
it("records crash timestamps and clears them", () => {
const dir = tempDir();
recordGatewayCrash(dir, 1000);
recordGatewayCrash(dir, 2000);
const raw = JSON.parse(
fs.readFileSync(path.join(dir, "gateway-crash-history.json"), "utf-8"),
);
expect(raw.crashes).toEqual([1000, 2000]);
clearGatewayCrashHistory(dir);
const cleared = JSON.parse(
fs.readFileSync(path.join(dir, "gateway-crash-history.json"), "utf-8"),
);
expect(cleared.crashes).toEqual([]);
});
it("prunes timestamps outside the window", () => {
const dir = tempDir();
const now = Date.now();
const old = now - CRASH_WINDOW_MS - 1000;
recordGatewayCrash(dir, old);
recordGatewayCrash(dir, now);
const raw = JSON.parse(
fs.readFileSync(path.join(dir, "gateway-crash-history.json"), "utf-8"),
);
expect(raw.crashes).toEqual([now]);
});
});
describe("applyCrashLoopGuard", () => {
it("does nothing when crash history is empty", async () => {
const dir = tempDir();
const sleep = vi.fn(async () => {});
await applyCrashLoopGuard({
stateDir: dir,
sleep,
logger: { warn: vi.fn(), error: vi.fn() },
});
expect(sleep).not.toHaveBeenCalled();
});
it("does nothing for fewer than BACKOFF_THRESHOLD crashes", async () => {
const dir = tempDir();
const now = Date.now();
for (let i = 0; i < BACKOFF_THRESHOLD - 1; i++) {
recordGatewayCrash(dir, now - i * 1000);
}
const sleep = vi.fn(async () => {});
await applyCrashLoopGuard({
stateDir: dir,
now: () => now,
sleep,
logger: { warn: vi.fn(), error: vi.fn() },
});
expect(sleep).not.toHaveBeenCalled();
});
it("applies 30s backoff for 4-6 crashes", async () => {
const dir = tempDir();
const now = Date.now();
for (let i = 0; i < 5; i++) {
recordGatewayCrash(dir, now - i * 1000);
}
const sleep = vi.fn(async () => {});
const warn = vi.fn();
await applyCrashLoopGuard({
stateDir: dir,
now: () => now,
sleep,
logger: { warn, error: vi.fn() },
});
expect(sleep).toHaveBeenCalledWith(30_000);
expect(warn).toHaveBeenCalled();
});
it("applies 5-min backoff for 7+ crashes", async () => {
const dir = tempDir();
const now = Date.now();
for (let i = 0; i < 8; i++) {
recordGatewayCrash(dir, now - i * 1000);
}
const sleep = vi.fn(async () => {});
await applyCrashLoopGuard({
stateDir: dir,
now: () => now,
sleep,
logger: { warn: vi.fn(), error: vi.fn() },
});
expect(sleep).toHaveBeenCalledWith(5 * 60 * 1000);
});
it("throws CrashLoopError after MAX_CRASHES", async () => {
const dir = tempDir();
const now = Date.now();
for (let i = 0; i < MAX_CRASHES; i++) {
recordGatewayCrash(dir, now - i * 1000);
}
await expect(
applyCrashLoopGuard({
stateDir: dir,
now: () => now,
sleep: vi.fn(async () => {}),
logger: { warn: vi.fn(), error: vi.fn() },
}),
).rejects.toThrow(CrashLoopError);
});
it("ignores corrupt history file gracefully", async () => {
const dir = tempDir();
fs.writeFileSync(path.join(dir, "gateway-crash-history.json"), "NOT JSON", "utf-8");
const sleep = vi.fn(async () => {});
await applyCrashLoopGuard({
stateDir: dir,
sleep,
logger: { warn: vi.fn(), error: vi.fn() },
});
expect(sleep).not.toHaveBeenCalled();
});
});
});

View File

@ -0,0 +1,166 @@
import fs from "node:fs";
import path from "node:path";
/**
* Crash-loop protection for the gateway process.
*
* Tracks recent crash timestamps in a lightweight JSON file inside the state
* directory and applies an escalating backoff schedule when the gateway keeps
* crashing within a sliding window:
*
* 1-3 crashes immediate restart (current behaviour)
* 4-6 crashes 30 s delay
* 7-9 crashes 5 min delay
* 10+ crashes refuse to start, write crash report
*
* See issue #16810.
*/
const CRASH_HISTORY_FILENAME = "gateway-crash-history.json";
const CRASH_WINDOW_MS = 15 * 60 * 1000; // 15-minute sliding window
const BACKOFF_THRESHOLD = 3;
const MAX_CRASHES = 10;
const BACKOFF_TIERS: ReadonlyArray<{ minCrashes: number; delayMs: number }> = [
{ minCrashes: 7, delayMs: 5 * 60 * 1000 },
{ minCrashes: 4, delayMs: 30 * 1000 },
];
export interface CrashHistory {
crashes: number[];
}
export interface CrashLoopGuardDeps {
stateDir: string;
now?: () => number;
sleep?: (ms: number) => Promise<void>;
logger: { warn: (msg: string) => void; error: (msg: string) => void };
}
function crashHistoryPath(stateDir: string): string {
return path.join(stateDir, CRASH_HISTORY_FILENAME);
}
function readCrashHistory(filePath: string): CrashHistory {
try {
const raw = fs.readFileSync(filePath, "utf-8");
const parsed = JSON.parse(raw) as CrashHistory;
if (Array.isArray(parsed?.crashes)) {
return { crashes: parsed.crashes.filter((t) => typeof t === "number") };
}
} catch {
// missing or corrupt — start fresh
}
return { crashes: [] };
}
function writeCrashHistory(filePath: string, history: CrashHistory): void {
try {
fs.mkdirSync(path.dirname(filePath), { recursive: true });
fs.writeFileSync(filePath, JSON.stringify(history, null, 2), "utf-8");
} catch {
// best-effort — don't let crash tracking itself break startup
}
}
function resolveBackoffMs(recentCount: number): number {
for (const tier of BACKOFF_TIERS) {
if (recentCount >= tier.minCrashes) {
return tier.delayMs;
}
}
return 0;
}
/**
* Record a crash timestamp. Safe to call from a synchronous
* `process.on('exit')` handler because it uses only sync I/O.
*/
export function recordGatewayCrash(stateDir: string, now: number = Date.now()): void {
const filePath = crashHistoryPath(stateDir);
const history = readCrashHistory(filePath);
history.crashes.push(now);
const cutoff = now - CRASH_WINDOW_MS;
history.crashes = history.crashes.filter((t) => t > cutoff);
writeCrashHistory(filePath, history);
}
/**
* Clear the crash history (called after a healthy startup period).
*/
export function clearGatewayCrashHistory(stateDir: string): void {
const filePath = crashHistoryPath(stateDir);
writeCrashHistory(filePath, { crashes: [] });
}
export class CrashLoopError extends Error {
public readonly recentCrashCount: number;
constructor(count: number, windowMinutes: number) {
super(
`CRASH LOOP DETECTED: ${count} crashes in the last ${windowMinutes} minutes. ` +
`The gateway will not restart automatically. ` +
`Fix the root cause, then run the gateway manually. ` +
`To reset the crash counter immediately, delete the gateway-crash-history.json ` +
`file in the state directory, or wait ${windowMinutes} minutes for it to expire.`,
);
this.name = "CrashLoopError";
this.recentCrashCount = count;
}
}
/**
* Inspect crash history and apply backoff if needed. Throws
* `CrashLoopError` when the crash count exceeds the hard limit.
*/
export async function applyCrashLoopGuard(deps: CrashLoopGuardDeps): Promise<void> {
const now = (deps.now ?? Date.now)();
const filePath = crashHistoryPath(deps.stateDir);
const history = readCrashHistory(filePath);
const cutoff = now - CRASH_WINDOW_MS;
const recentCrashes = history.crashes.filter((t) => t > cutoff);
if (recentCrashes.length >= MAX_CRASHES) {
const reportPath = path.join(deps.stateDir, "crash-report.json");
try {
fs.mkdirSync(path.dirname(reportPath), { recursive: true });
fs.writeFileSync(
reportPath,
JSON.stringify(
{
detectedAt: new Date(now).toISOString(),
recentCrashCount: recentCrashes.length,
windowMinutes: CRASH_WINDOW_MS / 60_000,
crashTimestamps: recentCrashes.map((t) => new Date(t).toISOString()),
},
null,
2,
),
"utf-8",
);
deps.logger.error(`Crash report written to ${reportPath}`);
} catch {
// best-effort
}
throw new CrashLoopError(recentCrashes.length, CRASH_WINDOW_MS / 60_000);
}
if (recentCrashes.length >= BACKOFF_THRESHOLD) {
const delayMs = resolveBackoffMs(recentCrashes.length);
if (delayMs > 0) {
deps.logger.warn(
`Crash-loop backoff: ${recentCrashes.length} crashes in the last ${CRASH_WINDOW_MS / 60_000} min — ` +
`waiting ${delayMs / 1000}s before starting gateway.`,
);
const sleep = deps.sleep ?? ((ms: number) => new Promise((r) => setTimeout(r, ms)));
await sleep(delayMs);
}
}
}
// Re-export constants for testing
export const _TEST_ONLY = {
CRASH_WINDOW_MS,
BACKOFF_THRESHOLD,
MAX_CRASHES,
BACKOFF_TIERS,
} as const;

View File

@ -25,6 +25,12 @@ import { defaultRuntime } from "../../runtime.js";
import { formatCliCommand } from "../command-format.js";
import { inheritOptionFromParent } from "../command-options.js";
import { forceFreePortAndWait, waitForPortBindable } from "../ports.js";
import {
applyCrashLoopGuard,
CrashLoopError,
clearGatewayCrashHistory,
recordGatewayCrash,
} from "./crash-loop-guard.js";
import { ensureDevGatewayConfig } from "./dev.js";
import { runGatewayLoop } from "./run-loop.js";
import {
@ -194,6 +200,21 @@ async function runGatewayCommand(opts: GatewayRunOpts) {
process.env.OPENCLAW_RAW_STREAM_PATH = rawStreamPath;
}
const stateDir = resolveStateDir(process.env);
try {
await applyCrashLoopGuard({
stateDir,
logger: gatewayLog,
});
} catch (err) {
if (err instanceof CrashLoopError) {
defaultRuntime.error(err.message);
defaultRuntime.exit(1);
return;
}
throw err;
}
if (devMode) {
await ensureDevGatewayConfig({ reset: Boolean(opts.reset) });
}
@ -418,16 +439,25 @@ async function runGatewayCommand(opts: GatewayRunOpts) {
}
: undefined;
process.on("exit", (code) => {
if (code !== 0) {
recordGatewayCrash(stateDir);
}
});
try {
await runGatewayLoop({
runtime: defaultRuntime,
lockPort: port,
start: async () =>
await startGatewayServer(port, {
start: async () => {
const server = await startGatewayServer(port, {
bind,
auth: authOverride,
tailscale: tailscaleOverride,
}),
});
clearGatewayCrashHistory(stateDir);
return server;
},
});
} catch (err) {
if (