Merge 46cd41bde0b6bba075f9ceeac4d76081cc505fe0 into 8a05c05596ca9ba0735dafd8e359885de4c2c969

This commit is contained in:
David Aronchick 2026-03-21 06:53:31 +01:00 committed by GitHub
commit 0b6c054b0d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 365 additions and 0 deletions

View File

@ -0,0 +1,174 @@
import { describe, it, expect, beforeEach, afterEach } from "vitest";
import fs from "node:fs";
import os from "node:os";
import path from "node:path";
import {
readCrashTrackerState,
writeCrashTrackerState,
recordStartupAndCheckCrashLoop,
saveLastKnownGood,
hasLastKnownGood,
revertToLastKnownGood,
clearCrashTracker,
getLastKnownGoodPath,
getFailedConfigPath,
} from "./crash-tracker.js";
describe("crash-tracker", () => {
  // Config payloads used throughout: one "healthy" and one "broken".
  const GOOD_CONFIG = '{"valid": true}';
  const BAD_CONFIG = '{"bad": true}';

  // Per-test sandbox: a throwaway directory holding the state dir and the config file.
  let sandboxDir: string;
  let trackerDir: string;
  let cfgFile: string;

  /** Record `count` startups back to back and return the verdict of the last one. */
  const recordStartups = (
    count: number,
    options?: Parameters<typeof recordStartupAndCheckCrashLoop>[1],
  ): boolean => {
    let verdict = false;
    for (let i = 0; i < count; i += 1) {
      verdict = recordStartupAndCheckCrashLoop(trackerDir, options);
    }
    return verdict;
  };

  /** Timestamps currently persisted in the tracker file. */
  const persistedTimestamps = (): number[] => readCrashTrackerState(trackerDir).startupTimestamps;

  beforeEach(() => {
    sandboxDir = fs.mkdtempSync(path.join(os.tmpdir(), "crash-tracker-test-"));
    trackerDir = path.join(sandboxDir, "state");
    fs.mkdirSync(trackerDir, { recursive: true });
    cfgFile = path.join(sandboxDir, "openclaw.json");
    fs.writeFileSync(cfgFile, GOOD_CONFIG);
  });

  afterEach(() => {
    fs.rmSync(sandboxDir, { recursive: true, force: true });
  });

  describe("readCrashTrackerState", () => {
    it("returns empty state when file does not exist", () => {
      expect(persistedTimestamps()).toEqual([]);
    });

    it("reads existing state", () => {
      writeCrashTrackerState(trackerDir, { startupTimestamps: [1000, 2000, 3000] });
      expect(persistedTimestamps()).toEqual([1000, 2000, 3000]);
    });

    it("handles corrupted state file", () => {
      const trackerFile = path.join(trackerDir, "crash-tracker.json");
      fs.writeFileSync(trackerFile, "not json");
      expect(persistedTimestamps()).toEqual([]);
    });
  });

  describe("recordStartupAndCheckCrashLoop", () => {
    it("returns false for first startup", () => {
      expect(recordStartups(1)).toBe(false);
    });

    it("returns false for 2 startups", () => {
      expect(recordStartups(2)).toBe(false);
    });

    it("returns true after 3 startups within window", () => {
      expect(recordStartups(3)).toBe(true);
    });

    it("respects custom maxCrashes", () => {
      // Four startups stay below a threshold of five.
      expect(recordStartups(4, { maxCrashes: 5 })).toBe(false);
    });

    it("ignores old timestamps outside window", () => {
      // Seed the tracker with two startups from two minutes ago.
      const twoMinutesAgo = Date.now() - 120_000;
      writeCrashTrackerState(trackerDir, {
        startupTimestamps: [twoMinutesAgo, twoMinutesAgo + 1000],
      });
      expect(recordStartups(1)).toBe(false);
    });
  });

  describe("saveLastKnownGood", () => {
    it("copies config to .last-known-good", () => {
      expect(saveLastKnownGood(cfgFile)).toBe(true);
      const snapshot = fs.readFileSync(getLastKnownGoodPath(cfgFile), "utf-8");
      expect(snapshot).toBe(GOOD_CONFIG);
    });

    it("returns false if config does not exist", () => {
      const missing = path.join(sandboxDir, "nonexistent.json");
      expect(saveLastKnownGood(missing)).toBe(false);
    });
  });

  describe("hasLastKnownGood", () => {
    it("returns false when no LKG exists", () => {
      expect(hasLastKnownGood(cfgFile)).toBe(false);
    });

    it("returns true after saving LKG", () => {
      saveLastKnownGood(cfgFile);
      expect(hasLastKnownGood(cfgFile)).toBe(true);
    });
  });

  describe("revertToLastKnownGood", () => {
    it("reverts config to last-known-good", () => {
      // Snapshot the good config, then clobber it with a bad one.
      saveLastKnownGood(cfgFile);
      fs.writeFileSync(cfgFile, BAD_CONFIG);

      expect(revertToLastKnownGood(cfgFile, trackerDir)).toBe(true);

      // The good content must be back in place.
      expect(fs.readFileSync(cfgFile, "utf-8")).toBe(GOOD_CONFIG);
    });

    it("saves failed config for debugging", () => {
      saveLastKnownGood(cfgFile);
      fs.writeFileSync(cfgFile, BAD_CONFIG);
      revertToLastKnownGood(cfgFile, trackerDir);

      // Exactly one quarantined copy of the bad config should sit next to the config.
      const quarantined = fs.readdirSync(sandboxDir).filter((entry) => entry.includes(".failed-"));
      expect(quarantined.length).toBe(1);
      const quarantinedBody = fs.readFileSync(path.join(sandboxDir, quarantined[0]!), "utf-8");
      expect(quarantinedBody).toBe(BAD_CONFIG);
    });

    it("resets crash tracker after revert", () => {
      saveLastKnownGood(cfgFile);
      recordStartups(2);
      revertToLastKnownGood(cfgFile, trackerDir);

      const afterRevert = readCrashTrackerState(trackerDir);
      expect(afterRevert.startupTimestamps).toEqual([]);
      expect(afterRevert.lastRevertTimestamp).toBeGreaterThan(0);
    });

    it("returns false when no LKG exists", () => {
      expect(revertToLastKnownGood(cfgFile, trackerDir)).toBe(false);
    });
  });

  describe("clearCrashTracker", () => {
    it("clears startup timestamps", () => {
      recordStartups(2);
      clearCrashTracker(trackerDir);
      expect(persistedTimestamps()).toEqual([]);
    });
  });
});

171
src/config/crash-tracker.ts Normal file
View File

@ -0,0 +1,171 @@
/**
 * Crash-loop detection and last-known-good config management.
 *
 * Records gateway startup timestamps so that repeated startups within a
 * short window can be recognized as a crash loop. When a loop is detected,
 * the caller can use the helpers here to revert to the last-known-good config.
 */
import fs from "node:fs";
import path from "node:path";
/** Crash-tracker state as persisted (JSON) in the tracker file under the state directory. */
export type CrashTrackerState = {
/** `Date.now()` values (epoch milliseconds) of recently recorded startups. */
startupTimestamps: number[];
/** `Date.now()` value of the most recent revert to last-known-good; absent until a revert is recorded. */
lastRevertTimestamp?: number;
};
/** Tuning knobs for crash-loop detection; every field falls back to a module default when omitted. */
export type CrashTrackerOptions = {
/**
 * Number of startups within the window (the current one included) at which
 * a crash loop is reported. Default: 3
 */
maxCrashes?: number;
/** Length of the look-back window in milliseconds. Default: 60_000 (60s) */
windowMs?: number;
/**
 * Delay, in seconds, used by scheduleLastKnownGoodSave before it saves the
 * config as last-known-good and clears the tracker. Default: 10
 */
healthyAfterSeconds?: number;
};
// Defaults applied when the corresponding CrashTrackerOptions field is omitted.
const DEFAULT_MAX_CRASHES = 3;
const DEFAULT_WINDOW_MS = 60_000;
const DEFAULT_HEALTHY_AFTER_SECONDS = 10;
// File name of the persisted tracker state; joined onto the state directory.
const CRASH_TRACKER_FILENAME = "crash-tracker.json";
// Appended to the config path to name the last-known-good copy.
const LAST_KNOWN_GOOD_SUFFIX = ".last-known-good";
// NOTE: despite the "PREFIX" name, this is appended after the config path and
// followed by a timestamp (see getFailedConfigPath).
const FAILED_CONFIG_PREFIX = ".failed-";
/** Location of the crash-tracker state file inside `stateDir`. */
export function getCrashTrackerPath(stateDir: string): string {
  const trackerFile = path.join(stateDir, CRASH_TRACKER_FILENAME);
  return trackerFile;
}
/** Path of the last-known-good copy that sits alongside `configPath`. */
export function getLastKnownGoodPath(configPath: string): string {
  return `${configPath}${LAST_KNOWN_GOOD_SUFFIX}`;
}
/**
 * Path under which a failed config is preserved for debugging.
 *
 * @param configPath Path of the live config file.
 * @param timestamp Value embedded in the file name; defaults to `Date.now()`.
 */
export function getFailedConfigPath(configPath: string, timestamp?: number): string {
  return `${configPath}${FAILED_CONFIG_PREFIX}${timestamp ?? Date.now()}`;
}
/**
 * Load the persisted crash-tracker state from `stateDir`.
 *
 * Never throws: a missing, unreadable, or malformed tracker file yields an
 * empty state. The parsed JSON is validated rather than trusted, so the
 * returned object only ever contains finite numeric timestamps.
 *
 * @param stateDir Directory that holds the tracker file.
 * @returns The stored state, or `{ startupTimestamps: [] }` when none is usable.
 */
export function readCrashTrackerState(stateDir: string): CrashTrackerState {
  const trackerPath = getCrashTrackerPath(stateDir);

  let parsed: unknown;
  try {
    parsed = JSON.parse(fs.readFileSync(trackerPath, "utf-8"));
  } catch {
    // Missing file, I/O error, or invalid JSON: start from a clean slate.
    return { startupTimestamps: [] };
  }

  if (typeof parsed !== "object" || parsed === null) {
    return { startupTimestamps: [] };
  }

  const { startupTimestamps, lastRevertTimestamp } = parsed as Record<string, unknown>;
  if (!Array.isArray(startupTimestamps)) {
    return { startupTimestamps: [] };
  }

  const isTimestamp = (value: unknown): value is number =>
    typeof value === "number" && Number.isFinite(value);

  // Drop entries that are not real numbers so hand-edited or damaged files
  // cannot smuggle strings/nulls into the timestamp arithmetic.
  const state: CrashTrackerState = {
    startupTimestamps: startupTimestamps.filter(isTimestamp),
  };
  if (isTimestamp(lastRevertTimestamp)) {
    state.lastRevertTimestamp = lastRevertTimestamp;
  }
  return state;
}
/**
 * Persist `state` to the tracker file under `stateDir`, creating the
 * directory if needed.
 *
 * The JSON is written to a sibling temp file and then renamed over the
 * tracker file, so a process that dies mid-write cannot leave a truncated
 * tracker behind (a truncated file would be read back as empty state and
 * silently reset the crash counter).
 *
 * @param stateDir Directory that holds the tracker file.
 * @param state State to serialize.
 * @throws Propagates file-system errors from creating the directory, writing, or renaming.
 */
export function writeCrashTrackerState(stateDir: string, state: CrashTrackerState): void {
  const trackerPath = getCrashTrackerPath(stateDir);
  fs.mkdirSync(path.dirname(trackerPath), { recursive: true });

  const tmpPath = `${trackerPath}.tmp`;
  try {
    fs.writeFileSync(tmpPath, JSON.stringify(state, null, 2));
    fs.renameSync(tmpPath, trackerPath);
  } catch (err) {
    // Best-effort cleanup of the temp file; the original error is what matters.
    try {
      fs.rmSync(tmpPath, { force: true });
    } catch {
      // ignore cleanup failure
    }
    throw err;
  }
}
/**
 * Record a startup timestamp and check if we're in a crash loop.
 *
 * The current time is appended to the persisted timestamps, entries outside
 * the look-back window are pruned, and the pruned list is written back.
 * Timestamps that lie in the future relative to `now` (e.g. after the system
 * clock was set backwards) are pruned as well; otherwise they would never
 * age out and could trigger a false crash-loop verdict.
 *
 * @param stateDir Directory that holds the tracker file.
 * @param options Optional overrides for `maxCrashes` and `windowMs`.
 * @returns true if the number of startups inside the window (including this
 *   one) has reached `maxCrashes` — the caller should revert the config.
 */
export function recordStartupAndCheckCrashLoop(
  stateDir: string,
  options: CrashTrackerOptions = {},
): boolean {
  const maxCrashes = options.maxCrashes ?? DEFAULT_MAX_CRASHES;
  const windowMs = options.windowMs ?? DEFAULT_WINDOW_MS;
  const now = Date.now();
  const state = readCrashTrackerState(stateDir);

  // Add the current startup, then keep only entries with 0 <= age <= windowMs.
  state.startupTimestamps = [...state.startupTimestamps, now].filter((ts) => {
    const age = now - ts;
    return age >= 0 && age <= windowMs;
  });

  writeCrashTrackerState(stateDir, state);
  return state.startupTimestamps.length >= maxCrashes;
}
/**
 * Snapshot the config at `configPath` as the last-known-good copy.
 * Intended to run once the gateway has stayed healthy for a while.
 *
 * @returns true when the copy succeeded, false on any file-system error.
 */
export function saveLastKnownGood(configPath: string): boolean {
  const snapshotPath = getLastKnownGoodPath(configPath);
  let saved = false;
  try {
    fs.copyFileSync(configPath, snapshotPath);
    saved = true;
  } catch {
    // Copy failed (e.g. the config is missing); report it through the return value.
  }
  return saved;
}
/**
 * Report whether a last-known-good copy exists for `configPath`.
 */
export function hasLastKnownGood(configPath: string): boolean {
  const snapshotPath = getLastKnownGoodPath(configPath);
  return fs.existsSync(snapshotPath);
}
/**
 * Restore the last-known-good config over `configPath`.
 *
 * The current (presumably bad) config is first copied to a timestamped
 * ".failed-" file for debugging. After the restore, the tracker records the
 * revert time and its startup timestamps are cleared.
 *
 * @param configPath Path of the live config file.
 * @param stateDir Directory that holds the tracker file.
 * @returns true if the revert completed; false when no last-known-good copy
 *   exists or any step failed.
 */
export function revertToLastKnownGood(configPath: string, stateDir: string): boolean {
  const snapshotPath = getLastKnownGoodPath(configPath);
  if (fs.existsSync(snapshotPath)) {
    try {
      // Keep the failing config around for post-mortem inspection.
      const quarantinePath = getFailedConfigPath(configPath);
      if (fs.existsSync(configPath)) {
        fs.copyFileSync(configPath, quarantinePath);
      }

      // Put the good config back in place.
      fs.copyFileSync(snapshotPath, configPath);

      // Note the revert and restart crash counting from zero.
      const previous = readCrashTrackerState(stateDir);
      writeCrashTrackerState(stateDir, {
        ...previous,
        lastRevertTimestamp: Date.now(),
        startupTimestamps: [],
      });
      return true;
    } catch {
      return false;
    }
  }
  return false;
}
/**
 * Forget all recorded startups (call once a healthy startup is confirmed).
 * Other persisted fields, such as the last revert time, are kept.
 */
export function clearCrashTracker(stateDir: string): void {
  const current = readCrashTrackerState(stateDir);
  writeCrashTrackerState(stateDir, { ...current, startupTimestamps: [] });
}
/**
 * Schedule saving last-known-good after healthy startup.
 *
 * After `healthyAfterSeconds` (default 10) the current config is copied to
 * the last-known-good path and the crash tracker is cleared. The timer is
 * unref'd so it never keeps the process alive on its own.
 *
 * Failures inside the timer callback are swallowed: this is best-effort
 * bookkeeping, and an exception escaping a timer would be uncaught and could
 * take down an otherwise healthy process.
 *
 * @param configPath Path of the live config file.
 * @param stateDir Directory that holds the tracker file.
 * @param options Optional override for `healthyAfterSeconds`.
 * @returns A function that cancels the pending save.
 */
export function scheduleLastKnownGoodSave(
  configPath: string,
  stateDir: string,
  options: CrashTrackerOptions = {},
): () => void {
  const healthyAfterSeconds = options.healthyAfterSeconds ?? DEFAULT_HEALTHY_AFTER_SECONDS;
  const timer = setTimeout(() => {
    try {
      saveLastKnownGood(configPath);
      clearCrashTracker(stateDir);
    } catch {
      // Tracker could not be written (e.g. state dir unwritable or disk full).
      // Do not crash the healthy process over it; the next startup will retry.
    }
  }, healthyAfterSeconds * 1000);
  // Don't keep the process alive just for this timer
  timer.unref();
  return () => clearTimeout(timer);
}

View File

@ -19,7 +19,14 @@ import {
readConfigFileSnapshot, readConfigFileSnapshot,
writeConfigFile, writeConfigFile,
} from "../config/config.js"; } from "../config/config.js";
import {
hasLastKnownGood,
recordStartupAndCheckCrashLoop,
revertToLastKnownGood,
scheduleLastKnownGoodSave,
} from "../config/crash-tracker.js";
import { formatConfigIssueLines } from "../config/issue-format.js"; import { formatConfigIssueLines } from "../config/issue-format.js";
import { resolveStateDir } from "../config/paths.js";
import { applyPluginAutoEnable } from "../config/plugin-auto-enable.js"; import { applyPluginAutoEnable } from "../config/plugin-auto-enable.js";
import { resolveMainSessionKey } from "../config/sessions.js"; import { resolveMainSessionKey } from "../config/sessions.js";
import { clearAgentRunContext, onAgentEvent } from "../infra/agent-events.js"; import { clearAgentRunContext, onAgentEvent } from "../infra/agent-events.js";
@ -363,6 +370,17 @@ export async function startGatewayServer(
port = 18789, port = 18789,
opts: GatewayServerOptions = {}, opts: GatewayServerOptions = {},
): Promise<GatewayServer> { ): Promise<GatewayServer> {
const stateDir = resolveStateDir();
const isCrashLoop = recordStartupAndCheckCrashLoop(stateDir);
if (isCrashLoop) {
if (hasLastKnownGood(CONFIG_PATH)) {
log.warn("gateway: crash loop detected; reverting to last-known-good config");
revertToLastKnownGood(CONFIG_PATH, stateDir);
} else {
log.warn("gateway: crash loop detected but no last-known-good config found");
}
}
const minimalTestGateway = const minimalTestGateway =
process.env.VITEST === "1" && process.env.OPENCLAW_TEST_MINIMAL_GATEWAY === "1"; process.env.VITEST === "1" && process.env.OPENCLAW_TEST_MINIMAL_GATEWAY === "1";
@ -1327,6 +1345,8 @@ export async function startGatewayServer(
httpServers, httpServers,
}); });
scheduleLastKnownGoodSave(CONFIG_PATH, stateDir);
return { return {
close: async (opts) => { close: async (opts) => {
// Run gateway_stop plugin hook before shutdown // Run gateway_stop plugin hook before shutdown