diff --git a/src/config/crash-tracker.test.ts b/src/config/crash-tracker.test.ts
new file mode 100644
index 00000000000..19f166d1449
--- /dev/null
+++ b/src/config/crash-tracker.test.ts
@@ -0,0 +1,227 @@
+import { describe, it, expect, beforeEach, afterEach, vi } from "vitest";
+import fs from "node:fs";
+import os from "node:os";
+import path from "node:path";
+import {
+  readCrashTrackerState,
+  writeCrashTrackerState,
+  recordStartupAndCheckCrashLoop,
+  saveLastKnownGood,
+  hasLastKnownGood,
+  revertToLastKnownGood,
+  clearCrashTracker,
+  scheduleLastKnownGoodSave,
+  getLastKnownGoodPath,
+  getFailedConfigPath,
+} from "./crash-tracker.js";
+
+describe("crash-tracker", () => {
+  let tmpDir: string;
+  let stateDir: string;
+  let configPath: string;
+
+  beforeEach(() => {
+    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "crash-tracker-test-"));
+    stateDir = path.join(tmpDir, "state");
+    fs.mkdirSync(stateDir, { recursive: true });
+    configPath = path.join(tmpDir, "openclaw.json");
+    fs.writeFileSync(configPath, '{"valid": true}');
+  });
+
+  afterEach(() => {
+    // Restore real timers in case a test installed fake ones.
+    vi.useRealTimers();
+    fs.rmSync(tmpDir, { recursive: true, force: true });
+  });
+
+  describe("getFailedConfigPath", () => {
+    it("appends the failure timestamp to the config path", () => {
+      expect(getFailedConfigPath(configPath, 1234)).toBe(`${configPath}.failed-1234`);
+    });
+  });
+
+  describe("readCrashTrackerState", () => {
+    it("returns empty state when file does not exist", () => {
+      const state = readCrashTrackerState(stateDir);
+      expect(state.startupTimestamps).toEqual([]);
+    });
+
+    it("reads existing state", () => {
+      const expected = { startupTimestamps: [1000, 2000, 3000] };
+      writeCrashTrackerState(stateDir, expected);
+      const state = readCrashTrackerState(stateDir);
+      expect(state.startupTimestamps).toEqual([1000, 2000, 3000]);
+    });
+
+    it("handles corrupted state file", () => {
+      fs.writeFileSync(path.join(stateDir, "crash-tracker.json"), "not json");
+      const state = readCrashTrackerState(stateDir);
+      expect(state.startupTimestamps).toEqual([]);
+    });
+
+    it("drops malformed entries from the state file", () => {
+      fs.writeFileSync(
+        path.join(stateDir, "crash-tracker.json"),
+        JSON.stringify({ startupTimestamps: [1000, "2000", null], lastRevertTimestamp: "x" }),
+      );
+      const state = readCrashTrackerState(stateDir);
+      expect(state.startupTimestamps).toEqual([1000]);
+      expect(state.lastRevertTimestamp).toBeUndefined();
+    });
+  });
+
+  describe("recordStartupAndCheckCrashLoop", () => {
+    it("returns false for first startup", () => {
+      const isCrashLoop = recordStartupAndCheckCrashLoop(stateDir);
+      expect(isCrashLoop).toBe(false);
+    });
+
+    it("returns false for 2 startups", () => {
+      recordStartupAndCheckCrashLoop(stateDir);
+      const isCrashLoop = recordStartupAndCheckCrashLoop(stateDir);
+      expect(isCrashLoop).toBe(false);
+    });
+
+    it("returns true after 3 startups within window", () => {
+      recordStartupAndCheckCrashLoop(stateDir);
+      recordStartupAndCheckCrashLoop(stateDir);
+      const isCrashLoop = recordStartupAndCheckCrashLoop(stateDir);
+      expect(isCrashLoop).toBe(true);
+    });
+
+    it("respects custom maxCrashes", () => {
+      recordStartupAndCheckCrashLoop(stateDir, { maxCrashes: 5 });
+      recordStartupAndCheckCrashLoop(stateDir, { maxCrashes: 5 });
+      recordStartupAndCheckCrashLoop(stateDir, { maxCrashes: 5 });
+      const isCrashLoop = recordStartupAndCheckCrashLoop(stateDir, { maxCrashes: 5 });
+      expect(isCrashLoop).toBe(false);
+    });
+
+    it("ignores old timestamps outside window", () => {
+      // Manually write old timestamps
+      const oldTime = Date.now() - 120_000; // 2 minutes ago
+      writeCrashTrackerState(stateDir, {
+        startupTimestamps: [oldTime, oldTime + 1000],
+      });
+      const isCrashLoop = recordStartupAndCheckCrashLoop(stateDir);
+      expect(isCrashLoop).toBe(false);
+    });
+
+    it("ignores timestamps from the future", () => {
+      // A clock that jumped backwards leaves entries that would never age out
+      const futureTime = Date.now() + 3_600_000; // 1 hour ahead
+      writeCrashTrackerState(stateDir, {
+        startupTimestamps: [futureTime, futureTime + 1000],
+      });
+      const isCrashLoop = recordStartupAndCheckCrashLoop(stateDir);
+      expect(isCrashLoop).toBe(false);
+    });
+  });
+
+  describe("saveLastKnownGood", () => {
+    it("copies config to .last-known-good", () => {
+      const result = saveLastKnownGood(configPath);
+      expect(result).toBe(true);
+      const lkgContent = fs.readFileSync(getLastKnownGoodPath(configPath), "utf-8");
+      expect(lkgContent).toBe('{"valid": true}');
+    });
+
+    it("returns false if config does not exist", () => {
+      const result = saveLastKnownGood(path.join(tmpDir, "nonexistent.json"));
+      expect(result).toBe(false);
+    });
+  });
+
+  describe("hasLastKnownGood", () => {
+    it("returns false when no LKG exists", () => {
+      expect(hasLastKnownGood(configPath)).toBe(false);
+    });
+
+    it("returns true after saving LKG", () => {
+      saveLastKnownGood(configPath);
+      expect(hasLastKnownGood(configPath)).toBe(true);
+    });
+  });
+
+  describe("revertToLastKnownGood", () => {
+    it("reverts config to last-known-good", () => {
+      // Save good config as LKG
+      saveLastKnownGood(configPath);
+
+      // Write bad config
+      fs.writeFileSync(configPath, '{"bad": true}');
+
+      // Revert
+      const result = revertToLastKnownGood(configPath, stateDir);
+      expect(result).toBe(true);
+
+      // Config should be restored
+      const content = fs.readFileSync(configPath, "utf-8");
+      expect(content).toBe('{"valid": true}');
+    });
+
+    it("saves failed config for debugging", () => {
+      saveLastKnownGood(configPath);
+      fs.writeFileSync(configPath, '{"bad": true}');
+
+      revertToLastKnownGood(configPath, stateDir);
+
+      // Find the failed config file
+      const files = fs.readdirSync(tmpDir).filter((f) => f.includes(".failed-"));
+      expect(files.length).toBe(1);
+      const failedContent = fs.readFileSync(path.join(tmpDir, files[0]!), "utf-8");
+      expect(failedContent).toBe('{"bad": true}');
+    });
+
+    it("resets crash tracker after revert", () => {
+      saveLastKnownGood(configPath);
+      recordStartupAndCheckCrashLoop(stateDir);
+      recordStartupAndCheckCrashLoop(stateDir);
+
+      revertToLastKnownGood(configPath, stateDir);
+
+      const state = readCrashTrackerState(stateDir);
+      expect(state.startupTimestamps).toEqual([]);
+      expect(state.lastRevertTimestamp).toBeGreaterThan(0);
+    });
+
+    it("returns false when no LKG exists", () => {
+      const result = revertToLastKnownGood(configPath, stateDir);
+      expect(result).toBe(false);
+    });
+  });
+
+  describe("clearCrashTracker", () => {
+    it("clears startup timestamps", () => {
+      recordStartupAndCheckCrashLoop(stateDir);
+      recordStartupAndCheckCrashLoop(stateDir);
+      clearCrashTracker(stateDir);
+      const state = readCrashTrackerState(stateDir);
+      expect(state.startupTimestamps).toEqual([]);
+    });
+  });
+
+  describe("scheduleLastKnownGoodSave", () => {
+    it("saves LKG and clears the tracker once the delay elapses", () => {
+      vi.useFakeTimers();
+      recordStartupAndCheckCrashLoop(stateDir);
+      scheduleLastKnownGoodSave(configPath, stateDir, { healthyAfterSeconds: 1 });
+      expect(hasLastKnownGood(configPath)).toBe(false);
+
+      vi.advanceTimersByTime(1000);
+
+      expect(hasLastKnownGood(configPath)).toBe(true);
+      expect(readCrashTrackerState(stateDir).startupTimestamps).toEqual([]);
+    });
+
+    it("does nothing when cancelled before the delay elapses", () => {
+      vi.useFakeTimers();
+      const cancel = scheduleLastKnownGoodSave(configPath, stateDir, { healthyAfterSeconds: 1 });
+      cancel();
+
+      vi.advanceTimersByTime(1000);
+
+      expect(hasLastKnownGood(configPath)).toBe(false);
+    });
+  });
+});
diff --git a/src/config/crash-tracker.ts b/src/config/crash-tracker.ts
new file mode 100644
index 00000000000..fe11e7cddde
--- /dev/null
+++ b/src/config/crash-tracker.ts
@@ -0,0 +1,240 @@
+/**
+ * Crash-loop detection and last-known-good config management.
+ *
+ * Tracks gateway startup timestamps and detects crash loops.
+ * When a crash loop is detected, automatically reverts to the
+ * last-known-good config.
+ *
+ * This code runs on the gateway's startup path, so failures of the tracker
+ * itself (unwritable state dir, malformed state file) are handled as
+ * best-effort instead of being allowed to take the gateway down.
+ */
+
+import fs from "node:fs";
+import path from "node:path";
+
+export type CrashTrackerState = {
+  startupTimestamps: number[];
+  lastRevertTimestamp?: number;
+};
+
+export type CrashTrackerOptions = {
+  /** Max crashes within the window before triggering revert. Default: 3 */
+  maxCrashes?: number;
+  /** Time window in ms. Default: 60_000 (60s) */
+  windowMs?: number;
+  /** Seconds to wait before marking startup as healthy. Default: 10 */
+  healthyAfterSeconds?: number;
+};
+
+const DEFAULT_MAX_CRASHES = 3;
+const DEFAULT_WINDOW_MS = 60_000;
+const DEFAULT_HEALTHY_AFTER_SECONDS = 10;
+const CRASH_TRACKER_FILENAME = "crash-tracker.json";
+const LAST_KNOWN_GOOD_SUFFIX = ".last-known-good";
+const FAILED_CONFIG_PREFIX = ".failed-";
+
+/** True for numbers other than NaN and +/-Infinity. */
+function isFiniteNumber(value: unknown): value is number {
+  return typeof value === "number" && Number.isFinite(value);
+}
+
+/**
+ * Write a file via a temporary sibling that is renamed over `targetPath`, so
+ * an interrupted write cannot leave a truncated file at `targetPath`.
+ * The temporary file is removed if `produce` or the rename throws.
+ */
+function replaceFileAtomically(targetPath: string, produce: (tmpPath: string) => void): void {
+  const tmpPath = `${targetPath}.${process.pid}.tmp`;
+  try {
+    produce(tmpPath);
+    fs.renameSync(tmpPath, targetPath);
+  } catch (err) {
+    fs.rmSync(tmpPath, { force: true });
+    throw err;
+  }
+}
+
+/** Path of the crash-tracker state file inside `stateDir`. */
+export function getCrashTrackerPath(stateDir: string): string {
+  return path.join(stateDir, CRASH_TRACKER_FILENAME);
+}
+
+/** Path of the last-known-good copy kept next to `configPath`. */
+export function getLastKnownGoodPath(configPath: string): string {
+  return configPath + LAST_KNOWN_GOOD_SUFFIX;
+}
+
+/**
+ * Path under which a reverted (bad) config is preserved for debugging.
+ * `timestamp` defaults to the current time in ms.
+ */
+export function getFailedConfigPath(configPath: string, timestamp?: number): string {
+  const ts = timestamp ?? Date.now();
+  return configPath + FAILED_CONFIG_PREFIX + ts;
+}
+
+/**
+ * Load the persisted tracker state.
+ * A missing, unreadable or malformed file yields an empty state; entries that
+ * are not finite numbers are dropped.
+ */
+export function readCrashTrackerState(stateDir: string): CrashTrackerState {
+  try {
+    const raw = fs.readFileSync(getCrashTrackerPath(stateDir), "utf-8");
+    const parsed: unknown = JSON.parse(raw);
+    if (typeof parsed !== "object" || parsed === null) {
+      return { startupTimestamps: [] };
+    }
+    const { startupTimestamps, lastRevertTimestamp } = parsed as Record<string, unknown>;
+    if (!Array.isArray(startupTimestamps)) {
+      return { startupTimestamps: [] };
+    }
+    const state: CrashTrackerState = {
+      startupTimestamps: startupTimestamps.filter(isFiniteNumber),
+    };
+    if (isFiniteNumber(lastRevertTimestamp)) {
+      state.lastRevertTimestamp = lastRevertTimestamp;
+    }
+    return state;
+  } catch {
+    return { startupTimestamps: [] };
+  }
+}
+
+/**
+ * Persist the tracker state, creating `stateDir` if needed.
+ * Throws if the directory or file cannot be written.
+ */
+export function writeCrashTrackerState(stateDir: string, state: CrashTrackerState): void {
+  const trackerPath = getCrashTrackerPath(stateDir);
+  fs.mkdirSync(path.dirname(trackerPath), { recursive: true });
+  replaceFileAtomically(trackerPath, (tmpPath) => {
+    fs.writeFileSync(tmpPath, JSON.stringify(state, null, 2));
+  });
+}
+
+/**
+ * Record a startup timestamp and check if we're in a crash loop.
+ * Returns true if a crash loop is detected (caller should revert config).
+ */
+export function recordStartupAndCheckCrashLoop(
+  stateDir: string,
+  options: CrashTrackerOptions = {},
+): boolean {
+  const maxCrashes = options.maxCrashes ?? DEFAULT_MAX_CRASHES;
+  const windowMs = options.windowMs ?? DEFAULT_WINDOW_MS;
+  const now = Date.now();
+
+  const state = readCrashTrackerState(stateDir);
+
+  // Add current startup
+  state.startupTimestamps.push(now);
+
+  // Prune timestamps outside the window. Entries from the future (the clock
+  // was set back) are dropped too; otherwise they would never age out.
+  state.startupTimestamps = state.startupTimestamps.filter(
+    (ts) => ts <= now && now - ts <= windowMs,
+  );
+
+  try {
+    writeCrashTrackerState(stateDir, state);
+  } catch {
+    // Failing to persist must not stop the gateway from starting.
+  }
+
+  return state.startupTimestamps.length >= maxCrashes;
+}
+
+/**
+ * Save the current config as last-known-good.
+ * Should be called after gateway has been healthy for N seconds.
+ * Returns false if the config could not be copied.
+ */
+export function saveLastKnownGood(configPath: string): boolean {
+  try {
+    replaceFileAtomically(getLastKnownGoodPath(configPath), (tmpPath) => {
+      fs.copyFileSync(configPath, tmpPath);
+    });
+    return true;
+  } catch {
+    return false;
+  }
+}
+
+/**
+ * Check if a last-known-good config exists.
+ */
+export function hasLastKnownGood(configPath: string): boolean {
+  return fs.existsSync(getLastKnownGoodPath(configPath));
+}
+
+/**
+ * Revert to last-known-good config.
+ * Saves the current (bad) config as a failed config for debugging.
+ * Returns true if the config file was restored.
+ */
+export function revertToLastKnownGood(configPath: string, stateDir: string): boolean {
+  const lkgPath = getLastKnownGoodPath(configPath);
+  if (!fs.existsSync(lkgPath)) {
+    return false;
+  }
+
+  try {
+    // Save current (bad) config for debugging
+    if (fs.existsSync(configPath)) {
+      fs.copyFileSync(configPath, getFailedConfigPath(configPath));
+    }
+
+    // Revert to last-known-good
+    fs.copyFileSync(lkgPath, configPath);
+  } catch {
+    return false;
+  }
+
+  try {
+    // Record the revert
+    const state = readCrashTrackerState(stateDir);
+    state.lastRevertTimestamp = Date.now();
+    state.startupTimestamps = []; // Reset crash counter after revert
+    writeCrashTrackerState(stateDir, state);
+  } catch {
+    // The config is already restored; a bookkeeping failure is not a failed revert.
+  }
+
+  return true;
+}
+
+/**
+ * Clear the crash tracker (call after healthy startup confirmed).
+ * Throws if the state file cannot be written.
+ */
+export function clearCrashTracker(stateDir: string): void {
+  const state = readCrashTrackerState(stateDir);
+  state.startupTimestamps = [];
+  writeCrashTrackerState(stateDir, state);
+}
+
+/**
+ * Schedule saving last-known-good after healthy startup.
+ * Errors raised while saving or clearing are swallowed inside the timer.
+ * Returns a cancel function.
+ */
+export function scheduleLastKnownGoodSave(
+  configPath: string,
+  stateDir: string,
+  options: CrashTrackerOptions = {},
+): () => void {
+  const healthyAfterSeconds = options.healthyAfterSeconds ?? DEFAULT_HEALTHY_AFTER_SECONDS;
+  const timer = setTimeout(() => {
+    try {
+      saveLastKnownGood(configPath);
+      clearCrashTracker(stateDir);
+    } catch {
+      // An exception escaping a timer callback would crash the gateway.
+    }
+  }, healthyAfterSeconds * 1000);
+  // Don't keep the process alive just for this timer
+  timer.unref();
+  return () => clearTimeout(timer);
+}
diff --git a/src/gateway/server.impl.ts b/src/gateway/server.impl.ts
index 7a4c18b6593..7b4326d5603 100644
--- a/src/gateway/server.impl.ts
+++ b/src/gateway/server.impl.ts
@@ -19,7 +19,14 @@ import {
   readConfigFileSnapshot,
   writeConfigFile,
 } from "../config/config.js";
+import {
+  hasLastKnownGood,
+  recordStartupAndCheckCrashLoop,
+  revertToLastKnownGood,
+  scheduleLastKnownGoodSave,
+} from "../config/crash-tracker.js";
 import { formatConfigIssueLines } from "../config/issue-format.js";
+import { resolveStateDir } from "../config/paths.js";
 import { applyPluginAutoEnable } from "../config/plugin-auto-enable.js";
 import { resolveMainSessionKey } from "../config/sessions.js";
 import { clearAgentRunContext, onAgentEvent } from "../infra/agent-events.js";
@@ -363,6 +370,18 @@ export async function startGatewayServer(
   port = 18789,
   opts: GatewayServerOptions = {},
 ): Promise {
+  const stateDir = resolveStateDir();
+  const isCrashLoop = recordStartupAndCheckCrashLoop(stateDir);
+  if (isCrashLoop) {
+    if (!hasLastKnownGood(CONFIG_PATH)) {
+      log.warn("gateway: crash loop detected but no last-known-good config found");
+    } else if (revertToLastKnownGood(CONFIG_PATH, stateDir)) {
+      log.warn("gateway: crash loop detected; reverted to last-known-good config");
+    } else {
+      log.warn("gateway: crash loop detected but reverting to last-known-good config failed");
+    }
+  }
+
   const minimalTestGateway =
     process.env.VITEST === "1" && process.env.OPENCLAW_TEST_MINIMAL_GATEWAY === "1";
 
@@ -1327,6 +1346,10 @@ export async function startGatewayServer(
     httpServers,
   });
 
+  const cancelLastKnownGoodSave = scheduleLastKnownGoodSave(CONFIG_PATH, stateDir);
+
   return {
     close: async (opts) => {
+      // A gateway that is shutting down must not be recorded as healthy.
+      cancelLastKnownGoodSave();
       // Run gateway_stop plugin hook before shutdown