Merge 46cd41bde0b6bba075f9ceeac4d76081cc505fe0 into 8a05c05596ca9ba0735dafd8e359885de4c2c969
This commit is contained in:
commit
0b6c054b0d
174
src/config/crash-tracker.test.ts
Normal file
174
src/config/crash-tracker.test.ts
Normal file
@ -0,0 +1,174 @@
|
||||
import { describe, it, expect, beforeEach, afterEach } from "vitest";
|
||||
import fs from "node:fs";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import {
|
||||
readCrashTrackerState,
|
||||
writeCrashTrackerState,
|
||||
recordStartupAndCheckCrashLoop,
|
||||
saveLastKnownGood,
|
||||
hasLastKnownGood,
|
||||
revertToLastKnownGood,
|
||||
clearCrashTracker,
|
||||
getLastKnownGoodPath,
|
||||
getFailedConfigPath,
|
||||
} from "./crash-tracker.js";
|
||||
|
||||
describe("crash-tracker", () => {
  // Config payloads used throughout the suite.
  const GOOD_CONFIG = '{"valid": true}';
  const BAD_CONFIG = '{"bad": true}';

  let tmpDir: string;
  let stateDir: string;
  let configPath: string;

  /** Records `count` consecutive startups and returns the verdict of the last one. */
  const recordStartups = (
    count: number,
    options?: Parameters<typeof recordStartupAndCheckCrashLoop>[1],
  ): boolean => {
    let verdict = false;
    for (let i = 0; i < count; i += 1) {
      verdict = recordStartupAndCheckCrashLoop(stateDir, options);
    }
    return verdict;
  };

  /** Reads the config file under test as text. */
  const readConfig = (): string => fs.readFileSync(configPath, "utf-8");

  // Every test gets a fresh temp directory holding a state dir and a valid config.
  beforeEach(() => {
    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "crash-tracker-test-"));
    stateDir = path.join(tmpDir, "state");
    fs.mkdirSync(stateDir, { recursive: true });
    configPath = path.join(tmpDir, "openclaw.json");
    fs.writeFileSync(configPath, GOOD_CONFIG);
  });

  afterEach(() => {
    fs.rmSync(tmpDir, { recursive: true, force: true });
  });

  describe("readCrashTrackerState", () => {
    it("returns empty state when file does not exist", () => {
      const { startupTimestamps } = readCrashTrackerState(stateDir);
      expect(startupTimestamps).toEqual([]);
    });

    it("reads existing state", () => {
      writeCrashTrackerState(stateDir, { startupTimestamps: [1000, 2000, 3000] });
      const { startupTimestamps } = readCrashTrackerState(stateDir);
      expect(startupTimestamps).toEqual([1000, 2000, 3000]);
    });

    it("handles corrupted state file", () => {
      const trackerFile = path.join(stateDir, "crash-tracker.json");
      fs.writeFileSync(trackerFile, "not json");
      const { startupTimestamps } = readCrashTrackerState(stateDir);
      expect(startupTimestamps).toEqual([]);
    });
  });

  describe("recordStartupAndCheckCrashLoop", () => {
    it("returns false for first startup", () => {
      expect(recordStartups(1)).toBe(false);
    });

    it("returns false for 2 startups", () => {
      expect(recordStartups(2)).toBe(false);
    });

    it("returns true after 3 startups within window", () => {
      expect(recordStartups(3)).toBe(true);
    });

    it("respects custom maxCrashes", () => {
      // Four startups stay below a threshold of five.
      expect(recordStartups(4, { maxCrashes: 5 })).toBe(false);
    });

    it("ignores old timestamps outside window", () => {
      // Seed the tracker with two startups from two minutes ago.
      const twoMinutesAgo = Date.now() - 120_000;
      writeCrashTrackerState(stateDir, {
        startupTimestamps: [twoMinutesAgo, twoMinutesAgo + 1000],
      });
      expect(recordStartups(1)).toBe(false);
    });
  });

  describe("saveLastKnownGood", () => {
    it("copies config to .last-known-good", () => {
      expect(saveLastKnownGood(configPath)).toBe(true);
      const snapshot = fs.readFileSync(getLastKnownGoodPath(configPath), "utf-8");
      expect(snapshot).toBe(GOOD_CONFIG);
    });

    it("returns false if config does not exist", () => {
      const missingConfig = path.join(tmpDir, "nonexistent.json");
      expect(saveLastKnownGood(missingConfig)).toBe(false);
    });
  });

  describe("hasLastKnownGood", () => {
    it("returns false when no LKG exists", () => {
      expect(hasLastKnownGood(configPath)).toBe(false);
    });

    it("returns true after saving LKG", () => {
      saveLastKnownGood(configPath);
      expect(hasLastKnownGood(configPath)).toBe(true);
    });
  });

  describe("revertToLastKnownGood", () => {
    it("reverts config to last-known-good", () => {
      // Snapshot the good config, then replace it with a bad one.
      saveLastKnownGood(configPath);
      fs.writeFileSync(configPath, BAD_CONFIG);

      const reverted = revertToLastKnownGood(configPath, stateDir);
      expect(reverted).toBe(true);

      // The good config must be back in place.
      expect(readConfig()).toBe(GOOD_CONFIG);
    });

    it("saves failed config for debugging", () => {
      saveLastKnownGood(configPath);
      fs.writeFileSync(configPath, BAD_CONFIG);

      revertToLastKnownGood(configPath, stateDir);

      // Exactly one ".failed-" copy should sit next to the config.
      const failedFiles = fs.readdirSync(tmpDir).filter((name) => name.includes(".failed-"));
      expect(failedFiles.length).toBe(1);
      const failedContent = fs.readFileSync(path.join(tmpDir, failedFiles[0]!), "utf-8");
      expect(failedContent).toBe(BAD_CONFIG);
    });

    it("resets crash tracker after revert", () => {
      saveLastKnownGood(configPath);
      recordStartups(2);

      revertToLastKnownGood(configPath, stateDir);

      const trackerState = readCrashTrackerState(stateDir);
      expect(trackerState.startupTimestamps).toEqual([]);
      expect(trackerState.lastRevertTimestamp).toBeGreaterThan(0);
    });

    it("returns false when no LKG exists", () => {
      expect(revertToLastKnownGood(configPath, stateDir)).toBe(false);
    });
  });

  describe("clearCrashTracker", () => {
    it("clears startup timestamps", () => {
      recordStartups(2);
      clearCrashTracker(stateDir);
      const { startupTimestamps } = readCrashTrackerState(stateDir);
      expect(startupTimestamps).toEqual([]);
    });
  });
});
|
||||
171
src/config/crash-tracker.ts
Normal file
171
src/config/crash-tracker.ts
Normal file
@ -0,0 +1,171 @@
|
||||
/**
|
||||
* Crash-loop detection and last-known-good config management.
|
||||
*
|
||||
* Tracks gateway startup timestamps and detects crash loops.
|
||||
* When a crash loop is detected, automatically reverts to the
|
||||
* last-known-good config.
|
||||
*/
|
||||
|
||||
import fs from "node:fs";
|
||||
import path from "node:path";
|
||||
|
||||
/** Crash-tracker state as persisted to the tracker JSON file. */
export type CrashTrackerState = {
  /** `Date.now()` values (epoch milliseconds) of the recorded startups. */
  startupTimestamps: Array<number>;
  /** `Date.now()` value of the most recent config revert, when one was recorded. */
  lastRevertTimestamp?: number;
};
|
||||
|
||||
/** Tuning knobs for crash-loop detection; every field is optional. */
export type CrashTrackerOptions = {
  /** Number of startups inside the window that counts as a crash loop (default 3). */
  maxCrashes?: number;
  /** Length of the detection window in milliseconds (default 60_000, i.e. 60s). */
  windowMs?: number;
  /** Delay in seconds before a startup is treated as healthy (default 10). */
  healthyAfterSeconds?: number;
};
|
||||
|
||||
// Detection defaults, used when the matching CrashTrackerOptions field is omitted.
const DEFAULT_MAX_CRASHES = 3;
const DEFAULT_WINDOW_MS = 60 * 1000;
const DEFAULT_HEALTHY_AFTER_SECONDS = 10;

// Name of the tracker file inside the state directory.
const CRASH_TRACKER_FILENAME = "crash-tracker.json";

// Strings appended to the config path to name the snapshot and failed-config copies.
const LAST_KNOWN_GOOD_SUFFIX = ".last-known-good";
const FAILED_CONFIG_PREFIX = ".failed-";
|
||||
|
||||
/** Builds the path of the crash-tracker state file inside `stateDir`. */
export function getCrashTrackerPath(stateDir: string): string {
  const trackerFile = path.join(stateDir, CRASH_TRACKER_FILENAME);
  return trackerFile;
}
|
||||
|
||||
/** Builds the path of the last-known-good snapshot that sits next to `configPath`. */
export function getLastKnownGoodPath(configPath: string): string {
  return `${configPath}${LAST_KNOWN_GOOD_SUFFIX}`;
}
|
||||
|
||||
/**
 * Builds the path under which a failed config is kept for debugging.
 *
 * @param configPath - Path of the live config file.
 * @param timestamp - Value appended to the name; falls back to `Date.now()` when omitted.
 */
export function getFailedConfigPath(configPath: string, timestamp?: number): string {
  const stamp = timestamp ?? Date.now();
  return `${configPath}${FAILED_CONFIG_PREFIX}${stamp}`;
}
|
||||
|
||||
/**
 * Load the crash-tracker state from `stateDir`.
 *
 * Never throws: a missing, unreadable, or malformed tracker file yields an
 * empty state. The parsed JSON is validated instead of being trusted, so the
 * returned value always matches `CrashTrackerState`: entries of
 * `startupTimestamps` that are not finite numbers are dropped, and
 * `lastRevertTimestamp` is kept only when it is a finite number.
 *
 * @param stateDir - Directory that holds the tracker file.
 * @returns The persisted state, or `{ startupTimestamps: [] }` when none is usable.
 */
export function readCrashTrackerState(stateDir: string): CrashTrackerState {
  const trackerPath = getCrashTrackerPath(stateDir);

  let parsed: unknown;
  try {
    parsed = JSON.parse(fs.readFileSync(trackerPath, "utf-8"));
  } catch {
    // Deliberate best-effort: any read or parse failure means "no history".
    return { startupTimestamps: [] };
  }

  if (typeof parsed !== "object" || parsed === null) {
    return { startupTimestamps: [] };
  }

  const { startupTimestamps, lastRevertTimestamp } = parsed as Record<string, unknown>;
  if (!Array.isArray(startupTimestamps)) {
    return { startupTimestamps: [] };
  }

  const state: CrashTrackerState = {
    startupTimestamps: startupTimestamps.filter(
      (ts): ts is number => typeof ts === "number" && Number.isFinite(ts),
    ),
  };
  if (typeof lastRevertTimestamp === "number" && Number.isFinite(lastRevertTimestamp)) {
    state.lastRevertTimestamp = lastRevertTimestamp;
  }
  return state;
}
|
||||
|
||||
/**
 * Persist `state` as pretty-printed JSON in the tracker file under `stateDir`,
 * creating the containing directory first when it is missing.
 */
export function writeCrashTrackerState(stateDir: string, state: CrashTrackerState): void {
  const target = getCrashTrackerPath(stateDir);
  const serialized = JSON.stringify(state, null, 2);
  fs.mkdirSync(path.dirname(target), { recursive: true });
  fs.writeFileSync(target, serialized);
}
|
||||
|
||||
/**
 * Record a startup timestamp and check if we're in a crash loop.
 *
 * The current time is appended to the persisted startup list, timestamps
 * outside the detection window are pruned, and the result is written back.
 * A timestamp is kept only when it lies within `windowMs` of now in either
 * direction, so entries far in the future (for example after the system clock
 * was set back) cannot count toward a crash loop indefinitely.
 *
 * @param stateDir - Directory that holds the tracker file.
 * @param options - Optional `maxCrashes` (default 3) and `windowMs` (default 60_000).
 * @returns true if a crash loop is detected (caller should revert config).
 */
export function recordStartupAndCheckCrashLoop(
  stateDir: string,
  options: CrashTrackerOptions = {},
): boolean {
  const maxCrashes = options.maxCrashes ?? DEFAULT_MAX_CRASHES;
  const windowMs = options.windowMs ?? DEFAULT_WINDOW_MS;
  const now = Date.now();

  const state = readCrashTrackerState(stateDir);

  // Add current startup
  state.startupTimestamps.push(now);

  // Prune timestamps outside the window. The absolute distance is used because
  // `now - ts` is negative for a future timestamp and would otherwise always
  // pass a plain `<= windowMs` check.
  state.startupTimestamps = state.startupTimestamps.filter(
    (ts) => Math.abs(now - ts) <= windowMs,
  );

  writeCrashTrackerState(stateDir, state);

  return state.startupTimestamps.length >= maxCrashes;
}
|
||||
|
||||
/**
 * Save the current config as last-known-good.
 * Should be called after gateway has been healthy for N seconds.
 *
 * The config is copied to a temporary sibling file and then renamed over the
 * snapshot path. A copy that fails midway therefore leaves any existing
 * snapshot intact instead of truncating it in place.
 *
 * @param configPath - Path of the live config file.
 * @returns true when the snapshot was written, false on any failure
 *   (including a missing config file).
 */
export function saveLastKnownGood(configPath: string): boolean {
  const lkgPath = getLastKnownGoodPath(configPath);
  // The pid keeps temp names from different processes apart.
  const tmpPath = `${lkgPath}.${process.pid}.tmp`;
  try {
    fs.copyFileSync(configPath, tmpPath);
    fs.renameSync(tmpPath, lkgPath);
    return true;
  } catch {
    try {
      fs.rmSync(tmpPath, { force: true });
    } catch {
      // Best-effort cleanup; the save is reported as failed either way.
    }
    return false;
  }
}
|
||||
|
||||
/**
 * Report whether a last-known-good snapshot is present on disk for `configPath`.
 */
export function hasLastKnownGood(configPath: string): boolean {
  const snapshotPath = getLastKnownGoodPath(configPath);
  return fs.existsSync(snapshotPath);
}
|
||||
|
||||
/**
 * Revert to last-known-good config.
 * Saves the current (bad) config as a failed config for debugging.
 *
 * The return value reflects only whether the config file was restored. The
 * crash-tracker bookkeeping that follows is best-effort, so a failure to
 * record the revert no longer reports an already completed revert as failed.
 *
 * @param configPath - Path of the live config file to overwrite.
 * @param stateDir - Directory that holds the tracker file.
 * @returns true if the config was restored; false when no snapshot exists or
 *   the backup/restore copy failed.
 */
export function revertToLastKnownGood(configPath: string, stateDir: string): boolean {
  const lkgPath = getLastKnownGoodPath(configPath);
  if (!fs.existsSync(lkgPath)) {
    return false;
  }

  try {
    // Save current (bad) config for debugging. A failure here aborts the
    // revert so the current config is never overwritten without a copy.
    const failedPath = getFailedConfigPath(configPath);
    if (fs.existsSync(configPath)) {
      fs.copyFileSync(configPath, failedPath);
    }

    // Revert to last-known-good
    fs.copyFileSync(lkgPath, configPath);
  } catch {
    return false;
  }

  try {
    // Record the revert
    const state = readCrashTrackerState(stateDir);
    state.lastRevertTimestamp = Date.now();
    state.startupTimestamps = []; // Reset crash counter after revert
    writeCrashTrackerState(stateDir, state);
  } catch {
    // Deliberately ignored: the config is already restored, so the revert
    // succeeded even though the tracker could not be updated.
  }

  return true;
}
|
||||
|
||||
/**
 * Empty the recorded startup timestamps while keeping the rest of the
 * persisted state (call after healthy startup confirmed).
 */
export function clearCrashTracker(stateDir: string): void {
  const current = readCrashTrackerState(stateDir);
  writeCrashTrackerState(stateDir, { ...current, startupTimestamps: [] });
}
|
||||
|
||||
/**
 * Schedule saving last-known-good after healthy startup.
 *
 * After `healthyAfterSeconds` (default 10) the current config is saved as
 * last-known-good and the crash tracker is cleared. The callback is guarded
 * because clearing the tracker writes to disk and can throw; an exception
 * escaping a timer callback would be uncaught and could take down a gateway
 * that is otherwise running fine.
 *
 * @param configPath - Path of the live config file.
 * @param stateDir - Directory that holds the tracker file.
 * @param options - Optional `healthyAfterSeconds` override.
 * @returns A cancel function that clears the pending timer.
 */
export function scheduleLastKnownGoodSave(
  configPath: string,
  stateDir: string,
  options: CrashTrackerOptions = {},
): () => void {
  const healthyAfterSeconds = options.healthyAfterSeconds ?? DEFAULT_HEALTHY_AFTER_SECONDS;
  const timer = setTimeout(() => {
    try {
      saveLastKnownGood(configPath);
      clearCrashTracker(stateDir);
    } catch {
      // Best-effort bookkeeping: a failed tracker write must not crash the process.
    }
  }, healthyAfterSeconds * 1000);
  // Don't keep the process alive just for this timer
  timer.unref();
  return () => clearTimeout(timer);
}
|
||||
@ -19,7 +19,14 @@ import {
|
||||
readConfigFileSnapshot,
|
||||
writeConfigFile,
|
||||
} from "../config/config.js";
|
||||
import {
|
||||
hasLastKnownGood,
|
||||
recordStartupAndCheckCrashLoop,
|
||||
revertToLastKnownGood,
|
||||
scheduleLastKnownGoodSave,
|
||||
} from "../config/crash-tracker.js";
|
||||
import { formatConfigIssueLines } from "../config/issue-format.js";
|
||||
import { resolveStateDir } from "../config/paths.js";
|
||||
import { applyPluginAutoEnable } from "../config/plugin-auto-enable.js";
|
||||
import { resolveMainSessionKey } from "../config/sessions.js";
|
||||
import { clearAgentRunContext, onAgentEvent } from "../infra/agent-events.js";
|
||||
@ -363,6 +370,17 @@ export async function startGatewayServer(
|
||||
port = 18789,
|
||||
opts: GatewayServerOptions = {},
|
||||
): Promise<GatewayServer> {
|
||||
const stateDir = resolveStateDir();
|
||||
const isCrashLoop = recordStartupAndCheckCrashLoop(stateDir);
|
||||
if (isCrashLoop) {
|
||||
if (hasLastKnownGood(CONFIG_PATH)) {
|
||||
log.warn("gateway: crash loop detected; reverting to last-known-good config");
|
||||
revertToLastKnownGood(CONFIG_PATH, stateDir);
|
||||
} else {
|
||||
log.warn("gateway: crash loop detected but no last-known-good config found");
|
||||
}
|
||||
}
|
||||
|
||||
const minimalTestGateway =
|
||||
process.env.VITEST === "1" && process.env.OPENCLAW_TEST_MINIMAL_GATEWAY === "1";
|
||||
|
||||
@ -1327,6 +1345,8 @@ export async function startGatewayServer(
|
||||
httpServers,
|
||||
});
|
||||
|
||||
scheduleLastKnownGoodSave(CONFIG_PATH, stateDir);
|
||||
|
||||
return {
|
||||
close: async (opts) => {
|
||||
// Run gateway_stop plugin hook before shutdown
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user