Merge 46cd41bde0b6bba075f9ceeac4d76081cc505fe0 into 8a05c05596ca9ba0735dafd8e359885de4c2c969
This commit is contained in:
commit
0b6c054b0d
174
src/config/crash-tracker.test.ts
Normal file
174
src/config/crash-tracker.test.ts
Normal file
@ -0,0 +1,174 @@
|
|||||||
|
import { describe, it, expect, beforeEach, afterEach } from "vitest";
|
||||||
|
import fs from "node:fs";
|
||||||
|
import os from "node:os";
|
||||||
|
import path from "node:path";
|
||||||
|
import {
|
||||||
|
readCrashTrackerState,
|
||||||
|
writeCrashTrackerState,
|
||||||
|
recordStartupAndCheckCrashLoop,
|
||||||
|
saveLastKnownGood,
|
||||||
|
hasLastKnownGood,
|
||||||
|
revertToLastKnownGood,
|
||||||
|
clearCrashTracker,
|
||||||
|
getLastKnownGoodPath,
|
||||||
|
getFailedConfigPath,
|
||||||
|
} from "./crash-tracker.js";
|
||||||
|
|
||||||
|
// Test suite for the crash-tracker module. Every test runs against a fresh
// temporary directory that holds a `state/` sub-directory and a small
// `openclaw.json` config file, so nothing leaks between tests.
describe("crash-tracker", () => {
  const GOOD_CONFIG = '{"valid": true}';
  const BAD_CONFIG = '{"bad": true}';

  let tmpDir: string;
  let stateDir: string;
  let configPath: string;

  beforeEach(() => {
    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "crash-tracker-test-"));
    stateDir = path.join(tmpDir, "state");
    fs.mkdirSync(stateDir, { recursive: true });
    configPath = path.join(tmpDir, "openclaw.json");
    fs.writeFileSync(configPath, GOOD_CONFIG);
  });

  afterEach(() => {
    fs.rmSync(tmpDir, { recursive: true, force: true });
  });

  describe("path helpers", () => {
    it("builds the failed-config path from an explicit timestamp", () => {
      expect(getFailedConfigPath(configPath, 123)).toBe(`${configPath}.failed-123`);
    });

    it("builds the last-known-good path next to the config", () => {
      expect(getLastKnownGoodPath(configPath)).toBe(`${configPath}.last-known-good`);
    });
  });

  describe("readCrashTrackerState", () => {
    it("returns empty state when file does not exist", () => {
      const state = readCrashTrackerState(stateDir);
      expect(state.startupTimestamps).toEqual([]);
    });

    it("reads existing state", () => {
      const expected = { startupTimestamps: [1000, 2000, 3000] };
      writeCrashTrackerState(stateDir, expected);
      const state = readCrashTrackerState(stateDir);
      expect(state.startupTimestamps).toEqual([1000, 2000, 3000]);
    });

    it("handles corrupted state file", () => {
      fs.writeFileSync(path.join(stateDir, "crash-tracker.json"), "not json");
      const state = readCrashTrackerState(stateDir);
      expect(state.startupTimestamps).toEqual([]);
    });
  });

  describe("recordStartupAndCheckCrashLoop", () => {
    it("returns false for first startup", () => {
      expect(recordStartupAndCheckCrashLoop(stateDir)).toBe(false);
    });

    it("returns false for 2 startups", () => {
      recordStartupAndCheckCrashLoop(stateDir);
      expect(recordStartupAndCheckCrashLoop(stateDir)).toBe(false);
    });

    it("returns true after 3 startups within window", () => {
      recordStartupAndCheckCrashLoop(stateDir);
      recordStartupAndCheckCrashLoop(stateDir);
      expect(recordStartupAndCheckCrashLoop(stateDir)).toBe(true);
    });

    it("respects custom maxCrashes", () => {
      const options = { maxCrashes: 5 };
      // The first four startups stay below the custom threshold...
      for (let i = 0; i < 4; i += 1) {
        expect(recordStartupAndCheckCrashLoop(stateDir, options)).toBe(false);
      }
      // ...and the fifth one reaches it.
      expect(recordStartupAndCheckCrashLoop(stateDir, options)).toBe(true);
    });

    it("respects custom windowMs", () => {
      // Two startups a few seconds ago: inside the default 60s window (so a
      // third startup would normally trip the detector), but outside a 1s one.
      const now = Date.now();
      writeCrashTrackerState(stateDir, {
        startupTimestamps: [now - 5_000, now - 4_000],
      });
      expect(recordStartupAndCheckCrashLoop(stateDir, { windowMs: 1_000 })).toBe(false);
    });

    it("ignores old timestamps outside window", () => {
      // Manually write old timestamps
      const oldTime = Date.now() - 120_000; // 2 minutes ago
      writeCrashTrackerState(stateDir, {
        startupTimestamps: [oldTime, oldTime + 1000],
      });
      expect(recordStartupAndCheckCrashLoop(stateDir)).toBe(false);
    });
  });

  describe("saveLastKnownGood", () => {
    it("copies config to .last-known-good", () => {
      expect(saveLastKnownGood(configPath)).toBe(true);
      const lkgContent = fs.readFileSync(getLastKnownGoodPath(configPath), "utf-8");
      expect(lkgContent).toBe(GOOD_CONFIG);
    });

    it("returns false if config does not exist", () => {
      expect(saveLastKnownGood(path.join(tmpDir, "nonexistent.json"))).toBe(false);
    });
  });

  describe("hasLastKnownGood", () => {
    it("returns false when no LKG exists", () => {
      expect(hasLastKnownGood(configPath)).toBe(false);
    });

    it("returns true after saving LKG", () => {
      saveLastKnownGood(configPath);
      expect(hasLastKnownGood(configPath)).toBe(true);
    });
  });

  describe("revertToLastKnownGood", () => {
    it("reverts config to last-known-good", () => {
      // Save good config as LKG, then overwrite it with a bad one.
      saveLastKnownGood(configPath);
      fs.writeFileSync(configPath, BAD_CONFIG);

      expect(revertToLastKnownGood(configPath, stateDir)).toBe(true);

      // Config should be restored
      expect(fs.readFileSync(configPath, "utf-8")).toBe(GOOD_CONFIG);
    });

    it("saves failed config for debugging", () => {
      saveLastKnownGood(configPath);
      fs.writeFileSync(configPath, BAD_CONFIG);

      revertToLastKnownGood(configPath, stateDir);

      // Find the failed config file
      const failedFiles = fs.readdirSync(tmpDir).filter((f) => f.includes(".failed-"));
      expect(failedFiles).toHaveLength(1);
      const [failedName = ""] = failedFiles;
      const failedContent = fs.readFileSync(path.join(tmpDir, failedName), "utf-8");
      expect(failedContent).toBe(BAD_CONFIG);
    });

    it("resets crash tracker after revert", () => {
      saveLastKnownGood(configPath);
      recordStartupAndCheckCrashLoop(stateDir);
      recordStartupAndCheckCrashLoop(stateDir);

      revertToLastKnownGood(configPath, stateDir);

      const state = readCrashTrackerState(stateDir);
      expect(state.startupTimestamps).toEqual([]);
      expect(state.lastRevertTimestamp).toBeGreaterThan(0);
    });

    it("returns false when no LKG exists", () => {
      expect(revertToLastKnownGood(configPath, stateDir)).toBe(false);
    });
  });

  describe("clearCrashTracker", () => {
    it("clears startup timestamps", () => {
      recordStartupAndCheckCrashLoop(stateDir);
      recordStartupAndCheckCrashLoop(stateDir);
      clearCrashTracker(stateDir);
      const state = readCrashTrackerState(stateDir);
      expect(state.startupTimestamps).toEqual([]);
    });
  });
});
|
||||||
171
src/config/crash-tracker.ts
Normal file
171
src/config/crash-tracker.ts
Normal file
@ -0,0 +1,171 @@
|
|||||||
|
/**
|
||||||
|
* Crash-loop detection and last-known-good config management.
|
||||||
|
*
|
||||||
|
* Tracks gateway startup timestamps and detects crash loops.
|
||||||
|
* When a crash loop is detected, automatically reverts to the
|
||||||
|
* last-known-good config.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import fs from "node:fs";
|
||||||
|
import path from "node:path";
|
||||||
|
|
||||||
|
/**
 * Crash-tracker state as it is serialized to the tracker JSON file.
 */
export type CrashTrackerState = {
  /** `Date.now()` values (epoch milliseconds), one per recorded startup. */
  startupTimestamps: number[];

  /** `Date.now()` value of the most recent revert; absent if none was recorded. */
  lastRevertTimestamp?: number;
};
|
||||||
|
|
||||||
|
/**
 * Tuning knobs for crash-loop detection. Every field is optional and falls
 * back to the module default when omitted.
 */
export type CrashTrackerOptions = {
  /**
   * Number of startups inside the window at which a crash loop is reported.
   * Default: 3
   */
  maxCrashes?: number;

  /**
   * Length of the sliding window, in milliseconds.
   * Default: 60_000 (60s)
   */
  windowMs?: number;

  /**
   * How long, in seconds, to wait before the startup is treated as healthy.
   * Default: 10
   */
  healthyAfterSeconds?: number;
};
|
||||||
|
|
||||||
|
// Defaults applied when the corresponding CrashTrackerOptions field is omitted.

/** Startups within the window at which a crash loop is reported. */
const DEFAULT_MAX_CRASHES = 3;
/** Sliding window length in milliseconds (60 seconds). */
const DEFAULT_WINDOW_MS = 60_000;
/** Seconds to wait before the current config is saved as last-known-good. */
const DEFAULT_HEALTHY_AFTER_SECONDS = 10;

// File naming.

/** Name of the state file kept inside the state directory. */
const CRASH_TRACKER_FILENAME = "crash-tracker.json";
/** Appended to the config path to name the last-known-good copy. */
const LAST_KNOWN_GOOD_SUFFIX = ".last-known-good";
/** Inserted between the config path and a timestamp to name a failed-config copy. */
const FAILED_CONFIG_PREFIX = ".failed-";
|
||||||
|
|
||||||
|
/**
 * Build the path of the crash-tracker state file inside `stateDir`.
 *
 * @param stateDir - Directory that holds the tracker file.
 * @returns `stateDir` joined with the tracker file name.
 */
export function getCrashTrackerPath(stateDir: string): string {
  const trackerPath = path.join(stateDir, CRASH_TRACKER_FILENAME);
  return trackerPath;
}
|
||||||
|
|
||||||
|
/**
 * Build the path of the last-known-good copy of a config file.
 *
 * @param configPath - Path of the live config file.
 * @returns `configPath` with the last-known-good suffix appended.
 */
export function getLastKnownGoodPath(configPath: string): string {
  return `${configPath}${LAST_KNOWN_GOOD_SUFFIX}`;
}
|
||||||
|
|
||||||
|
/**
 * Build the path under which a failed config is preserved.
 *
 * @param configPath - Path of the live config file.
 * @param timestamp - Value appended to the file name; `Date.now()` is used
 *   when it is omitted.
 * @returns `configPath`, the failed-config marker, then the timestamp.
 */
export function getFailedConfigPath(configPath: string, timestamp?: number): string {
  const stamp = timestamp ?? Date.now();
  return `${configPath}${FAILED_CONFIG_PREFIX}${stamp}`;
}
|
||||||
|
|
||||||
|
/**
 * Load the crash-tracker state from `stateDir`.
 *
 * The file content is treated as untrusted: anything that cannot be read,
 * is not valid JSON, is not an object, or lacks a `startupTimestamps` array
 * yields a fresh empty state. Within a usable file, only finite numbers are
 * kept as timestamps, and `lastRevertTimestamp` is kept only when it is a
 * finite number.
 *
 * @param stateDir - Directory that holds the tracker file.
 * @returns The validated state; never throws.
 */
export function readCrashTrackerState(stateDir: string): CrashTrackerState {
  const trackerPath = getCrashTrackerPath(stateDir);

  let parsed: unknown;
  try {
    parsed = JSON.parse(fs.readFileSync(trackerPath, "utf-8"));
  } catch {
    // Missing, unreadable, or corrupted file: start from scratch.
    return { startupTimestamps: [] };
  }

  if (typeof parsed !== "object" || parsed === null) {
    return { startupTimestamps: [] };
  }

  const candidate = parsed as Partial<Record<keyof CrashTrackerState, unknown>>;
  if (!Array.isArray(candidate.startupTimestamps)) {
    return { startupTimestamps: [] };
  }

  const isTimestamp = (value: unknown): value is number =>
    typeof value === "number" && Number.isFinite(value);

  const state: CrashTrackerState = {
    startupTimestamps: candidate.startupTimestamps.filter(isTimestamp),
  };
  if (isTimestamp(candidate.lastRevertTimestamp)) {
    state.lastRevertTimestamp = candidate.lastRevertTimestamp;
  }
  return state;
}
|
||||||
|
|
||||||
|
/**
 * Persist the crash-tracker state to `stateDir`, creating the directory if
 * needed.
 *
 * The state is written to a temporary sibling file and then renamed over the
 * tracker file, so a process that dies mid-write does not leave a truncated
 * tracker file behind. If the temp-file path fails for any reason, the
 * function falls back to writing the tracker file directly.
 *
 * @param stateDir - Directory that holds the tracker file.
 * @param state - State to serialize as pretty-printed JSON.
 * @throws Whatever `fs` throws if the directory cannot be created or the
 *   fallback direct write fails.
 */
export function writeCrashTrackerState(stateDir: string, state: CrashTrackerState): void {
  const trackerPath = getCrashTrackerPath(stateDir);
  fs.mkdirSync(path.dirname(trackerPath), { recursive: true });
  const payload = JSON.stringify(state, null, 2);

  // The pid keeps concurrent writers from sharing a temp file.
  const tmpPath = `${trackerPath}.${process.pid}.tmp`;
  try {
    fs.writeFileSync(tmpPath, payload);
    fs.renameSync(tmpPath, trackerPath);
    return;
  } catch {
    try {
      fs.rmSync(tmpPath, { force: true });
    } catch {
      // Best-effort cleanup; the direct write below is what matters.
    }
  }

  fs.writeFileSync(trackerPath, payload);
}
|
||||||
|
|
||||||
|
/**
 * Record a startup timestamp and check if we're in a crash loop.
 *
 * The current time is added to the stored startup timestamps, timestamps
 * further than `windowMs` away from now are dropped, and the pruned list is
 * written back. The distance is measured in both directions so that stale
 * timestamps lying far in the future (for example after the system clock was
 * set backwards) are pruned instead of being counted indefinitely.
 *
 * @param stateDir - Directory that holds the tracker file.
 * @param options - Optional overrides for `maxCrashes` and `windowMs`.
 * @returns `true` if the number of startups in the window, including this
 *   one, has reached `maxCrashes` (caller should revert config).
 */
export function recordStartupAndCheckCrashLoop(
  stateDir: string,
  options: CrashTrackerOptions = {},
): boolean {
  const maxCrashes = options.maxCrashes ?? DEFAULT_MAX_CRASHES;
  const windowMs = options.windowMs ?? DEFAULT_WINDOW_MS;
  const now = Date.now();

  const state = readCrashTrackerState(stateDir);

  // Add the current startup, then keep only timestamps inside the window.
  state.startupTimestamps = [...state.startupTimestamps, now].filter(
    (ts) => Math.abs(now - ts) <= windowMs,
  );

  writeCrashTrackerState(stateDir, state);

  return state.startupTimestamps.length >= maxCrashes;
}
|
||||||
|
|
||||||
|
/**
 * Snapshot the current config file as the last-known-good copy.
 * Intended to be called once the gateway has stayed healthy for a while.
 *
 * @param configPath - Path of the live config file.
 * @returns `true` if the copy succeeded, `false` if it failed for any reason
 *   (for example, the config file does not exist).
 */
export function saveLastKnownGood(configPath: string): boolean {
  const destination = getLastKnownGoodPath(configPath);
  let saved = false;
  try {
    fs.copyFileSync(configPath, destination);
    saved = true;
  } catch {
    // Copy failures are reported through the return value only.
  }
  return saved;
}
|
||||||
|
|
||||||
|
/**
 * Report whether a last-known-good copy exists for the given config file.
 *
 * @param configPath - Path of the live config file.
 * @returns `true` if the last-known-good file exists on disk.
 */
export function hasLastKnownGood(configPath: string): boolean {
  const lkgPath = getLastKnownGoodPath(configPath);
  return fs.existsSync(lkgPath);
}
|
||||||
|
|
||||||
|
/**
 * Revert to last-known-good config.
 *
 * Steps:
 * 1. If the live config exists, copy it to a timestamped failed-config file
 *    so it can be inspected later.
 * 2. Copy the last-known-good file over the live config.
 * 3. Record the revert time and reset the startup counter in the tracker
 *    state. This bookkeeping is best-effort: if it fails after the config
 *    has already been restored, the function still reports success.
 *
 * @param configPath - Path of the live config file.
 * @param stateDir - Directory that holds the tracker file.
 * @returns `true` if the config file was restored; `false` if there is no
 *   last-known-good copy or one of the file copies failed.
 */
export function revertToLastKnownGood(configPath: string, stateDir: string): boolean {
  const lkgPath = getLastKnownGoodPath(configPath);
  if (!fs.existsSync(lkgPath)) {
    return false;
  }

  try {
    // Save current (bad) config for debugging
    if (fs.existsSync(configPath)) {
      fs.copyFileSync(configPath, getFailedConfigPath(configPath));
    }

    // Revert to last-known-good
    fs.copyFileSync(lkgPath, configPath);
  } catch {
    return false;
  }

  try {
    // Record the revert and reset the crash counter.
    const state = readCrashTrackerState(stateDir);
    state.lastRevertTimestamp = Date.now();
    state.startupTimestamps = [];
    writeCrashTrackerState(stateDir, state);
  } catch {
    // The config is already restored; a failed state write must not turn
    // a successful revert into a reported failure.
  }

  return true;
}
|
||||||
|
|
||||||
|
/**
 * Reset the recorded startup timestamps (call after a healthy startup has
 * been confirmed). Other fields of the stored state are carried over as-is.
 *
 * @param stateDir - Directory that holds the tracker file.
 */
export function clearCrashTracker(stateDir: string): void {
  const cleared: CrashTrackerState = {
    ...readCrashTrackerState(stateDir),
    startupTimestamps: [],
  };
  writeCrashTrackerState(stateDir, cleared);
}
|
||||||
|
|
||||||
|
/**
 * Schedule saving last-known-good after healthy startup.
 *
 * After `healthyAfterSeconds` the current config is saved as last-known-good
 * and the startup counter is cleared. The timer is unref'ed so it never
 * keeps the process alive on its own.
 *
 * Errors raised inside the timer callback are swallowed: an exception
 * escaping a timer callback is uncaught, and this bookkeeping must not be
 * able to take down a gateway that is otherwise running fine.
 *
 * @param configPath - Path of the live config file.
 * @param stateDir - Directory that holds the tracker file.
 * @param options - Optional override for `healthyAfterSeconds`.
 * @returns A cancel function that clears the pending timer.
 */
export function scheduleLastKnownGoodSave(
  configPath: string,
  stateDir: string,
  options: CrashTrackerOptions = {},
): () => void {
  const healthyAfterSeconds = options.healthyAfterSeconds ?? DEFAULT_HEALTHY_AFTER_SECONDS;

  const timer = setTimeout(() => {
    try {
      saveLastKnownGood(configPath);
      clearCrashTracker(stateDir);
    } catch {
      // Best-effort, like saveLastKnownGood itself: never throw from a timer.
    }
  }, healthyAfterSeconds * 1000);

  // Don't keep the process alive just for this timer
  timer.unref();

  return () => clearTimeout(timer);
}
|
||||||
@ -19,7 +19,14 @@ import {
|
|||||||
readConfigFileSnapshot,
|
readConfigFileSnapshot,
|
||||||
writeConfigFile,
|
writeConfigFile,
|
||||||
} from "../config/config.js";
|
} from "../config/config.js";
|
||||||
|
import {
|
||||||
|
hasLastKnownGood,
|
||||||
|
recordStartupAndCheckCrashLoop,
|
||||||
|
revertToLastKnownGood,
|
||||||
|
scheduleLastKnownGoodSave,
|
||||||
|
} from "../config/crash-tracker.js";
|
||||||
import { formatConfigIssueLines } from "../config/issue-format.js";
|
import { formatConfigIssueLines } from "../config/issue-format.js";
|
||||||
|
import { resolveStateDir } from "../config/paths.js";
|
||||||
import { applyPluginAutoEnable } from "../config/plugin-auto-enable.js";
|
import { applyPluginAutoEnable } from "../config/plugin-auto-enable.js";
|
||||||
import { resolveMainSessionKey } from "../config/sessions.js";
|
import { resolveMainSessionKey } from "../config/sessions.js";
|
||||||
import { clearAgentRunContext, onAgentEvent } from "../infra/agent-events.js";
|
import { clearAgentRunContext, onAgentEvent } from "../infra/agent-events.js";
|
||||||
@ -363,6 +370,17 @@ export async function startGatewayServer(
|
|||||||
port = 18789,
|
port = 18789,
|
||||||
opts: GatewayServerOptions = {},
|
opts: GatewayServerOptions = {},
|
||||||
): Promise<GatewayServer> {
|
): Promise<GatewayServer> {
|
||||||
|
const stateDir = resolveStateDir();
|
||||||
|
const isCrashLoop = recordStartupAndCheckCrashLoop(stateDir);
|
||||||
|
if (isCrashLoop) {
|
||||||
|
if (hasLastKnownGood(CONFIG_PATH)) {
|
||||||
|
log.warn("gateway: crash loop detected; reverting to last-known-good config");
|
||||||
|
revertToLastKnownGood(CONFIG_PATH, stateDir);
|
||||||
|
} else {
|
||||||
|
log.warn("gateway: crash loop detected but no last-known-good config found");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const minimalTestGateway =
|
const minimalTestGateway =
|
||||||
process.env.VITEST === "1" && process.env.OPENCLAW_TEST_MINIMAL_GATEWAY === "1";
|
process.env.VITEST === "1" && process.env.OPENCLAW_TEST_MINIMAL_GATEWAY === "1";
|
||||||
|
|
||||||
@ -1327,6 +1345,8 @@ export async function startGatewayServer(
|
|||||||
httpServers,
|
httpServers,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
scheduleLastKnownGoodSave(CONFIG_PATH, stateDir);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
close: async (opts) => {
|
close: async (opts) => {
|
||||||
// Run gateway_stop plugin hook before shutdown
|
// Run gateway_stop plugin hook before shutdown
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user