From 7e4761ccfd7fd6576c6ecd4de492fc968cb12e3e Mon Sep 17 00:00:00 2001 From: Claude Code Date: Mon, 16 Feb 2026 03:16:21 +0000 Subject: [PATCH 1/5] feat: crash-loop detection and last-known-good config rollback Phase 1 of atomic config management: - Track gateway startup timestamps in crash-tracker.json - Detect crash loops (3 or more startups within 60s) - Save last-known-good config after healthy startup (10s) - Auto-revert to last-known-good on crash loop detection - Save failed config for debugging Closes #17700 (Phase 1) --- src/config/crash-tracker.test.ts | 174 +++++++++++++++++++++++++++++++ src/config/crash-tracker.ts | 171 ++++++++++++++++++++++++++++++ 2 files changed, 345 insertions(+) create mode 100644 src/config/crash-tracker.test.ts create mode 100644 src/config/crash-tracker.ts diff --git a/src/config/crash-tracker.test.ts b/src/config/crash-tracker.test.ts new file mode 100644 index 00000000000..19f166d1449 --- /dev/null +++ b/src/config/crash-tracker.test.ts @@ -0,0 +1,174 @@ +import { describe, it, expect, beforeEach, afterEach } from "vitest"; +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { + readCrashTrackerState, + writeCrashTrackerState, + recordStartupAndCheckCrashLoop, + saveLastKnownGood, + hasLastKnownGood, + revertToLastKnownGood, + clearCrashTracker, + getLastKnownGoodPath, + getFailedConfigPath, +} from "./crash-tracker.js"; + +describe("crash-tracker", () => { + let tmpDir: string; + let stateDir: string; + let configPath: string; + + beforeEach(() => { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "crash-tracker-test-")); + stateDir = path.join(tmpDir, "state"); + fs.mkdirSync(stateDir, { recursive: true }); + configPath = path.join(tmpDir, "openclaw.json"); + fs.writeFileSync(configPath, '{"valid": true}'); + }); + + afterEach(() => { + fs.rmSync(tmpDir, { recursive: true, force: true }); + }); + + describe("readCrashTrackerState", () => { + it("returns empty state when file does not exist", () 
=> { + const state = readCrashTrackerState(stateDir); + expect(state.startupTimestamps).toEqual([]); + }); + + it("reads existing state", () => { + const expected = { startupTimestamps: [1000, 2000, 3000] }; + writeCrashTrackerState(stateDir, expected); + const state = readCrashTrackerState(stateDir); + expect(state.startupTimestamps).toEqual([1000, 2000, 3000]); + }); + + it("handles corrupted state file", () => { + fs.writeFileSync(path.join(stateDir, "crash-tracker.json"), "not json"); + const state = readCrashTrackerState(stateDir); + expect(state.startupTimestamps).toEqual([]); + }); + }); + + describe("recordStartupAndCheckCrashLoop", () => { + it("returns false for first startup", () => { + const isCrashLoop = recordStartupAndCheckCrashLoop(stateDir); + expect(isCrashLoop).toBe(false); + }); + + it("returns false for 2 startups", () => { + recordStartupAndCheckCrashLoop(stateDir); + const isCrashLoop = recordStartupAndCheckCrashLoop(stateDir); + expect(isCrashLoop).toBe(false); + }); + + it("returns true after 3 startups within window", () => { + recordStartupAndCheckCrashLoop(stateDir); + recordStartupAndCheckCrashLoop(stateDir); + const isCrashLoop = recordStartupAndCheckCrashLoop(stateDir); + expect(isCrashLoop).toBe(true); + }); + + it("respects custom maxCrashes", () => { + recordStartupAndCheckCrashLoop(stateDir, { maxCrashes: 5 }); + recordStartupAndCheckCrashLoop(stateDir, { maxCrashes: 5 }); + recordStartupAndCheckCrashLoop(stateDir, { maxCrashes: 5 }); + const isCrashLoop = recordStartupAndCheckCrashLoop(stateDir, { maxCrashes: 5 }); + expect(isCrashLoop).toBe(false); + }); + + it("ignores old timestamps outside window", () => { + // Manually write old timestamps + const oldTime = Date.now() - 120_000; // 2 minutes ago + writeCrashTrackerState(stateDir, { + startupTimestamps: [oldTime, oldTime + 1000], + }); + const isCrashLoop = recordStartupAndCheckCrashLoop(stateDir); + expect(isCrashLoop).toBe(false); + }); + }); + + 
describe("saveLastKnownGood", () => { + it("copies config to .last-known-good", () => { + const result = saveLastKnownGood(configPath); + expect(result).toBe(true); + const lkgContent = fs.readFileSync(getLastKnownGoodPath(configPath), "utf-8"); + expect(lkgContent).toBe('{"valid": true}'); + }); + + it("returns false if config does not exist", () => { + const result = saveLastKnownGood(path.join(tmpDir, "nonexistent.json")); + expect(result).toBe(false); + }); + }); + + describe("hasLastKnownGood", () => { + it("returns false when no LKG exists", () => { + expect(hasLastKnownGood(configPath)).toBe(false); + }); + + it("returns true after saving LKG", () => { + saveLastKnownGood(configPath); + expect(hasLastKnownGood(configPath)).toBe(true); + }); + }); + + describe("revertToLastKnownGood", () => { + it("reverts config to last-known-good", () => { + // Save good config as LKG + saveLastKnownGood(configPath); + + // Write bad config + fs.writeFileSync(configPath, '{"bad": true}'); + + // Revert + const result = revertToLastKnownGood(configPath, stateDir); + expect(result).toBe(true); + + // Config should be restored + const content = fs.readFileSync(configPath, "utf-8"); + expect(content).toBe('{"valid": true}'); + }); + + it("saves failed config for debugging", () => { + saveLastKnownGood(configPath); + fs.writeFileSync(configPath, '{"bad": true}'); + + revertToLastKnownGood(configPath, stateDir); + + // Find the failed config file + const files = fs.readdirSync(tmpDir).filter((f) => f.includes(".failed-")); + expect(files.length).toBe(1); + const failedContent = fs.readFileSync(path.join(tmpDir, files[0]!), "utf-8"); + expect(failedContent).toBe('{"bad": true}'); + }); + + it("resets crash tracker after revert", () => { + saveLastKnownGood(configPath); + recordStartupAndCheckCrashLoop(stateDir); + recordStartupAndCheckCrashLoop(stateDir); + + revertToLastKnownGood(configPath, stateDir); + + const state = readCrashTrackerState(stateDir); + 
expect(state.startupTimestamps).toEqual([]); + expect(state.lastRevertTimestamp).toBeGreaterThan(0); + }); + + it("returns false when no LKG exists", () => { + const result = revertToLastKnownGood(configPath, stateDir); + expect(result).toBe(false); + }); + }); + + describe("clearCrashTracker", () => { + it("clears startup timestamps", () => { + recordStartupAndCheckCrashLoop(stateDir); + recordStartupAndCheckCrashLoop(stateDir); + clearCrashTracker(stateDir); + const state = readCrashTrackerState(stateDir); + expect(state.startupTimestamps).toEqual([]); + }); + }); +}); diff --git a/src/config/crash-tracker.ts b/src/config/crash-tracker.ts new file mode 100644 index 00000000000..fe11e7cddde --- /dev/null +++ b/src/config/crash-tracker.ts @@ -0,0 +1,171 @@ +/** + * Crash-loop detection and last-known-good config management. + * + * Tracks gateway startup timestamps and detects crash loops. + * When a crash loop is detected, automatically reverts to the + * last-known-good config. + */ + +import fs from "node:fs"; +import path from "node:path"; + +export type CrashTrackerState = { + startupTimestamps: number[]; + lastRevertTimestamp?: number; +}; + +export type CrashTrackerOptions = { + /** Max crashes within the window before triggering revert. Default: 3 */ + maxCrashes?: number; + /** Time window in ms. Default: 60_000 (60s) */ + windowMs?: number; + /** Seconds to wait before marking startup as healthy. 
Default: 10 */ + healthyAfterSeconds?: number; +}; + +const DEFAULT_MAX_CRASHES = 3; +const DEFAULT_WINDOW_MS = 60_000; +const DEFAULT_HEALTHY_AFTER_SECONDS = 10; +const CRASH_TRACKER_FILENAME = "crash-tracker.json"; +const LAST_KNOWN_GOOD_SUFFIX = ".last-known-good"; +const FAILED_CONFIG_PREFIX = ".failed-"; + +export function getCrashTrackerPath(stateDir: string): string { + return path.join(stateDir, CRASH_TRACKER_FILENAME); +} + +export function getLastKnownGoodPath(configPath: string): string { + return configPath + LAST_KNOWN_GOOD_SUFFIX; +} + +export function getFailedConfigPath(configPath: string, timestamp?: number): string { + const ts = timestamp ?? Date.now(); + return configPath + FAILED_CONFIG_PREFIX + ts; +} + +export function readCrashTrackerState(stateDir: string): CrashTrackerState { + const trackerPath = getCrashTrackerPath(stateDir); + try { + const raw = fs.readFileSync(trackerPath, "utf-8"); + const parsed = JSON.parse(raw) as CrashTrackerState; + if (!Array.isArray(parsed.startupTimestamps)) { + return { startupTimestamps: [] }; + } + return parsed; + } catch { + return { startupTimestamps: [] }; + } +} + +export function writeCrashTrackerState(stateDir: string, state: CrashTrackerState): void { + const trackerPath = getCrashTrackerPath(stateDir); + fs.mkdirSync(path.dirname(trackerPath), { recursive: true }); + fs.writeFileSync(trackerPath, JSON.stringify(state, null, 2)); +} + +/** + * Record a startup timestamp and check if we're in a crash loop. + * Returns true if a crash loop is detected (caller should revert config). + */ +export function recordStartupAndCheckCrashLoop( + stateDir: string, + options: CrashTrackerOptions = {}, +): boolean { + const maxCrashes = options.maxCrashes ?? DEFAULT_MAX_CRASHES; + const windowMs = options.windowMs ?? 
DEFAULT_WINDOW_MS; + const now = Date.now(); + + const state = readCrashTrackerState(stateDir); + + // Add current startup + state.startupTimestamps.push(now); + + // Prune timestamps outside the window + state.startupTimestamps = state.startupTimestamps.filter((ts) => now - ts <= windowMs); + + writeCrashTrackerState(stateDir, state); + + return state.startupTimestamps.length >= maxCrashes; +} + +/** + * Save the current config as last-known-good. + * Should be called after gateway has been healthy for N seconds. + */ +export function saveLastKnownGood(configPath: string): boolean { + const lkgPath = getLastKnownGoodPath(configPath); + try { + fs.copyFileSync(configPath, lkgPath); + return true; + } catch { + return false; + } +} + +/** + * Check if a last-known-good config exists. + */ +export function hasLastKnownGood(configPath: string): boolean { + return fs.existsSync(getLastKnownGoodPath(configPath)); +} + +/** + * Revert to last-known-good config. + * Saves the current (bad) config as a failed config for debugging. + * Returns true if revert was successful. + */ +export function revertToLastKnownGood(configPath: string, stateDir: string): boolean { + const lkgPath = getLastKnownGoodPath(configPath); + if (!fs.existsSync(lkgPath)) { + return false; + } + + try { + // Save current (bad) config for debugging + const failedPath = getFailedConfigPath(configPath); + if (fs.existsSync(configPath)) { + fs.copyFileSync(configPath, failedPath); + } + + // Revert to last-known-good + fs.copyFileSync(lkgPath, configPath); + + // Record the revert + const state = readCrashTrackerState(stateDir); + state.lastRevertTimestamp = Date.now(); + state.startupTimestamps = []; // Reset crash counter after revert + writeCrashTrackerState(stateDir, state); + + return true; + } catch { + return false; + } +} + +/** + * Clear the crash tracker (call after healthy startup confirmed). 
+ */ +export function clearCrashTracker(stateDir: string): void { + const state = readCrashTrackerState(stateDir); + state.startupTimestamps = []; + writeCrashTrackerState(stateDir, state); +} + +/** + * Schedule saving last-known-good after healthy startup. + * Returns a cancel function. + */ +export function scheduleLastKnownGoodSave( + configPath: string, + stateDir: string, + options: CrashTrackerOptions = {}, +): () => void { + const healthyAfterSeconds = options.healthyAfterSeconds ?? DEFAULT_HEALTHY_AFTER_SECONDS; + const timer = setTimeout(() => { + saveLastKnownGood(configPath); + clearCrashTracker(stateDir); + }, healthyAfterSeconds * 1000); + // Don't keep the process alive just for this timer + timer.unref(); + return () => clearTimeout(timer); +} From 6945242e034d697eb27663b66e01f379a903e2b2 Mon Sep 17 00:00:00 2001 From: David Aronchick Date: Fri, 20 Feb 2026 03:09:47 +0000 Subject: [PATCH 2/5] infra: add stopGatewayViaLock and export lock helpers --- src/infra/gateway-lock.ts | 52 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/src/infra/gateway-lock.ts b/src/infra/gateway-lock.ts index ef89f42a101..a58f4f4e16c 100644 --- a/src/infra/gateway-lock.ts +++ b/src/infra/gateway-lock.ts @@ -139,7 +139,7 @@ function resolveGatewayOwnerStatus( return isGatewayArgv(args) ? 
"alive" : "dead"; } -async function readLockPayload(lockPath: string): Promise { +export async function readLockPayload(lockPath: string): Promise { try { const raw = await fs.readFile(lockPath, "utf8"); const parsed = JSON.parse(raw) as Partial; @@ -164,7 +164,7 @@ async function readLockPayload(lockPath: string): Promise { } } -function resolveGatewayLockPath(env: NodeJS.ProcessEnv) { +export function resolveGatewayLockPath(env: NodeJS.ProcessEnv) { const stateDir = resolveStateDir(env); const configPath = resolveConfigPath(env, stateDir); const hash = createHash("sha1").update(configPath).digest("hex").slice(0, 8); @@ -173,6 +173,54 @@ function resolveGatewayLockPath(env: NodeJS.ProcessEnv) { return { lockPath, configPath }; } +/** + * Attempt to stop a running gateway by reading its PID from the lock file + * and sending SIGTERM. + */ +export async function stopGatewayViaLock(opts: { + env: NodeJS.ProcessEnv; + stdout: NodeJS.WritableStream; + timeoutMs?: number; +}): Promise { + const { lockPath } = resolveGatewayLockPath(opts.env); + const payload = await readLockPayload(lockPath); + if (!payload || !payload.pid) { + return false; + } + + const pid = payload.pid; + if (!isAlive(pid)) { + return false; + } + + opts.stdout.write(`Stopping gateway (pid ${pid})...\n`); + try { + process.kill(pid, "SIGTERM"); + } catch (err) { + opts.stdout.write(`Failed to send SIGTERM to pid ${pid}: ${String(err)}\n`); + return false; + } + + // Wait for it to die + const timeoutMs = opts.timeoutMs ?? 5000; + const startedAt = Date.now(); + while (Date.now() - startedAt < timeoutMs) { + if (!isAlive(pid)) { + opts.stdout.write(`Gateway (pid ${pid}) stopped.\n`); + return true; + } + await new Promise((r) => setTimeout(r, 200)); + } + + opts.stdout.write(`Gateway (pid ${pid}) did not stop after ${timeoutMs}ms. 
Sending SIGKILL...\n`); + try { + process.kill(pid, "SIGKILL"); + return true; + } catch { + return false; + } +} + export async function acquireGatewayLock( opts: GatewayLockOptions = {}, ): Promise { From f310efee096f6f55551142ce55827f9ae670f192 Mon Sep 17 00:00:00 2001 From: David Aronchick Date: Fri, 20 Feb 2026 03:09:48 +0000 Subject: [PATCH 3/5] daemon: fallback to PID-based stop in systemd restart/stop when systemctl unavailable --- src/daemon/systemd.ts | 59 ++++++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 15 deletions(-) diff --git a/src/daemon/systemd.ts b/src/daemon/systemd.ts index 1a5dd72b968..87103a0dce8 100644 --- a/src/daemon/systemd.ts +++ b/src/daemon/systemd.ts @@ -1,7 +1,7 @@ import { execFile } from "node:child_process"; import fs from "node:fs/promises"; import path from "node:path"; -import { promisify } from "node:util"; +import { stopGatewayViaLock } from "../infra/gateway-lock.js"; import type { GatewayServiceRuntime } from "./service-runtime.js"; import { colorize, isRich, theme } from "../terminal/theme.js"; import { @@ -296,14 +296,28 @@ export async function stopSystemdService({ stdout: NodeJS.WritableStream; env?: Record; }): Promise { - await assertSystemdAvailable(); - const serviceName = resolveSystemdServiceName(env ?? {}); - const unitName = `${serviceName}.service`; - const res = await execSystemctl(["--user", "stop", unitName]); - if (res.code !== 0) { - throw new Error(`systemctl stop failed: ${res.stderr || res.stdout}`.trim()); + try { + await assertSystemdAvailable(); + const serviceName = resolveSystemdServiceName(env ?? 
{}); + const unitName = `${serviceName}.service`; + const res = await execSystemctl(["--user", "stop", unitName]); + if (res.code !== 0) { + throw new Error(`systemctl stop failed: ${res.stderr || res.stdout}`.trim()); + } + stdout.write(`${formatLine("Stopped systemd service", unitName)}\n`); + } catch (err) { + const detail = String(err); + if (detail.includes("systemctl not available") || detail.includes("systemctl --user unavailable")) { + const stopped = await stopGatewayViaLock({ + env: (env ?? process.env) as NodeJS.ProcessEnv, + stdout, + }); + if (stopped) { + return; + } + } + throw err; } - stdout.write(`${formatLine("Stopped systemd service", unitName)}\n`); } export async function restartSystemdService({ @@ -313,14 +327,29 @@ export async function restartSystemdService({ stdout: NodeJS.WritableStream; env?: Record; }): Promise { - await assertSystemdAvailable(); - const serviceName = resolveSystemdServiceName(env ?? {}); - const unitName = `${serviceName}.service`; - const res = await execSystemctl(["--user", "restart", unitName]); - if (res.code !== 0) { - throw new Error(`systemctl restart failed: ${res.stderr || res.stdout}`.trim()); + try { + await assertSystemdAvailable(); + const serviceName = resolveSystemdServiceName(env ?? {}); + const unitName = `${serviceName}.service`; + const res = await execSystemctl(["--user", "restart", unitName]); + if (res.code !== 0) { + throw new Error(`systemctl restart failed: ${res.stderr || res.stdout}`.trim()); + } + stdout.write(`${formatLine("Restarted systemd service", unitName)}\n`); + } catch (err) { + const detail = String(err); + if (detail.includes("systemctl not available") || detail.includes("systemctl --user unavailable")) { + const stopped = await stopGatewayViaLock({ + env: (env ?? 
process.env) as NodeJS.ProcessEnv, + stdout, + }); + if (stopped) { + stdout.write("Gateway process stopped via fallback; it will be restarted by the system (if using cron/watcher) or can be started manually.\n"); + return; + } + } + throw err; } - stdout.write(`${formatLine("Restarted systemd service", unitName)}\n`); } export async function isSystemdServiceEnabled(args: { From 6e6f3599db440dd0b2a9b5a6d02c26cee76eea4b Mon Sep 17 00:00:00 2001 From: David Aronchick Date: Fri, 20 Feb 2026 03:09:51 +0000 Subject: [PATCH 4/5] gateway: integrate crash-loop detection and LKG rollback --- src/gateway/server.impl.ts | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/gateway/server.impl.ts b/src/gateway/server.impl.ts index d46a38ef3d6..21abb6687bf 100644 --- a/src/gateway/server.impl.ts +++ b/src/gateway/server.impl.ts @@ -12,12 +12,19 @@ import { formatCliCommand } from "../cli/command-format.js"; import { createDefaultDeps } from "../cli/deps.js"; import { CONFIG_PATH, + STATE_DIR, isNixMode, loadConfig, migrateLegacyConfig, readConfigFileSnapshot, writeConfigFile, } from "../config/config.js"; +import { + hasLastKnownGood, + recordStartupAndCheckCrashLoop, + revertToLastKnownGood, + scheduleLastKnownGoodSave, +} from "../config/crash-tracker.js"; import { applyPluginAutoEnable } from "../config/plugin-auto-enable.js"; import { clearAgentRunContext, onAgentEvent } from "../infra/agent-events.js"; import { @@ -156,6 +163,16 @@ export async function startGatewayServer( port = 18789, opts: GatewayServerOptions = {}, ): Promise { + const isCrashLoop = recordStartupAndCheckCrashLoop(STATE_DIR); + if (isCrashLoop) { + if (hasLastKnownGood(CONFIG_PATH)) { + log.warn("gateway: crash loop detected; reverting to last-known-good config"); + revertToLastKnownGood(CONFIG_PATH, STATE_DIR); + } else { + log.warn("gateway: crash loop detected but no last-known-good config found"); + } + } + // Ensure all default port derivations (browser/canvas) see the actual 
runtime port. process.env.OPENCLAW_GATEWAY_PORT = String(port); logAcceptedEnvOption({ @@ -622,6 +639,8 @@ export async function startGatewayServer( httpServers, }); + scheduleLastKnownGoodSave(CONFIG_PATH, STATE_DIR); + return { close: async (opts) => { if (diagnosticsEnabled) { From 46cd41bde0b6bba075f9ceeac4d76081cc505fe0 Mon Sep 17 00:00:00 2001 From: Aaron Aronchick Date: Sun, 15 Mar 2026 05:01:39 +0000 Subject: [PATCH 5/5] refactor: focus crash-loop rollback PR --- src/daemon/systemd.ts | 54 -------------------------------------- src/gateway/server.impl.ts | 10 ++++--- src/infra/gateway-lock.ts | 52 ++---------------------------------- 3 files changed, 8 insertions(+), 108 deletions(-) diff --git a/src/daemon/systemd.ts b/src/daemon/systemd.ts index c3cf2d249b9..62ab2dfa146 100644 --- a/src/daemon/systemd.ts +++ b/src/daemon/systemd.ts @@ -1,9 +1,6 @@ import fs from "node:fs/promises"; import os from "node:os"; import path from "node:path"; -import { stopGatewayViaLock } from "../infra/gateway-lock.js"; -import type { GatewayServiceRuntime } from "./service-runtime.js"; -import { colorize, isRich, theme } from "../terminal/theme.js"; import { parseStrictInteger, parseStrictPositiveInteger } from "../infra/parse-finite-number.js"; import { splitArgsPreservingQuotes } from "./arg-split.js"; import { @@ -544,30 +541,6 @@ export async function uninstallSystemdService({ async function runSystemdServiceAction(params: { stdout: NodeJS.WritableStream; - env?: Record; -}): Promise { - try { - await assertSystemdAvailable(); - const serviceName = resolveSystemdServiceName(env ?? 
{}); - const unitName = `${serviceName}.service`; - const res = await execSystemctl(["--user", "stop", unitName]); - if (res.code !== 0) { - throw new Error(`systemctl stop failed: ${res.stderr || res.stdout}`.trim()); - } - stdout.write(`${formatLine("Stopped systemd service", unitName)}\n`); - } catch (err) { - const detail = String(err); - if (detail.includes("systemctl not available") || detail.includes("systemctl --user unavailable")) { - const stopped = await stopGatewayViaLock({ - env: (env ?? process.env) as NodeJS.ProcessEnv, - stdout, - }); - if (stopped) { - return; - } - } - throw err; - } env?: GatewayServiceEnv; action: "stop" | "restart"; label: string; @@ -598,33 +571,6 @@ export async function stopSystemdService({ export async function restartSystemdService({ stdout, env, -}: { - stdout: NodeJS.WritableStream; - env?: Record; -}): Promise { - try { - await assertSystemdAvailable(); - const serviceName = resolveSystemdServiceName(env ?? {}); - const unitName = `${serviceName}.service`; - const res = await execSystemctl(["--user", "restart", unitName]); - if (res.code !== 0) { - throw new Error(`systemctl restart failed: ${res.stderr || res.stdout}`.trim()); - } - stdout.write(`${formatLine("Restarted systemd service", unitName)}\n`); - } catch (err) { - const detail = String(err); - if (detail.includes("systemctl not available") || detail.includes("systemctl --user unavailable")) { - const stopped = await stopGatewayViaLock({ - env: (env ?? 
process.env) as NodeJS.ProcessEnv, - stdout, - }); - if (stopped) { - stdout.write("Gateway process stopped via fallback; it will be restarted by the system (if using cron/watcher) or can be started manually.\n"); - return; - } - } - throw err; - } }: GatewayServiceControlArgs): Promise { await runSystemdServiceAction({ stdout, diff --git a/src/gateway/server.impl.ts b/src/gateway/server.impl.ts index 6a970ceb1b9..f821d5bc710 100644 --- a/src/gateway/server.impl.ts +++ b/src/gateway/server.impl.ts @@ -11,7 +11,6 @@ import { createDefaultDeps } from "../cli/deps.js"; import { isRestartEnabled } from "../config/commands.js"; import { CONFIG_PATH, - STATE_DIR, type OpenClawConfig, isNixMode, loadConfig, @@ -26,6 +25,7 @@ import { scheduleLastKnownGoodSave, } from "../config/crash-tracker.js"; import { formatConfigIssueLines } from "../config/issue-format.js"; +import { resolveStateDir } from "../config/paths.js"; import { applyPluginAutoEnable } from "../config/plugin-auto-enable.js"; import { resolveMainSessionKey } from "../config/sessions.js"; import { clearAgentRunContext, onAgentEvent } from "../infra/agent-events.js"; @@ -275,15 +275,17 @@ export async function startGatewayServer( port = 18789, opts: GatewayServerOptions = {}, ): Promise { - const isCrashLoop = recordStartupAndCheckCrashLoop(STATE_DIR); + const stateDir = resolveStateDir(); + const isCrashLoop = recordStartupAndCheckCrashLoop(stateDir); if (isCrashLoop) { if (hasLastKnownGood(CONFIG_PATH)) { log.warn("gateway: crash loop detected; reverting to last-known-good config"); - revertToLastKnownGood(CONFIG_PATH, STATE_DIR); + revertToLastKnownGood(CONFIG_PATH, stateDir); } else { log.warn("gateway: crash loop detected but no last-known-good config found"); } } + const minimalTestGateway = process.env.VITEST === "1" && process.env.OPENCLAW_TEST_MINIMAL_GATEWAY === "1"; @@ -1079,7 +1081,7 @@ export async function startGatewayServer( httpServers, }); - scheduleLastKnownGoodSave(CONFIG_PATH, STATE_DIR); + 
scheduleLastKnownGoodSave(CONFIG_PATH, stateDir); return { close: async (opts) => { diff --git a/src/infra/gateway-lock.ts b/src/infra/gateway-lock.ts index da3213a48c9..502e06dec3a 100644 --- a/src/infra/gateway-lock.ts +++ b/src/infra/gateway-lock.ts @@ -136,7 +136,7 @@ async function resolveGatewayOwnerStatus( return isGatewayArgv(args) ? "alive" : "dead"; } -export async function readLockPayload(lockPath: string): Promise { +async function readLockPayload(lockPath: string): Promise { try { const raw = await fs.readFile(lockPath, "utf8"); const parsed = JSON.parse(raw) as Partial; @@ -161,7 +161,7 @@ export async function readLockPayload(lockPath: string): Promise { - const { lockPath } = resolveGatewayLockPath(opts.env); - const payload = await readLockPayload(lockPath); - if (!payload || !payload.pid) { - return false; - } - - const pid = payload.pid; - if (!isAlive(pid)) { - return false; - } - - opts.stdout.write(`Stopping gateway (pid ${pid})...\n`); - try { - process.kill(pid, "SIGTERM"); - } catch (err) { - opts.stdout.write(`Failed to send SIGTERM to pid ${pid}: ${String(err)}\n`); - return false; - } - - // Wait for it to die - const timeoutMs = opts.timeoutMs ?? 5000; - const startedAt = Date.now(); - while (Date.now() - startedAt < timeoutMs) { - if (!isAlive(pid)) { - opts.stdout.write(`Gateway (pid ${pid}) stopped.\n`); - return true; - } - await new Promise((r) => setTimeout(r, 200)); - } - - opts.stdout.write(`Gateway (pid ${pid}) did not stop after ${timeoutMs}ms. Sending SIGKILL...\n`); - try { - process.kill(pid, "SIGKILL"); - return true; - } catch { - return false; - } -} - export async function acquireGatewayLock( opts: GatewayLockOptions = {}, ): Promise {