refactor: focus crash-loop rollback PR

This commit is contained in:
Aaron Aronchick 2026-03-15 05:01:39 +00:00
parent eecd8a9711
commit 46cd41bde0
3 changed files with 8 additions and 108 deletions

View File

@ -1,9 +1,6 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { stopGatewayViaLock } from "../infra/gateway-lock.js";
import type { GatewayServiceRuntime } from "./service-runtime.js";
import { colorize, isRich, theme } from "../terminal/theme.js";
import { parseStrictInteger, parseStrictPositiveInteger } from "../infra/parse-finite-number.js";
import { splitArgsPreservingQuotes } from "./arg-split.js";
import {
@ -544,30 +541,6 @@ export async function uninstallSystemdService({
async function runSystemdServiceAction(params: {
stdout: NodeJS.WritableStream;
env?: Record<string, string | undefined>;
}): Promise<void> {
try {
await assertSystemdAvailable();
const serviceName = resolveSystemdServiceName(env ?? {});
const unitName = `${serviceName}.service`;
const res = await execSystemctl(["--user", "stop", unitName]);
if (res.code !== 0) {
throw new Error(`systemctl stop failed: ${res.stderr || res.stdout}`.trim());
}
stdout.write(`${formatLine("Stopped systemd service", unitName)}\n`);
} catch (err) {
const detail = String(err);
if (detail.includes("systemctl not available") || detail.includes("systemctl --user unavailable")) {
const stopped = await stopGatewayViaLock({
env: (env ?? process.env) as NodeJS.ProcessEnv,
stdout,
});
if (stopped) {
return;
}
}
throw err;
}
env?: GatewayServiceEnv;
action: "stop" | "restart";
label: string;
@ -598,33 +571,6 @@ export async function stopSystemdService({
export async function restartSystemdService({
stdout,
env,
}: {
stdout: NodeJS.WritableStream;
env?: Record<string, string | undefined>;
}): Promise<void> {
try {
await assertSystemdAvailable();
const serviceName = resolveSystemdServiceName(env ?? {});
const unitName = `${serviceName}.service`;
const res = await execSystemctl(["--user", "restart", unitName]);
if (res.code !== 0) {
throw new Error(`systemctl restart failed: ${res.stderr || res.stdout}`.trim());
}
stdout.write(`${formatLine("Restarted systemd service", unitName)}\n`);
} catch (err) {
const detail = String(err);
if (detail.includes("systemctl not available") || detail.includes("systemctl --user unavailable")) {
const stopped = await stopGatewayViaLock({
env: (env ?? process.env) as NodeJS.ProcessEnv,
stdout,
});
if (stopped) {
stdout.write("Gateway process stopped via fallback; it will be restarted by the system (if using cron/watcher) or can be started manually.\n");
return;
}
}
throw err;
}
}: GatewayServiceControlArgs): Promise<GatewayServiceRestartResult> {
await runSystemdServiceAction({
stdout,

View File

@ -11,7 +11,6 @@ import { createDefaultDeps } from "../cli/deps.js";
import { isRestartEnabled } from "../config/commands.js";
import {
CONFIG_PATH,
STATE_DIR,
type OpenClawConfig,
isNixMode,
loadConfig,
@ -26,6 +25,7 @@ import {
scheduleLastKnownGoodSave,
} from "../config/crash-tracker.js";
import { formatConfigIssueLines } from "../config/issue-format.js";
import { resolveStateDir } from "../config/paths.js";
import { applyPluginAutoEnable } from "../config/plugin-auto-enable.js";
import { resolveMainSessionKey } from "../config/sessions.js";
import { clearAgentRunContext, onAgentEvent } from "../infra/agent-events.js";
@ -275,15 +275,17 @@ export async function startGatewayServer(
port = 18789,
opts: GatewayServerOptions = {},
): Promise<GatewayServer> {
const isCrashLoop = recordStartupAndCheckCrashLoop(STATE_DIR);
const stateDir = resolveStateDir();
const isCrashLoop = recordStartupAndCheckCrashLoop(stateDir);
if (isCrashLoop) {
if (hasLastKnownGood(CONFIG_PATH)) {
log.warn("gateway: crash loop detected; reverting to last-known-good config");
revertToLastKnownGood(CONFIG_PATH, STATE_DIR);
revertToLastKnownGood(CONFIG_PATH, stateDir);
} else {
log.warn("gateway: crash loop detected but no last-known-good config found");
}
}
const minimalTestGateway =
process.env.VITEST === "1" && process.env.OPENCLAW_TEST_MINIMAL_GATEWAY === "1";
@ -1079,7 +1081,7 @@ export async function startGatewayServer(
httpServers,
});
scheduleLastKnownGoodSave(CONFIG_PATH, STATE_DIR);
scheduleLastKnownGoodSave(CONFIG_PATH, stateDir);
return {
close: async (opts) => {

View File

@ -136,7 +136,7 @@ async function resolveGatewayOwnerStatus(
return isGatewayArgv(args) ? "alive" : "dead";
}
export async function readLockPayload(lockPath: string): Promise<LockPayload | null> {
async function readLockPayload(lockPath: string): Promise<LockPayload | null> {
try {
const raw = await fs.readFile(lockPath, "utf8");
const parsed = JSON.parse(raw) as Partial<LockPayload>;
@ -161,7 +161,7 @@ export async function readLockPayload(lockPath: string): Promise<LockPayload | n
}
}
export function resolveGatewayLockPath(env: NodeJS.ProcessEnv) {
function resolveGatewayLockPath(env: NodeJS.ProcessEnv) {
const stateDir = resolveStateDir(env);
const configPath = resolveConfigPath(env, stateDir);
const hash = createHash("sha256").update(configPath).digest("hex").slice(0, 8);
@ -170,54 +170,6 @@ export function resolveGatewayLockPath(env: NodeJS.ProcessEnv) {
return { lockPath, configPath };
}
/**
 * Attempt to stop a running gateway by reading its PID from the lock file
 * and signalling it.
 *
 * Sends SIGTERM first, then polls every 200ms for the process to exit. If it
 * is still alive once `timeoutMs` has elapsed, escalates to SIGKILL.
 *
 * @param opts.env - Environment used to resolve the gateway lock path.
 * @param opts.stdout - Stream that receives human-readable progress messages.
 * @param opts.timeoutMs - How long to wait after SIGTERM before escalating to
 *   SIGKILL. Defaults to 5000.
 * @returns `true` when the process exited after SIGTERM, when SIGKILL was
 *   delivered (its exit is not awaited), or when the process turned out to be
 *   gone by the time SIGKILL was attempted. `false` when the lock has no
 *   usable PID, the process was not alive to begin with, or a signal could
 *   not be delivered to a still-running process.
 */
export async function stopGatewayViaLock(opts: {
  env: NodeJS.ProcessEnv;
  stdout: NodeJS.WritableStream;
  timeoutMs?: number;
}): Promise<boolean> {
  const { lockPath } = resolveGatewayLockPath(opts.env);
  const payload = await readLockPayload(lockPath);
  if (!payload || !payload.pid) {
    return false;
  }

  const pid = payload.pid;
  if (!isAlive(pid)) {
    return false;
  }

  opts.stdout.write(`Stopping gateway (pid ${pid})...\n`);
  try {
    process.kill(pid, "SIGTERM");
  } catch (err) {
    opts.stdout.write(`Failed to send SIGTERM to pid ${pid}: ${String(err)}\n`);
    return false;
  }

  // Wait for it to die. The liveness check runs before the deadline check so
  // that an exit during the final sleep is still observed instead of being
  // misreported as a timeout.
  const timeoutMs = opts.timeoutMs ?? 5000;
  const startedAt = Date.now();
  for (;;) {
    if (!isAlive(pid)) {
      opts.stdout.write(`Gateway (pid ${pid}) stopped.\n`);
      return true;
    }
    if (Date.now() - startedAt >= timeoutMs) {
      break;
    }
    await new Promise((r) => setTimeout(r, 200));
  }

  opts.stdout.write(`Gateway (pid ${pid}) did not stop after ${timeoutMs}ms. Sending SIGKILL...\n`);
  try {
    process.kill(pid, "SIGKILL");
    return true;
  } catch {
    // process.kill throws when the target no longer exists. If the process
    // exited between the last poll and the SIGKILL, the stop succeeded.
    if (!isAlive(pid)) {
      opts.stdout.write(`Gateway (pid ${pid}) stopped.\n`);
      return true;
    }
    return false;
  }
}
export async function acquireGatewayLock(
opts: GatewayLockOptions = {},
): Promise<GatewayLockHandle | null> {