refactor: focus crash-loop rollback PR
This commit is contained in:
parent
eecd8a9711
commit
46cd41bde0
@ -1,9 +1,6 @@
|
||||
import fs from "node:fs/promises";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { stopGatewayViaLock } from "../infra/gateway-lock.js";
|
||||
import type { GatewayServiceRuntime } from "./service-runtime.js";
|
||||
import { colorize, isRich, theme } from "../terminal/theme.js";
|
||||
import { parseStrictInteger, parseStrictPositiveInteger } from "../infra/parse-finite-number.js";
|
||||
import { splitArgsPreservingQuotes } from "./arg-split.js";
|
||||
import {
|
||||
@ -544,30 +541,6 @@ export async function uninstallSystemdService({
|
||||
|
||||
async function runSystemdServiceAction(params: {
|
||||
stdout: NodeJS.WritableStream;
|
||||
env?: Record<string, string | undefined>;
|
||||
}): Promise<void> {
|
||||
try {
|
||||
await assertSystemdAvailable();
|
||||
const serviceName = resolveSystemdServiceName(env ?? {});
|
||||
const unitName = `${serviceName}.service`;
|
||||
const res = await execSystemctl(["--user", "stop", unitName]);
|
||||
if (res.code !== 0) {
|
||||
throw new Error(`systemctl stop failed: ${res.stderr || res.stdout}`.trim());
|
||||
}
|
||||
stdout.write(`${formatLine("Stopped systemd service", unitName)}\n`);
|
||||
} catch (err) {
|
||||
const detail = String(err);
|
||||
if (detail.includes("systemctl not available") || detail.includes("systemctl --user unavailable")) {
|
||||
const stopped = await stopGatewayViaLock({
|
||||
env: (env ?? process.env) as NodeJS.ProcessEnv,
|
||||
stdout,
|
||||
});
|
||||
if (stopped) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
throw err;
|
||||
}
|
||||
env?: GatewayServiceEnv;
|
||||
action: "stop" | "restart";
|
||||
label: string;
|
||||
@ -598,33 +571,6 @@ export async function stopSystemdService({
|
||||
export async function restartSystemdService({
|
||||
stdout,
|
||||
env,
|
||||
}: {
|
||||
stdout: NodeJS.WritableStream;
|
||||
env?: Record<string, string | undefined>;
|
||||
}): Promise<void> {
|
||||
try {
|
||||
await assertSystemdAvailable();
|
||||
const serviceName = resolveSystemdServiceName(env ?? {});
|
||||
const unitName = `${serviceName}.service`;
|
||||
const res = await execSystemctl(["--user", "restart", unitName]);
|
||||
if (res.code !== 0) {
|
||||
throw new Error(`systemctl restart failed: ${res.stderr || res.stdout}`.trim());
|
||||
}
|
||||
stdout.write(`${formatLine("Restarted systemd service", unitName)}\n`);
|
||||
} catch (err) {
|
||||
const detail = String(err);
|
||||
if (detail.includes("systemctl not available") || detail.includes("systemctl --user unavailable")) {
|
||||
const stopped = await stopGatewayViaLock({
|
||||
env: (env ?? process.env) as NodeJS.ProcessEnv,
|
||||
stdout,
|
||||
});
|
||||
if (stopped) {
|
||||
stdout.write("Gateway process stopped via fallback; it will be restarted by the system (if using cron/watcher) or can be started manually.\n");
|
||||
return;
|
||||
}
|
||||
}
|
||||
throw err;
|
||||
}
|
||||
}: GatewayServiceControlArgs): Promise<GatewayServiceRestartResult> {
|
||||
await runSystemdServiceAction({
|
||||
stdout,
|
||||
|
||||
@ -11,7 +11,6 @@ import { createDefaultDeps } from "../cli/deps.js";
|
||||
import { isRestartEnabled } from "../config/commands.js";
|
||||
import {
|
||||
CONFIG_PATH,
|
||||
STATE_DIR,
|
||||
type OpenClawConfig,
|
||||
isNixMode,
|
||||
loadConfig,
|
||||
@ -26,6 +25,7 @@ import {
|
||||
scheduleLastKnownGoodSave,
|
||||
} from "../config/crash-tracker.js";
|
||||
import { formatConfigIssueLines } from "../config/issue-format.js";
|
||||
import { resolveStateDir } from "../config/paths.js";
|
||||
import { applyPluginAutoEnable } from "../config/plugin-auto-enable.js";
|
||||
import { resolveMainSessionKey } from "../config/sessions.js";
|
||||
import { clearAgentRunContext, onAgentEvent } from "../infra/agent-events.js";
|
||||
@ -275,15 +275,17 @@ export async function startGatewayServer(
|
||||
port = 18789,
|
||||
opts: GatewayServerOptions = {},
|
||||
): Promise<GatewayServer> {
|
||||
const isCrashLoop = recordStartupAndCheckCrashLoop(STATE_DIR);
|
||||
const stateDir = resolveStateDir();
|
||||
const isCrashLoop = recordStartupAndCheckCrashLoop(stateDir);
|
||||
if (isCrashLoop) {
|
||||
if (hasLastKnownGood(CONFIG_PATH)) {
|
||||
log.warn("gateway: crash loop detected; reverting to last-known-good config");
|
||||
revertToLastKnownGood(CONFIG_PATH, STATE_DIR);
|
||||
revertToLastKnownGood(CONFIG_PATH, stateDir);
|
||||
} else {
|
||||
log.warn("gateway: crash loop detected but no last-known-good config found");
|
||||
}
|
||||
}
|
||||
|
||||
const minimalTestGateway =
|
||||
process.env.VITEST === "1" && process.env.OPENCLAW_TEST_MINIMAL_GATEWAY === "1";
|
||||
|
||||
@ -1079,7 +1081,7 @@ export async function startGatewayServer(
|
||||
httpServers,
|
||||
});
|
||||
|
||||
scheduleLastKnownGoodSave(CONFIG_PATH, STATE_DIR);
|
||||
scheduleLastKnownGoodSave(CONFIG_PATH, stateDir);
|
||||
|
||||
return {
|
||||
close: async (opts) => {
|
||||
|
||||
@ -136,7 +136,7 @@ async function resolveGatewayOwnerStatus(
|
||||
return isGatewayArgv(args) ? "alive" : "dead";
|
||||
}
|
||||
|
||||
export async function readLockPayload(lockPath: string): Promise<LockPayload | null> {
|
||||
async function readLockPayload(lockPath: string): Promise<LockPayload | null> {
|
||||
try {
|
||||
const raw = await fs.readFile(lockPath, "utf8");
|
||||
const parsed = JSON.parse(raw) as Partial<LockPayload>;
|
||||
@ -161,7 +161,7 @@ export async function readLockPayload(lockPath: string): Promise<LockPayload | n
|
||||
}
|
||||
}
|
||||
|
||||
export function resolveGatewayLockPath(env: NodeJS.ProcessEnv) {
|
||||
function resolveGatewayLockPath(env: NodeJS.ProcessEnv) {
|
||||
const stateDir = resolveStateDir(env);
|
||||
const configPath = resolveConfigPath(env, stateDir);
|
||||
const hash = createHash("sha256").update(configPath).digest("hex").slice(0, 8);
|
||||
@ -170,54 +170,6 @@ export function resolveGatewayLockPath(env: NodeJS.ProcessEnv) {
|
||||
return { lockPath, configPath };
|
||||
}
|
||||
|
||||
/**
 * Attempt to stop a running gateway by reading its PID from the lock file
 * and sending SIGTERM, escalating to SIGKILL when it does not exit in time.
 *
 * @param opts.env - Environment used to resolve the gateway lock path.
 * @param opts.stdout - Stream that receives human-readable progress messages.
 * @param opts.timeoutMs - How long to wait after SIGTERM before sending
 *   SIGKILL. Defaults to 5000 ms.
 * @returns `true` when the gateway exited after SIGTERM, when SIGKILL was
 *   delivered, or when the process vanished just before SIGKILL; `false` when
 *   the lock payload holds no usable PID, the process is not running, or a
 *   signal could not be delivered to a still-running process.
 */
export async function stopGatewayViaLock(opts: {
  env: NodeJS.ProcessEnv;
  stdout: NodeJS.WritableStream;
  timeoutMs?: number;
}): Promise<boolean> {
  const { lockPath } = resolveGatewayLockPath(opts.env);
  const payload = await readLockPayload(lockPath);
  const pid = payload?.pid;

  // Only a positive integer identifies a single process. With kill(2)
  // semantics, 0 and negative values address whole process groups (-1: every
  // process the caller may signal), so a corrupted lock file must never reach
  // process.kill with such a value.
  if (typeof pid !== "number" || !Number.isInteger(pid) || pid <= 0) {
    return false;
  }
  if (!isAlive(pid)) {
    return false;
  }

  opts.stdout.write(`Stopping gateway (pid ${pid})...\n`);
  try {
    process.kill(pid, "SIGTERM");
  } catch (err) {
    opts.stdout.write(`Failed to send SIGTERM to pid ${pid}: ${String(err)}\n`);
    return false;
  }

  // Poll until the process exits or the grace period elapses.
  const timeoutMs = opts.timeoutMs ?? 5000;
  const startedAt = Date.now();
  while (Date.now() - startedAt < timeoutMs) {
    if (!isAlive(pid)) {
      opts.stdout.write(`Gateway (pid ${pid}) stopped.\n`);
      return true;
    }
    await new Promise((resolve) => setTimeout(resolve, 200));
  }

  opts.stdout.write(`Gateway (pid ${pid}) did not stop after ${timeoutMs}ms. Sending SIGKILL...\n`);
  try {
    process.kill(pid, "SIGKILL");
    return true;
  } catch {
    // The process may have exited between the last poll and the SIGKILL; in
    // that case the stop succeeded even though the signal could not be sent.
    return !isAlive(pid);
  }
}
|
||||
|
||||
export async function acquireGatewayLock(
|
||||
opts: GatewayLockOptions = {},
|
||||
): Promise<GatewayLockHandle | null> {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user